diff options
Diffstat (limited to 'src/backend/access/tablesample')
-rw-r--r-- | src/backend/access/tablesample/Makefile | 6 | ||||
-rw-r--r-- | src/backend/access/tablesample/bernoulli.c | 326 | ||||
-rw-r--r-- | src/backend/access/tablesample/system.c | 312 | ||||
-rw-r--r-- | src/backend/access/tablesample/tablesample.c | 355 |
4 files changed, 373 insertions, 626 deletions
diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile index 46eeb59f9c4..68d9ab28147 100644 --- a/src/backend/access/tablesample/Makefile +++ b/src/backend/access/tablesample/Makefile @@ -1,10 +1,10 @@ #------------------------------------------------------------------------- # # Makefile-- -# Makefile for utils/tablesample +# Makefile for access/tablesample # # IDENTIFICATION -# src/backend/utils/tablesample/Makefile +# src/backend/access/tablesample/Makefile # #------------------------------------------------------------------------- @@ -12,6 +12,6 @@ subdir = src/backend/access/tablesample top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = tablesample.o system.o bernoulli.o +OBJS = bernoulli.o system.o tablesample.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c index 0a539008221..cf88f95e757 100644 --- a/src/backend/access/tablesample/bernoulli.c +++ b/src/backend/access/tablesample/bernoulli.c @@ -1,233 +1,231 @@ /*------------------------------------------------------------------------- * * bernoulli.c - * interface routines for BERNOULLI tablesample method + * support routines for BERNOULLI tablesample method * - * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * To ensure repeatability of samples, it is necessary that selection of a + * given tuple be history-independent; otherwise syncscanning would break + * repeatability, to say nothing of logically-irrelevant maintenance such + * as physical extension or shortening of the relation. + * + * To achieve that, we proceed by hashing each candidate TID together with + * the active seed, and then selecting it if the hash is less than the + * cutoff value computed from the selection probability by BeginSampleScan. + * + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * src/backend/utils/tablesample/bernoulli.c + * src/backend/access/tablesample/bernoulli.c * *------------------------------------------------------------------------- */ #include "postgres.h" -#include "fmgr.h" +#ifdef _MSC_VER +#include <float.h> /* for _isnan */ +#endif +#include <math.h> -#include "access/tablesample.h" -#include "access/relscan.h" -#include "nodes/execnodes.h" -#include "nodes/relation.h" +#include "access/hash.h" +#include "access/tsmapi.h" +#include "catalog/pg_type.h" #include "optimizer/clauses.h" -#include "storage/bufmgr.h" -#include "utils/sampling.h" +#include "optimizer/cost.h" +#include "utils/builtins.h" -/* tsdesc */ +/* Private state */ typedef struct { + uint64 cutoff; /* select tuples with hash less than this */ uint32 seed; /* random seed */ - BlockNumber startblock; /* starting block, we use ths for syncscan - * support */ - BlockNumber nblocks; /* number of blocks */ - BlockNumber blockno; /* current block */ - float4 probability; /* probabilty that tuple will be returned - * (0.0-1.0) */ OffsetNumber lt; /* last tuple returned from current block */ - SamplerRandomState randstate; /* random generator tsdesc */ } BernoulliSamplerData; + +static void bernoulli_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); +static void bernoulli_initsamplescan(SampleScanState *node, + int eflags); +static void bernoulli_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); +static OffsetNumber bernoulli_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); + + /* - * Initialize the state. + * Create a TsmRoutine descriptor for the BERNOULLI method. */ Datum -tsm_bernoulli_init(PG_FUNCTION_ARGS) +tsm_bernoulli_handler(PG_FUNCTION_ARGS) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - uint32 seed = PG_GETARG_UINT32(1); - float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2); - HeapScanDesc scan = tsdesc->heapScan; - BernoulliSamplerData *sampler; + TsmRoutine *tsm = makeNode(TsmRoutine); + + tsm->parameterTypes = list_make1_oid(FLOAT4OID); + tsm->repeatable_across_queries = true; + tsm->repeatable_across_scans = true; + tsm->SampleScanGetSampleSize = bernoulli_samplescangetsamplesize; + tsm->InitSampleScan = bernoulli_initsamplescan; + tsm->BeginSampleScan = bernoulli_beginsamplescan; + tsm->NextSampleBlock = NULL; + tsm->NextSampleTuple = bernoulli_nextsampletuple; + tsm->EndSampleScan = NULL; + + PG_RETURN_POINTER(tsm); +} - if (percent < 0 || percent > 100) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("invalid sample size"), - errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); +/* + * Sample size estimation. + */ +static void +bernoulli_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples) +{ + Node *pctnode; + float4 samplefract; - sampler = palloc0(sizeof(BernoulliSamplerData)); + /* Try to extract an estimate for the sample percentage */ + pctnode = (Node *) linitial(paramexprs); + pctnode = estimate_expression_value(root, pctnode); - /* Remember initial values for reinit */ - sampler->seed = seed; - sampler->startblock = scan->rs_startblock; - sampler->nblocks = scan->rs_nblocks; - sampler->blockno = InvalidBlockNumber; - sampler->probability = percent / 100; - sampler->lt = InvalidOffsetNumber; - sampler_random_init_state(sampler->seed, sampler->randstate); + if (IsA(pctnode, Const) && + !((Const *) pctnode)->constisnull) + { + samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue); + if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract)) + samplefract /= 100.0f; + else + { + /* Default samplefract if the value is bogus */ + samplefract = 0.1f; + } + } + else + { + /* Default samplefract if we didn't obtain a non-null Const */ + samplefract = 0.1f; + } + + /* We'll visit all pages of the baserel */ + *pages = baserel->pages; - tsdesc->tsmdata = (void *) sampler; + *tuples = clamp_row_est(baserel->tuples * samplefract); +} - PG_RETURN_VOID(); +/* + * Initialize during executor setup. + */ +static void +bernoulli_initsamplescan(SampleScanState *node, int eflags) +{ + node->tsm_state = palloc0(sizeof(BernoulliSamplerData)); } /* - * Get next block number to read or InvalidBlockNumber if we are at the - * end of the relation. + * Examine parameters and prepare for a sample scan. */ -Datum -tsm_bernoulli_nextblock(PG_FUNCTION_ARGS) +static void +bernoulli_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata; + BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state; + double percent = DatumGetFloat4(params[0]); + + if (percent < 0 || percent > 100 || isnan(percent)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("sample percentage must be between 0 and 100"))); /* - * Bernoulli sampling scans all blocks on the table and supports syncscan - * so loop from startblock to startblock instead of from 0 to nblocks. + * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to + * store that as a uint64, of course. Note that this gives strictly + * correct behavior at the limits of zero or one probability. */ - if (sampler->blockno == InvalidBlockNumber) - sampler->blockno = sampler->startblock; - else - { - sampler->blockno++; - - if (sampler->blockno >= sampler->nblocks) - sampler->blockno = 0; - - if (sampler->blockno == sampler->startblock) - PG_RETURN_UINT32(InvalidBlockNumber); - } + sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100); + sampler->seed = seed; + sampler->lt = InvalidOffsetNumber; - PG_RETURN_UINT32(sampler->blockno); + /* + * Use bulkread, since we're scanning all pages. But pagemode visibility + * checking is a win only at larger sampling fractions. The 25% cutoff + * here is based on very limited experimentation. + */ + node->use_bulkread = true; + node->use_pagemode = (percent >= 25); } /* - * Get next tuple from current block. - * - * This method implements the main logic in bernoulli sampling. - * The algorithm simply generates new random number (in 0.0-1.0 range) and if - * it falls within user specified probability (in the same range) return the - * tuple offset. - * - * It is ok here to return tuple offset without knowing if tuple is visible - * and not check it via examinetuple. The reason for that is that we do the - * coinflip (random number generation) for every tuple in the table. Since all - * tuples have same probability of being returned the visible and invisible - * tuples will be returned in same ratio as they have in the actual table. - * This means that there is no skew towards either visible or invisible tuples - * and the number of visible tuples returned from the executor node should - * match the fraction of visible tuples which was specified by user. + * Select next sampled tuple in current block. * - * This is faster than doing the coinflip in examinetuple because we don't - * have to do visibility checks on uninteresting tuples. + * It is OK here to return an offset without knowing if the tuple is visible + * (or even exists). The reason is that we do the coinflip for every tuple + * offset in the table. Since all tuples have the same probability of being + * returned, it doesn't matter if we do extra coinflips for invisible tuples. * - * If we reach end of the block return InvalidOffsetNumber which tells + * When we reach end of the block, return InvalidOffsetNumber which tells * SampleScan to go to next block. */ -Datum -tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS) +static OffsetNumber +bernoulli_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - OffsetNumber maxoffset = PG_GETARG_UINT16(2); - BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata; + BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state; OffsetNumber tupoffset = sampler->lt; - float4 probability = sampler->probability; + uint32 hashinput[3]; + /* Advance to first/next tuple in block */ if (tupoffset == InvalidOffsetNumber) tupoffset = FirstOffsetNumber; else tupoffset++; /* - * Loop over tuple offsets until the random generator returns value that - * is within the probability of returning the tuple or until we reach end - * of the block. + * We compute the hash by applying hash_any to an array of 3 uint32's + * containing the block, offset, and seed. This is efficient to set up, + * and with the current implementation of hash_any, it gives + * machine-independent results, which is a nice property for regression + * testing. * - * (This is our implementation of bernoulli trial) + * These words in the hash input are the same throughout the block: */ - while (sampler_random_fract(sampler->randstate) > probability) + hashinput[0] = blockno; + hashinput[2] = sampler->seed; + + /* + * Loop over tuple offsets until finding suitable TID or reaching end of + * block. + */ + for (; tupoffset <= maxoffset; tupoffset++) { - tupoffset++; + uint32 hash; - if (tupoffset > maxoffset) + hashinput[1] = tupoffset; + + hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, + (int) sizeof(hashinput))); + if (hash < sampler->cutoff) break; } if (tupoffset > maxoffset) - /* Tell SampleScan that we want next block. */ tupoffset = InvalidOffsetNumber; sampler->lt = tupoffset; - PG_RETURN_UINT16(tupoffset); -} - -/* - * Cleanup method. - */ -Datum -tsm_bernoulli_end(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - - pfree(tsdesc->tsmdata); - - PG_RETURN_VOID(); -} - -/* - * Reset tsdesc (called by ReScan). - */ -Datum -tsm_bernoulli_reset(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata; - - sampler->blockno = InvalidBlockNumber; - sampler->lt = InvalidOffsetNumber; - sampler_random_init_state(sampler->seed, sampler->randstate); - - PG_RETURN_VOID(); -} - -/* - * Costing function. - */ -Datum -tsm_bernoulli_cost(PG_FUNCTION_ARGS) -{ - PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); - Path *path = (Path *) PG_GETARG_POINTER(1); - RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); - List *args = (List *) PG_GETARG_POINTER(3); - BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); - double *tuples = (double *) PG_GETARG_POINTER(5); - Node *pctnode; - float4 samplesize; - - *pages = baserel->pages; - - pctnode = linitial(args); - pctnode = estimate_expression_value(root, pctnode); - - if (IsA(pctnode, RelabelType)) - pctnode = (Node *) ((RelabelType *) pctnode)->arg; - - if (IsA(pctnode, Const)) - { - samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); - samplesize /= 100.0; - } - else - { - /* Default samplesize if the estimation didn't return Const. */ - samplesize = 0.1f; - } - - *tuples = path->rows * samplesize; - path->rows = *tuples; - - PG_RETURN_VOID(); + return tupoffset; } diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c index 1d834369a4b..43c5dab7161 100644 --- a/src/backend/access/tablesample/system.c +++ b/src/backend/access/tablesample/system.c @@ -1,186 +1,260 @@ /*------------------------------------------------------------------------- * * system.c - * interface routines for system tablesample method + * support routines for SYSTEM tablesample method * + * To ensure repeatability of samples, it is necessary that selection of a + * given tuple be history-independent; otherwise syncscanning would break + * repeatability, to say nothing of logically-irrelevant maintenance such + * as physical extension or shortening of the relation. * - * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * To achieve that, we proceed by hashing each candidate block number together + * with the active seed, and then selecting it if the hash is less than the + * cutoff value computed from the selection probability by BeginSampleScan. + * + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * src/backend/utils/tablesample/system.c + * src/backend/access/tablesample/system.c * *------------------------------------------------------------------------- */ #include "postgres.h" -#include "fmgr.h" +#ifdef _MSC_VER +#include <float.h> /* for _isnan */ +#endif +#include <math.h> -#include "access/tablesample.h" +#include "access/hash.h" #include "access/relscan.h" -#include "nodes/execnodes.h" -#include "nodes/relation.h" +#include "access/tsmapi.h" +#include "catalog/pg_type.h" #include "optimizer/clauses.h" -#include "storage/bufmgr.h" -#include "utils/sampling.h" +#include "optimizer/cost.h" +#include "utils/builtins.h" -/* - * State - */ +/* Private state */ typedef struct { - BlockSamplerData bs; + uint64 cutoff; /* select blocks with hash less than this */ uint32 seed; /* random seed */ - BlockNumber nblocks; /* number of block in relation */ - int samplesize; /* number of blocks to return */ + BlockNumber nextblock; /* next block to consider sampling */ OffsetNumber lt; /* last tuple returned from current block */ } SystemSamplerData; -/* - * Initializes the state. - */ -Datum -tsm_system_init(PG_FUNCTION_ARGS) -{ - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - uint32 seed = PG_GETARG_UINT32(1); - float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2); - HeapScanDesc scan = tsdesc->heapScan; - SystemSamplerData *sampler; +static void system_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples); +static void system_initsamplescan(SampleScanState *node, + int eflags); +static void system_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed); +static BlockNumber system_nextsampleblock(SampleScanState *node); +static OffsetNumber system_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset); - if (percent < 0 || percent > 100) - ereport(ERROR, - (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), - errmsg("invalid sample size"), - errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); - - sampler = palloc0(sizeof(SystemSamplerData)); - - /* Remember initial values for reinit */ - sampler->seed = seed; - sampler->nblocks = scan->rs_nblocks; - sampler->samplesize = 1 + (int) (sampler->nblocks * (percent / 100.0)); - sampler->lt = InvalidOffsetNumber; - - BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, - sampler->seed); - - tsdesc->tsmdata = (void *) sampler; - - PG_RETURN_VOID(); -} /* - * Get next block number or InvalidBlockNumber when we're done. - * - * Uses the same logic as ANALYZE for picking the random blocks. + * Create a TsmRoutine descriptor for the SYSTEM method. */ Datum -tsm_system_nextblock(PG_FUNCTION_ARGS) +tsm_system_handler(PG_FUNCTION_ARGS) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; - BlockNumber blockno; - - if (!BlockSampler_HasMore(&sampler->bs)) - PG_RETURN_UINT32(InvalidBlockNumber); - - blockno = BlockSampler_Next(&sampler->bs); - - PG_RETURN_UINT32(blockno); + TsmRoutine *tsm = makeNode(TsmRoutine); + + tsm->parameterTypes = list_make1_oid(FLOAT4OID); + tsm->repeatable_across_queries = true; + tsm->repeatable_across_scans = true; + tsm->SampleScanGetSampleSize = system_samplescangetsamplesize; + tsm->InitSampleScan = system_initsamplescan; + tsm->BeginSampleScan = system_beginsamplescan; + tsm->NextSampleBlock = system_nextsampleblock; + tsm->NextSampleTuple = system_nextsampletuple; + tsm->EndSampleScan = NULL; + + PG_RETURN_POINTER(tsm); } /* - * Get next tuple offset in current block or InvalidOffsetNumber if we are done - * with this block. + * Sample size estimation. */ -Datum -tsm_system_nexttuple(PG_FUNCTION_ARGS) +static void +system_samplescangetsamplesize(PlannerInfo *root, + RelOptInfo *baserel, + List *paramexprs, + BlockNumber *pages, + double *tuples) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - OffsetNumber maxoffset = PG_GETARG_UINT16(2); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; - OffsetNumber tupoffset = sampler->lt; + Node *pctnode; + float4 samplefract; - if (tupoffset == InvalidOffsetNumber) - tupoffset = FirstOffsetNumber; - else - tupoffset++; + /* Try to extract an estimate for the sample percentage */ + pctnode = (Node *) linitial(paramexprs); + pctnode = estimate_expression_value(root, pctnode); - if (tupoffset > maxoffset) - tupoffset = InvalidOffsetNumber; + if (IsA(pctnode, Const) && + !((Const *) pctnode)->constisnull) + { + samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue); + if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract)) + samplefract /= 100.0f; + else + { + /* Default samplefract if the value is bogus */ + samplefract = 0.1f; + } + } + else + { + /* Default samplefract if we didn't obtain a non-null Const */ + samplefract = 0.1f; + } - sampler->lt = tupoffset; + /* We'll visit a sample of the pages ... */ + *pages = clamp_row_est(baserel->pages * samplefract); - PG_RETURN_UINT16(tupoffset); + /* ... and hopefully get a representative number of tuples from them */ + *tuples = clamp_row_est(baserel->tuples * samplefract); } /* - * Cleanup method. + * Initialize during executor setup. */ -Datum -tsm_system_end(PG_FUNCTION_ARGS) +static void +system_initsamplescan(SampleScanState *node, int eflags) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - - pfree(tsdesc->tsmdata); - - PG_RETURN_VOID(); + node->tsm_state = palloc0(sizeof(SystemSamplerData)); } /* - * Reset state (called by ReScan). + * Examine parameters and prepare for a sample scan. */ -Datum -tsm_system_reset(PG_FUNCTION_ARGS) +static void +system_beginsamplescan(SampleScanState *node, + Datum *params, + int nparams, + uint32 seed) { - TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); - SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + double percent = DatumGetFloat4(params[0]); + if (percent < 0 || percent > 100 || isnan(percent)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT), + errmsg("sample percentage must be between 0 and 100"))); + + /* + * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to + * store that as a uint64, of course. Note that this gives strictly + * correct behavior at the limits of zero or one probability. + */ + sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100); + sampler->seed = seed; + sampler->nextblock = 0; sampler->lt = InvalidOffsetNumber; - BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, - sampler->seed); - PG_RETURN_VOID(); + /* + * Bulkread buffer access strategy probably makes sense unless we're + * scanning a very small fraction of the table. The 1% cutoff here is a + * guess. We should use pagemode visibility checking, since we scan all + * tuples on each selected page. + */ + node->use_bulkread = (percent >= 1); + node->use_pagemode = true; } /* - * Costing function. + * Select next block to sample. */ -Datum -tsm_system_cost(PG_FUNCTION_ARGS) +static BlockNumber +system_nextsampleblock(SampleScanState *node) { - PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); - Path *path = (Path *) PG_GETARG_POINTER(1); - RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); - List *args = (List *) PG_GETARG_POINTER(3); - BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); - double *tuples = (double *) PG_GETARG_POINTER(5); - Node *pctnode; - float4 samplesize; + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + HeapScanDesc scan = node->ss.ss_currentScanDesc; + BlockNumber nextblock = sampler->nextblock; + uint32 hashinput[2]; + + /* + * We compute the hash by applying hash_any to an array of 2 uint32's + * containing the block number and seed. This is efficient to set up, and + * with the current implementation of hash_any, it gives + * machine-independent results, which is a nice property for regression + * testing. + * + * These words in the hash input are the same throughout the block: + */ + hashinput[1] = sampler->seed; + + /* + * Loop over block numbers until finding suitable block or reaching end of + * relation. + */ + for (; nextblock < scan->rs_nblocks; nextblock++) + { + uint32 hash; - pctnode = linitial(args); - pctnode = estimate_expression_value(root, pctnode); + hashinput[0] = nextblock; - if (IsA(pctnode, RelabelType)) - pctnode = (Node *) ((RelabelType *) pctnode)->arg; + hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput, + (int) sizeof(hashinput))); + if (hash < sampler->cutoff) + break; + } - if (IsA(pctnode, Const)) + if (nextblock < scan->rs_nblocks) { - samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); - samplesize /= 100.0; + /* Found a suitable block; remember where we should start next time */ + sampler->nextblock = nextblock + 1; + return nextblock; } + + /* Done, but let's reset nextblock to 0 for safety. */ + sampler->nextblock = 0; + return InvalidBlockNumber; +} + +/* + * Select next sampled tuple in current block. + * + * In block sampling, we just want to sample all the tuples in each selected + * block. + * + * It is OK here to return an offset without knowing if the tuple is visible + * (or even exists); nodeSamplescan.c will deal with that. + * + * When we reach end of the block, return InvalidOffsetNumber which tells + * SampleScan to go to next block. + */ +static OffsetNumber +system_nextsampletuple(SampleScanState *node, + BlockNumber blockno, + OffsetNumber maxoffset) +{ + SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state; + OffsetNumber tupoffset = sampler->lt; + + /* Advance to next possible offset on page */ + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; else - { - /* Default samplesize if the estimation didn't return Const. */ - samplesize = 0.1f; - } + tupoffset++; - *pages = baserel->pages * samplesize; - *tuples = path->rows * samplesize; - path->rows = *tuples; + /* Done? */ + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; - PG_RETURN_VOID(); + return tupoffset; } diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c index f21d42c8e38..b8ad7ced743 100644 --- a/src/backend/access/tablesample/tablesample.c +++ b/src/backend/access/tablesample/tablesample.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tablesample.c - * TABLESAMPLE internal API + * Support functions for TABLESAMPLE feature * * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -10,356 +10,31 @@ * IDENTIFICATION * src/backend/access/tablesample/tablesample.c * - * TABLESAMPLE is the SQL standard clause for sampling the relations. - * - * The API is interface between the Executor and the TABLESAMPLE Methods. - * - * TABLESAMPLE Methods are implementations of actual sampling algorithms which - * can be used for returning a sample of the source relation. - * Methods don't read the table directly but are asked for block number and - * tuple offset which they want to examine (or return) and the tablesample - * interface implemented here does the reading for them. - * - * We currently only support sampling of the physical relations, but in the - * future we might extend the API to support subqueries as well. - * * ------------------------------------------------------------------------- */ #include "postgres.h" -#include "access/tablesample.h" - -#include "catalog/pg_tablesample_method.h" -#include "miscadmin.h" -#include "pgstat.h" -#include "storage/bufmgr.h" -#include "storage/predicate.h" -#include "utils/rel.h" -#include "utils/tqual.h" - - -static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan); - - -/* - * Initialize the TABLESAMPLE Descriptor and the TABLESAMPLE Method. - */ -TableSampleDesc * -tablesample_init(SampleScanState *scanstate, TableSampleClause *tablesample) -{ - FunctionCallInfoData fcinfo; - int i; - List *args = tablesample->args; - ListCell *arg; - ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; - TableSampleDesc *tsdesc = (TableSampleDesc *) palloc0(sizeof(TableSampleDesc)); - - /* Load functions */ - fmgr_info(tablesample->tsminit, &(tsdesc->tsminit)); - fmgr_info(tablesample->tsmnextblock, &(tsdesc->tsmnextblock)); - fmgr_info(tablesample->tsmnexttuple, &(tsdesc->tsmnexttuple)); - if (OidIsValid(tablesample->tsmexaminetuple)) - fmgr_info(tablesample->tsmexaminetuple, &(tsdesc->tsmexaminetuple)); - else - tsdesc->tsmexaminetuple.fn_oid = InvalidOid; - fmgr_info(tablesample->tsmreset, &(tsdesc->tsmreset)); - fmgr_info(tablesample->tsmend, &(tsdesc->tsmend)); - - InitFunctionCallInfoData(fcinfo, &tsdesc->tsminit, - list_length(args) + 2, - InvalidOid, NULL, NULL); - - tsdesc->tupDesc = scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; - tsdesc->heapScan = scanstate->ss.ss_currentScanDesc; - - /* First argument for init function is always TableSampleDesc */ - fcinfo.arg[0] = PointerGetDatum(tsdesc); - fcinfo.argnull[0] = false; +#include "access/tsmapi.h" - /* - * Second arg for init function is always REPEATABLE. - * - * If tablesample->repeatable is NULL then REPEATABLE clause was not - * specified, and we insert a random value as default. - * - * When specified, the expression cannot evaluate to NULL. - */ - if (tablesample->repeatable) - { - ExprState *argstate = ExecInitExpr((Expr *) tablesample->repeatable, - (PlanState *) scanstate); - - fcinfo.arg[1] = ExecEvalExpr(argstate, econtext, - &fcinfo.argnull[1], NULL); - if (fcinfo.argnull[1]) - ereport(ERROR, - (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), - errmsg("REPEATABLE clause must be NOT NULL numeric value"))); - } - else - { - fcinfo.arg[1] = UInt32GetDatum(random()); - fcinfo.argnull[1] = false; - } - - /* Rest of the arguments come from user. */ - i = 2; - foreach(arg, args) - { - Expr *argexpr = (Expr *) lfirst(arg); - ExprState *argstate = ExecInitExpr(argexpr, (PlanState *) scanstate); - - fcinfo.arg[i] = ExecEvalExpr(argstate, econtext, - &fcinfo.argnull[i], NULL); - i++; - } - Assert(i == fcinfo.nargs); - - (void) FunctionCallInvoke(&fcinfo); - - return tsdesc; -} /* - * Get next tuple from TABLESAMPLE Method. - */ -HeapTuple -tablesample_getnext(TableSampleDesc *desc) -{ - HeapScanDesc scan = desc->heapScan; - HeapTuple tuple = &(scan->rs_ctup); - bool pagemode = scan->rs_pageatatime; - BlockNumber blockno; - Page page; - bool page_all_visible; - ItemId itemid; - OffsetNumber tupoffset, - maxoffset; - - if (!scan->rs_inited) - { - /* - * return null immediately if relation is empty - */ - if (scan->rs_nblocks == 0) - { - Assert(!BufferIsValid(scan->rs_cbuf)); - tuple->t_data = NULL; - return NULL; - } - blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, - PointerGetDatum(desc))); - if (!BlockNumberIsValid(blockno)) - { - tuple->t_data = NULL; - return NULL; - } - - heapgetpage(scan, blockno); - scan->rs_inited = true; - } - else - { - /* continue from previously returned page/tuple */ - blockno = scan->rs_cblock; /* current page */ - } - - /* - * When pagemode is disabled, the scan will do visibility checks for each - * tuple it finds so the buffer needs to be locked. - */ - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - - page = (Page) BufferGetPage(scan->rs_cbuf); - page_all_visible = PageIsAllVisible(page); - maxoffset = PageGetMaxOffsetNumber(page); - - for (;;) - { - CHECK_FOR_INTERRUPTS(); - - tupoffset = DatumGetUInt16(FunctionCall3(&desc->tsmnexttuple, - PointerGetDatum(desc), - UInt32GetDatum(blockno), - UInt16GetDatum(maxoffset))); - - if (OffsetNumberIsValid(tupoffset)) - { - bool visible; - bool found; - - /* Skip invalid tuple pointers. */ - itemid = PageGetItemId(page, tupoffset); - if (!ItemIdIsNormal(itemid)) - continue; - - tuple->t_data = (HeapTupleHeader) PageGetItem((Page) page, itemid); - tuple->t_len = ItemIdGetLength(itemid); - ItemPointerSet(&(tuple->t_self), blockno, tupoffset); - - if (page_all_visible) - visible = true; - else - visible = SampleTupleVisible(tuple, tupoffset, scan); - - /* - * Let the sampling method examine the actual tuple and decide if - * we should return it. - * - * Note that we let it examine even invisible tuples for - * statistical purposes, but not return them since user should - * never see invisible tuples. - */ - if (OidIsValid(desc->tsmexaminetuple.fn_oid)) - { - found = DatumGetBool(FunctionCall4(&desc->tsmexaminetuple, - PointerGetDatum(desc), - UInt32GetDatum(blockno), - PointerGetDatum(tuple), - BoolGetDatum(visible))); - /* Should not happen if sampling method is well written. */ - if (found && !visible) - elog(ERROR, "Sampling method wanted to return invisible tuple"); - } - else - found = visible; - - /* Found visible tuple, return it. */ - if (found) - { - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - break; - } - else - { - /* Try next tuple from same page. */ - continue; - } - } - - - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, - PointerGetDatum(desc))); - - /* - * Report our new scan position for synchronization purposes. We don't - * do that when moving backwards, however. That would just mess up any - * other forward-moving scanners. - * - * Note: we do this before checking for end of scan so that the final - * state of the position hint is back at the start of the rel. That's - * not strictly necessary, but otherwise when you run the same query - * multiple times the starting position would shift a little bit - * backwards on every invocation, which is confusing. We don't - * guarantee any specific ordering in general, though. - */ - if (scan->rs_syncscan) - ss_report_location(scan->rs_rd, BlockNumberIsValid(blockno) ? - blockno : scan->rs_startblock); - - /* - * Reached end of scan. - */ - if (!BlockNumberIsValid(blockno)) - { - if (BufferIsValid(scan->rs_cbuf)) - ReleaseBuffer(scan->rs_cbuf); - scan->rs_cbuf = InvalidBuffer; - scan->rs_cblock = InvalidBlockNumber; - tuple->t_data = NULL; - scan->rs_inited = false; - return NULL; - } - - heapgetpage(scan, blockno); - - if (!pagemode) - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - - page = (Page) BufferGetPage(scan->rs_cbuf); - page_all_visible = PageIsAllVisible(page); - maxoffset = PageGetMaxOffsetNumber(page); - } - - pgstat_count_heap_getnext(scan->rs_rd); - - return &(scan->rs_ctup); -} - -/* - * Reset the sampling to starting state - */ -void -tablesample_reset(TableSampleDesc *desc) -{ - (void) FunctionCall1(&desc->tsmreset, PointerGetDatum(desc)); -} - -/* - * Signal the sampling method that the scan has finished. - */ -void -tablesample_end(TableSampleDesc *desc) -{ - (void) FunctionCall1(&desc->tsmend, PointerGetDatum(desc)); -} - -/* - * Check visibility of the tuple. + * GetTsmRoutine --- get a TsmRoutine struct by invoking the handler. + * + * This is a convenience routine that's just meant to check for errors. */ -static bool -SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) +TsmRoutine * +GetTsmRoutine(Oid tsmhandler) { - /* - * If this scan is reading whole pages at a time, there is already - * visibility info present in rs_vistuples so we can just search it for - * the tupoffset. - */ - if (scan->rs_pageatatime) - { - int start = 0, - end = scan->rs_ntuples - 1; - - /* - * Do the binary search over rs_vistuples, it's already sorted by - * OffsetNumber so we don't need to do any sorting ourselves here. - * - * We could use bsearch() here but it's slower for integers because of - * the function call overhead and because it needs boiler plate code - * it would not save us anything code-wise anyway. - */ - while (start <= end) - { - int mid = start + (end - start) / 2; - OffsetNumber curoffset = scan->rs_vistuples[mid]; - - if (curoffset == tupoffset) - return true; - else if (curoffset > tupoffset) - end = mid - 1; - else - start = mid + 1; - } - - return false; - } - else - { - /* No pagemode, we have to check the tuple itself. */ - Snapshot snapshot = scan->rs_snapshot; - Buffer buffer = scan->rs_cbuf; + Datum datum; + TsmRoutine *routine; - bool visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); + datum = OidFunctionCall1(tsmhandler, PointerGetDatum(NULL)); + routine = (TsmRoutine *) DatumGetPointer(datum); - CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, buffer, - snapshot); + if (routine == NULL || !IsA(routine, TsmRoutine)) + elog(ERROR, "tablesample handler function %u did not return a TsmRoutine struct", + tsmhandler); - return visible; - } + return routine; } |