4 files changed, 373 insertions, 626 deletions
diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile
index 46eeb59f9c4..68d9ab28147 100644
--- a/src/backend/access/tablesample/Makefile
+++ b/src/backend/access/tablesample/Makefile
@@ -1,10 +1,10 @@
 #-------------------------------------------------------------------------
 #
 # Makefile--
-#    Makefile for utils/tablesample
+#    Makefile for access/tablesample
 #
 # IDENTIFICATION
-#    src/backend/utils/tablesample/Makefile
+#    src/backend/access/tablesample/Makefile
 #
 #-------------------------------------------------------------------------
 
@@ -12,6 +12,6 @@ subdir = src/backend/access/tablesample
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = tablesample.o system.o bernoulli.o
+OBJS = bernoulli.o system.o tablesample.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c
index 0a539008221..cf88f95e757 100644
--- a/src/backend/access/tablesample/bernoulli.c
+++ b/src/backend/access/tablesample/bernoulli.c
@@ -1,233 +1,231 @@
 /*-------------------------------------------------------------------------
  *
  * bernoulli.c
- *	  interface routines for BERNOULLI tablesample method
+ *	  support routines for BERNOULLI tablesample method
  *
- * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * To ensure repeatability of samples, it is necessary that selection of a
+ * given tuple be history-independent; otherwise syncscanning would break
+ * repeatability, to say nothing of logically-irrelevant maintenance such
+ * as physical extension or shortening of the relation.
+ *
+ * To achieve that, we proceed by hashing each candidate TID together with
+ * the active seed, and then selecting it if the hash is less than the
+ * cutoff value computed from the selection probability by BeginSampleScan.
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  src/backend/utils/tablesample/bernoulli.c
+ *	  src/backend/access/tablesample/bernoulli.c
  *
  *-------------------------------------------------------------------------
  */
 
 #include "postgres.h"
 
-#include "fmgr.h"
+#ifdef _MSC_VER
+#include <float.h>				/* for _isnan */
+#endif
+#include <math.h>
 
-#include "access/tablesample.h"
-#include "access/relscan.h"
-#include "nodes/execnodes.h"
-#include "nodes/relation.h"
+#include "access/hash.h"
+#include "access/tsmapi.h"
+#include "catalog/pg_type.h"
 #include "optimizer/clauses.h"
-#include "storage/bufmgr.h"
-#include "utils/sampling.h"
+#include "optimizer/cost.h"
+#include "utils/builtins.h"
 
 
-/* tsdesc */
+/* Private state */
 typedef struct
 {
+	uint64		cutoff;			/* select tuples with hash less than this */
 	uint32		seed;			/* random seed */
-	BlockNumber startblock;		/* starting block, we use ths for syncscan
-								 * support */
-	BlockNumber nblocks;		/* number of blocks */
-	BlockNumber blockno;		/* current block */
-	float4		probability;	/* probabilty that tuple will be returned
-								 * (0.0-1.0) */
 	OffsetNumber lt;			/* last tuple returned from current block */
-	SamplerRandomState randstate;		/* random generator tsdesc */
 } BernoulliSamplerData;
 
+
+static void bernoulli_samplescangetsamplesize(PlannerInfo *root,
+								  RelOptInfo *baserel,
+								  List *paramexprs,
+								  BlockNumber *pages,
+								  double *tuples);
+static void bernoulli_initsamplescan(SampleScanState *node,
+						 int eflags);
+static void bernoulli_beginsamplescan(SampleScanState *node,
+						  Datum *params,
+						  int nparams,
+						  uint32 seed);
+static OffsetNumber bernoulli_nextsampletuple(SampleScanState *node,
+						  BlockNumber blockno,
+						  OffsetNumber maxoffset);
+
+
 /*
- * Initialize the state.
+ * Create a TsmRoutine descriptor for the BERNOULLI method.
  */
 Datum
-tsm_bernoulli_init(PG_FUNCTION_ARGS)
+tsm_bernoulli_handler(PG_FUNCTION_ARGS)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	uint32		seed = PG_GETARG_UINT32(1);
-	float4		percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2);
-	HeapScanDesc scan = tsdesc->heapScan;
-	BernoulliSamplerData *sampler;
+	TsmRoutine *tsm = makeNode(TsmRoutine);
+
+	tsm->parameterTypes = list_make1_oid(FLOAT4OID);
+	tsm->repeatable_across_queries = true;
+	tsm->repeatable_across_scans = true;
+	tsm->SampleScanGetSampleSize = bernoulli_samplescangetsamplesize;
+	tsm->InitSampleScan = bernoulli_initsamplescan;
+	tsm->BeginSampleScan = bernoulli_beginsamplescan;
+	tsm->NextSampleBlock = NULL;
+	tsm->NextSampleTuple = bernoulli_nextsampletuple;
+	tsm->EndSampleScan = NULL;
+
+	PG_RETURN_POINTER(tsm);
+}
 
-	if (percent < 0 || percent > 100)
-		ereport(ERROR,
-				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-				 errmsg("invalid sample size"),
-				 errhint("Sample size must be numeric value between 0 and 100 (inclusive).")));
+/*
+ * Sample size estimation.
+ */
+static void
+bernoulli_samplescangetsamplesize(PlannerInfo *root,
+								  RelOptInfo *baserel,
+								  List *paramexprs,
+								  BlockNumber *pages,
+								  double *tuples)
+{
+	Node	   *pctnode;
+	float4		samplefract;
 
-	sampler = palloc0(sizeof(BernoulliSamplerData));
+	/* Try to extract an estimate for the sample percentage */
+	pctnode = (Node *) linitial(paramexprs);
+	pctnode = estimate_expression_value(root, pctnode);
 
-	/* Remember initial values for reinit */
-	sampler->seed = seed;
-	sampler->startblock = scan->rs_startblock;
-	sampler->nblocks = scan->rs_nblocks;
-	sampler->blockno = InvalidBlockNumber;
-	sampler->probability = percent / 100;
-	sampler->lt = InvalidOffsetNumber;
-	sampler_random_init_state(sampler->seed, sampler->randstate);
+	if (IsA(pctnode, Const) &&
+		!((Const *) pctnode)->constisnull)
+	{
+		samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
+		if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
+			samplefract /= 100.0f;
+		else
+		{
+			/* Default samplefract if the value is bogus */
+			samplefract = 0.1f;
+		}
+	}
+	else
+	{
+		/* Default samplefract if we didn't obtain a non-null Const */
+		samplefract = 0.1f;
+	}
+
+	/* We'll visit all pages of the baserel */
+	*pages = baserel->pages;
 
-	tsdesc->tsmdata = (void *) sampler;
+	*tuples = clamp_row_est(baserel->tuples * samplefract);
+}
 
-	PG_RETURN_VOID();
+/*
+ * Initialize during executor setup.
+ */
+static void
+bernoulli_initsamplescan(SampleScanState *node, int eflags)
+{
+	node->tsm_state = palloc0(sizeof(BernoulliSamplerData));
 }
 
 /*
- * Get next block number to read or InvalidBlockNumber if we are at the
- * end of the relation.
+ * Examine parameters and prepare for a sample scan.
  */
-Datum
-tsm_bernoulli_nextblock(PG_FUNCTION_ARGS)
+static void
+bernoulli_beginsamplescan(SampleScanState *node,
+						  Datum *params,
+						  int nparams,
+						  uint32 seed)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata;
+	BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
+	double		percent = DatumGetFloat4(params[0]);
+
+	if (percent < 0 || percent > 100 || isnan(percent))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+				 errmsg("sample percentage must be between 0 and 100")));
 
 	/*
-	 * Bernoulli sampling scans all blocks on the table and supports syncscan
-	 * so loop from startblock to startblock instead of from 0 to nblocks.
+	 * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
+	 * store that as a uint64, of course.  Note that this gives strictly
+	 * correct behavior at the limits of zero or one probability.
 	 */
-	if (sampler->blockno == InvalidBlockNumber)
-		sampler->blockno = sampler->startblock;
-	else
-	{
-		sampler->blockno++;
-
-		if (sampler->blockno >= sampler->nblocks)
-			sampler->blockno = 0;
-
-		if (sampler->blockno == sampler->startblock)
-			PG_RETURN_UINT32(InvalidBlockNumber);
-	}
+	sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
+	sampler->seed = seed;
+	sampler->lt = InvalidOffsetNumber;
 
-	PG_RETURN_UINT32(sampler->blockno);
+	/*
+	 * Use bulkread, since we're scanning all pages.  But pagemode visibility
+	 * checking is a win only at larger sampling fractions.  The 25% cutoff
+	 * here is based on very limited experimentation.
+	 */
+	node->use_bulkread = true;
+	node->use_pagemode = (percent >= 25);
 }
 
 /*
- * Get next tuple from current block.
- *
- * This method implements the main logic in bernoulli sampling.
- * The algorithm simply generates new random number (in 0.0-1.0 range) and if
- * it falls within user specified probability (in the same range) return the
- * tuple offset.
- *
- * It is ok here to return tuple offset without knowing if tuple is visible
- * and not check it via examinetuple. The reason for that is that we do the
- * coinflip (random number generation) for every tuple in the table. Since all
- * tuples have same probability of being returned the visible and invisible
- * tuples will be returned in same ratio as they have in the actual table.
- * This means that there is no skew towards either visible or invisible tuples
- * and the number of visible tuples returned from the executor node should
- * match the fraction of visible tuples which was specified by user.
+ * Select next sampled tuple in current block.
  *
- * This is faster than doing the coinflip in examinetuple because we don't
- * have to do visibility checks on uninteresting tuples.
+ * It is OK here to return an offset without knowing if the tuple is visible
+ * (or even exists).  The reason is that we do the coinflip for every tuple
+ * offset in the table.  Since all tuples have the same probability of being
+ * returned, it doesn't matter if we do extra coinflips for invisible tuples.
  *
- * If we reach end of the block return InvalidOffsetNumber which tells
+ * When we reach end of the block, return InvalidOffsetNumber which tells
  * SampleScan to go to next block.
  */
-Datum
-tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS)
+static OffsetNumber
+bernoulli_nextsampletuple(SampleScanState *node,
+						  BlockNumber blockno,
+						  OffsetNumber maxoffset)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	OffsetNumber maxoffset = PG_GETARG_UINT16(2);
-	BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata;
+	BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
 	OffsetNumber tupoffset = sampler->lt;
-	float4		probability = sampler->probability;
+	uint32		hashinput[3];
 
+	/* Advance to first/next tuple in block */
 	if (tupoffset == InvalidOffsetNumber)
 		tupoffset = FirstOffsetNumber;
 	else
 		tupoffset++;
 
 	/*
-	 * Loop over tuple offsets until the random generator returns value that
-	 * is within the probability of returning the tuple or until we reach end
-	 * of the block.
+	 * We compute the hash by applying hash_any to an array of 3 uint32's
+	 * containing the block, offset, and seed.  This is efficient to set up,
+	 * and with the current implementation of hash_any, it gives
+	 * machine-independent results, which is a nice property for regression
+	 * testing.
 	 *
-	 * (This is our implementation of bernoulli trial)
+	 * These words in the hash input are the same throughout the block:
 	 */
-	while (sampler_random_fract(sampler->randstate) > probability)
+	hashinput[0] = blockno;
+	hashinput[2] = sampler->seed;
+
+	/*
+	 * Loop over tuple offsets until finding suitable TID or reaching end of
+	 * block.
+	 */
+	for (; tupoffset <= maxoffset; tupoffset++)
 	{
-		tupoffset++;
+		uint32		hash;
 
-		if (tupoffset > maxoffset)
+		hashinput[1] = tupoffset;
+
+		hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
+									   (int) sizeof(hashinput)));
+		if (hash < sampler->cutoff)
 			break;
 	}
 
 	if (tupoffset > maxoffset)
-		/* Tell SampleScan that we want next block. */
 		tupoffset = InvalidOffsetNumber;
 
 	sampler->lt = tupoffset;
 
-	PG_RETURN_UINT16(tupoffset);
-}
-
-/*
- * Cleanup method.
- */
-Datum
-tsm_bernoulli_end(PG_FUNCTION_ARGS)
-{
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-
-	pfree(tsdesc->tsmdata);
-
-	PG_RETURN_VOID();
-}
-
-/*
- * Reset tsdesc (called by ReScan).
- */
-Datum
-tsm_bernoulli_reset(PG_FUNCTION_ARGS)
-{
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata;
-
-	sampler->blockno = InvalidBlockNumber;
-	sampler->lt = InvalidOffsetNumber;
-	sampler_random_init_state(sampler->seed, sampler->randstate);
-
-	PG_RETURN_VOID();
-}
-
-/*
- * Costing function.
- */
-Datum
-tsm_bernoulli_cost(PG_FUNCTION_ARGS)
-{
-	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-	Path	   *path = (Path *) PG_GETARG_POINTER(1);
-	RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2);
-	List	   *args = (List *) PG_GETARG_POINTER(3);
-	BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4);
-	double	   *tuples = (double *) PG_GETARG_POINTER(5);
-	Node	   *pctnode;
-	float4		samplesize;
-
-	*pages = baserel->pages;
-
-	pctnode = linitial(args);
-	pctnode = estimate_expression_value(root, pctnode);
-
-	if (IsA(pctnode, RelabelType))
-		pctnode = (Node *) ((RelabelType *) pctnode)->arg;
-
-	if (IsA(pctnode, Const))
-	{
-		samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue);
-		samplesize /= 100.0;
-	}
-	else
-	{
-		/* Default samplesize if the estimation didn't return Const. */
-		samplesize = 0.1f;
-	}
-
-	*tuples = path->rows * samplesize;
-	path->rows = *tuples;
-
-	PG_RETURN_VOID();
+	return tupoffset;
 }
diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c
index 1d834369a4b..43c5dab7161 100644
--- a/src/backend/access/tablesample/system.c
+++ b/src/backend/access/tablesample/system.c
@@ -1,186 +1,260 @@
 /*-------------------------------------------------------------------------
  *
  * system.c
- *	  interface routines for system tablesample method
+ *	  support routines for SYSTEM tablesample method
  *
+ * To ensure repeatability of samples, it is necessary that selection of a
+ * given tuple be history-independent; otherwise syncscanning would break
+ * repeatability, to say nothing of logically-irrelevant maintenance such
+ * as physical extension or shortening of the relation.
  *
- * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * To achieve that, we proceed by hashing each candidate block number together
+ * with the active seed, and then selecting it if the hash is less than the
+ * cutoff value computed from the selection probability by BeginSampleScan.
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  src/backend/utils/tablesample/system.c
+ *	  src/backend/access/tablesample/system.c
  *
  *-------------------------------------------------------------------------
  */
 
 #include "postgres.h"
 
-#include "fmgr.h"
+#ifdef _MSC_VER
+#include <float.h>				/* for _isnan */
+#endif
+#include <math.h>
 
-#include "access/tablesample.h"
+#include "access/hash.h"
 #include "access/relscan.h"
-#include "nodes/execnodes.h"
-#include "nodes/relation.h"
+#include "access/tsmapi.h"
+#include "catalog/pg_type.h"
 #include "optimizer/clauses.h"
-#include "storage/bufmgr.h"
-#include "utils/sampling.h"
+#include "optimizer/cost.h"
+#include "utils/builtins.h"
 
 
-/*
- * State
- */
+/* Private state */
 typedef struct
 {
-	BlockSamplerData bs;
+	uint64		cutoff;			/* select blocks with hash less than this */
 	uint32		seed;			/* random seed */
-	BlockNumber nblocks;		/* number of block in relation */
-	int			samplesize;		/* number of blocks to return */
+	BlockNumber nextblock;		/* next block to consider sampling */
 	OffsetNumber lt;			/* last tuple returned from current block */
 } SystemSamplerData;
 
 
-/*
- * Initializes the state.
- */
-Datum
-tsm_system_init(PG_FUNCTION_ARGS)
-{
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	uint32		seed = PG_GETARG_UINT32(1);
-	float4		percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2);
-	HeapScanDesc scan = tsdesc->heapScan;
-	SystemSamplerData *sampler;
+static void system_samplescangetsamplesize(PlannerInfo *root,
+							   RelOptInfo *baserel,
+							   List *paramexprs,
+							   BlockNumber *pages,
+							   double *tuples);
+static void system_initsamplescan(SampleScanState *node,
+					  int eflags);
+static void system_beginsamplescan(SampleScanState *node,
+					   Datum *params,
+					   int nparams,
+					   uint32 seed);
+static BlockNumber system_nextsampleblock(SampleScanState *node);
+static OffsetNumber system_nextsampletuple(SampleScanState *node,
+					   BlockNumber blockno,
+					   OffsetNumber maxoffset);
 
-	if (percent < 0 || percent > 100)
-		ereport(ERROR,
-				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-				 errmsg("invalid sample size"),
-				 errhint("Sample size must be numeric value between 0 and 100 (inclusive).")));
-
-	sampler = palloc0(sizeof(SystemSamplerData));
-
-	/* Remember initial values for reinit */
-	sampler->seed = seed;
-	sampler->nblocks = scan->rs_nblocks;
-	sampler->samplesize = 1 + (int) (sampler->nblocks * (percent / 100.0));
-	sampler->lt = InvalidOffsetNumber;
-
-	BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize,
-					  sampler->seed);
-
-	tsdesc->tsmdata = (void *) sampler;
-
-	PG_RETURN_VOID();
-}
 
 /*
- * Get next block number or InvalidBlockNumber when we're done.
- *
- * Uses the same logic as ANALYZE for picking the random blocks.
+ * Create a TsmRoutine descriptor for the SYSTEM method.
  */
 Datum
-tsm_system_nextblock(PG_FUNCTION_ARGS)
+tsm_system_handler(PG_FUNCTION_ARGS)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
-	BlockNumber blockno;
-
-	if (!BlockSampler_HasMore(&sampler->bs))
-		PG_RETURN_UINT32(InvalidBlockNumber);
-
-	blockno = BlockSampler_Next(&sampler->bs);
-
-	PG_RETURN_UINT32(blockno);
+	TsmRoutine *tsm = makeNode(TsmRoutine);
+
+	tsm->parameterTypes = list_make1_oid(FLOAT4OID);
+	tsm->repeatable_across_queries = true;
+	tsm->repeatable_across_scans = true;
+	tsm->SampleScanGetSampleSize = system_samplescangetsamplesize;
+	tsm->InitSampleScan = system_initsamplescan;
+	tsm->BeginSampleScan = system_beginsamplescan;
+	tsm->NextSampleBlock = system_nextsampleblock;
+	tsm->NextSampleTuple = system_nextsampletuple;
+	tsm->EndSampleScan = NULL;
+
+	PG_RETURN_POINTER(tsm);
 }
 
 /*
- * Get next tuple offset in current block or InvalidOffsetNumber if we are done
- * with this block.
+ * Sample size estimation.
  */
-Datum
-tsm_system_nexttuple(PG_FUNCTION_ARGS)
+static void
+system_samplescangetsamplesize(PlannerInfo *root,
+							   RelOptInfo *baserel,
+							   List *paramexprs,
+							   BlockNumber *pages,
+							   double *tuples)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	OffsetNumber maxoffset = PG_GETARG_UINT16(2);
-	SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
-	OffsetNumber tupoffset = sampler->lt;
+	Node	   *pctnode;
+	float4		samplefract;
 
-	if (tupoffset == InvalidOffsetNumber)
-		tupoffset = FirstOffsetNumber;
-	else
-		tupoffset++;
+	/* Try to extract an estimate for the sample percentage */
+	pctnode = (Node *) linitial(paramexprs);
+	pctnode = estimate_expression_value(root, pctnode);
 
-	if (tupoffset > maxoffset)
-		tupoffset = InvalidOffsetNumber;
+	if (IsA(pctnode, Const) &&
+		!((Const *) pctnode)->constisnull)
+	{
+		samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
+		if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
+			samplefract /= 100.0f;
+		else
+		{
+			/* Default samplefract if the value is bogus */
+			samplefract = 0.1f;
+		}
+	}
+	else
+	{
+		/* Default samplefract if we didn't obtain a non-null Const */
+		samplefract = 0.1f;
+	}
 
-	sampler->lt = tupoffset;
+	/* We'll visit a sample of the pages ... */
+	*pages = clamp_row_est(baserel->pages * samplefract);
 
-	PG_RETURN_UINT16(tupoffset);
+	/* ... and hopefully get a representative number of tuples from them */
+	*tuples = clamp_row_est(baserel->tuples * samplefract);
 }
 
 /*
- * Cleanup method.
+ * Initialize during executor setup.
  */
-Datum
-tsm_system_end(PG_FUNCTION_ARGS)
+static void
+system_initsamplescan(SampleScanState *node, int eflags)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-
-	pfree(tsdesc->tsmdata);
-
-	PG_RETURN_VOID();
+	node->tsm_state = palloc0(sizeof(SystemSamplerData));
 }
 
 /*
- * Reset state (called by ReScan).
+ * Examine parameters and prepare for a sample scan.
  */
-Datum
-tsm_system_reset(PG_FUNCTION_ARGS)
+static void
+system_beginsamplescan(SampleScanState *node,
+					   Datum *params,
+					   int nparams,
+					   uint32 seed)
 {
-	TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-	SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	double		percent = DatumGetFloat4(params[0]);
 
+	if (percent < 0 || percent > 100 || isnan(percent))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+				 errmsg("sample percentage must be between 0 and 100")));
+
+	/*
+	 * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
+	 * store that as a uint64, of course.  Note that this gives strictly
+	 * correct behavior at the limits of zero or one probability.
+	 */
+	sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
+	sampler->seed = seed;
+	sampler->nextblock = 0;
 	sampler->lt = InvalidOffsetNumber;
-	BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize,
-					  sampler->seed);
 
-	PG_RETURN_VOID();
+	/*
+	 * Bulkread buffer access strategy probably makes sense unless we're
+	 * scanning a very small fraction of the table.  The 1% cutoff here is a
+	 * guess.  We should use pagemode visibility checking, since we scan all
+	 * tuples on each selected page.
+	 */
+	node->use_bulkread = (percent >= 1);
+	node->use_pagemode = true;
 }
 
 /*
- * Costing function.
+ * Select next block to sample.
  */
-Datum
-tsm_system_cost(PG_FUNCTION_ARGS)
+static BlockNumber
+system_nextsampleblock(SampleScanState *node)
 {
-	PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-	Path	   *path = (Path *) PG_GETARG_POINTER(1);
-	RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2);
-	List	   *args = (List *) PG_GETARG_POINTER(3);
-	BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4);
-	double	   *tuples = (double *) PG_GETARG_POINTER(5);
-	Node	   *pctnode;
-	float4		samplesize;
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	HeapScanDesc scan = node->ss.ss_currentScanDesc;
+	BlockNumber nextblock = sampler->nextblock;
+	uint32		hashinput[2];
+
+	/*
+	 * We compute the hash by applying hash_any to an array of 2 uint32's
+	 * containing the block number and seed.  This is efficient to set up, and
+	 * with the current implementation of hash_any, it gives
+	 * machine-independent results, which is a nice property for regression
+	 * testing.
+	 *
+	 * These words in the hash input are the same throughout the block:
+	 */
+	hashinput[1] = sampler->seed;
+
+	/*
+	 * Loop over block numbers until finding suitable block or reaching end of
+	 * relation.
+	 */
+	for (; nextblock < scan->rs_nblocks; nextblock++)
+	{
+		uint32		hash;
 
-	pctnode = linitial(args);
-	pctnode = estimate_expression_value(root, pctnode);
+		hashinput[0] = nextblock;
 
-	if (IsA(pctnode, RelabelType))
-		pctnode = (Node *) ((RelabelType *) pctnode)->arg;
+		hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
+									   (int) sizeof(hashinput)));
+		if (hash < sampler->cutoff)
+			break;
+	}
 
-	if (IsA(pctnode, Const))
+	if (nextblock < scan->rs_nblocks)
 	{
-		samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue);
-		samplesize /= 100.0;
+		/* Found a suitable block; remember where we should start next time */
+		sampler->nextblock = nextblock + 1;
+		return nextblock;
 	}
+
+	/* Done, but let's reset nextblock to 0 for safety. */
+	sampler->nextblock = 0;
+	return InvalidBlockNumber;
+}
+
+/*
+ * Select next sampled tuple in current block.
+ *
+ * In block sampling, we just want to sample all the tuples in each selected
+ * block.
+ *
+ * It is OK here to return an offset without knowing if the tuple is visible
+ * (or even exists); nodeSamplescan.c will deal with that.
+ *
+ * When we reach end of the block, return InvalidOffsetNumber which tells
+ * SampleScan to go to next block.
+ */
+static OffsetNumber
+system_nextsampletuple(SampleScanState *node,
+					   BlockNumber blockno,
+					   OffsetNumber maxoffset)
+{
+	SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+	OffsetNumber tupoffset = sampler->lt;
+
+	/* Advance to next possible offset on page */
+	if (tupoffset == InvalidOffsetNumber)
+		tupoffset = FirstOffsetNumber;
 	else
-	{
-		/* Default samplesize if the estimation didn't return Const. */
-		samplesize = 0.1f;
-	}
+		tupoffset++;
 
-	*pages = baserel->pages * samplesize;
-	*tuples = path->rows * samplesize;
-	path->rows = *tuples;
+	/* Done? */
+	if (tupoffset > maxoffset)
+		tupoffset = InvalidOffsetNumber;
+
+	sampler->lt = tupoffset;
 
-	PG_RETURN_VOID();
+	return tupoffset;
 }
diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c
index f21d42c8e38..b8ad7ced743 100644
--- a/src/backend/access/tablesample/tablesample.c
+++ b/src/backend/access/tablesample/tablesample.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * tablesample.c
- *		  TABLESAMPLE internal API
+ *		  Support functions for TABLESAMPLE feature
  *
  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -10,356 +10,31 @@
  * IDENTIFICATION
  *		  src/backend/access/tablesample/tablesample.c
  *
- * TABLESAMPLE is the SQL standard clause for sampling the relations.
- *
- * The API is interface between the Executor and the TABLESAMPLE Methods.
- *
- * TABLESAMPLE Methods are implementations of actual sampling algorithms which
- * can be used for returning a sample of the source relation.
- * Methods don't read the table directly but are asked for block number and
- * tuple offset which they want to examine (or return) and the tablesample
- * interface implemented here does the reading for them.
- *
- * We currently only support sampling of the physical relations, but in the
- * future we might extend the API to support subqueries as well.
- *
  * -------------------------------------------------------------------------
  */
 
 #include "postgres.h"
 
-#include "access/tablesample.h"
-
-#include "catalog/pg_tablesample_method.h"
-#include "miscadmin.h"
-#include "pgstat.h"
-#include "storage/bufmgr.h"
-#include "storage/predicate.h"
-#include "utils/rel.h"
-#include "utils/tqual.h"
-
-
-static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan);
-
-
-/*
- * Initialize the TABLESAMPLE Descriptor and the TABLESAMPLE Method.
- */
-TableSampleDesc *
-tablesample_init(SampleScanState *scanstate, TableSampleClause *tablesample)
-{
-	FunctionCallInfoData fcinfo;
-	int			i;
-	List	   *args = tablesample->args;
-	ListCell   *arg;
-	ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
-	TableSampleDesc *tsdesc = (TableSampleDesc *) palloc0(sizeof(TableSampleDesc));
-
-	/* Load functions */
-	fmgr_info(tablesample->tsminit, &(tsdesc->tsminit));
-	fmgr_info(tablesample->tsmnextblock, &(tsdesc->tsmnextblock));
-	fmgr_info(tablesample->tsmnexttuple, &(tsdesc->tsmnexttuple));
-	if (OidIsValid(tablesample->tsmexaminetuple))
-		fmgr_info(tablesample->tsmexaminetuple, &(tsdesc->tsmexaminetuple));
-	else
-		tsdesc->tsmexaminetuple.fn_oid = InvalidOid;
-	fmgr_info(tablesample->tsmreset, &(tsdesc->tsmreset));
-	fmgr_info(tablesample->tsmend, &(tsdesc->tsmend));
-
-	InitFunctionCallInfoData(fcinfo, &tsdesc->tsminit,
-							 list_length(args) + 2,
-							 InvalidOid, NULL, NULL);
-
-	tsdesc->tupDesc = scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
-	tsdesc->heapScan = scanstate->ss.ss_currentScanDesc;
-
-	/* First argument for init function is always TableSampleDesc */
-	fcinfo.arg[0] = PointerGetDatum(tsdesc);
-	fcinfo.argnull[0] = false;
+#include "access/tsmapi.h"
 
-	/*
-	 * Second arg for init function is always REPEATABLE.
-	 *
-	 * If tablesample->repeatable is NULL then REPEATABLE clause was not
-	 * specified, and we insert a random value as default.
-	 *
-	 * When specified, the expression cannot evaluate to NULL.
-	 */
-	if (tablesample->repeatable)
-	{
-		ExprState  *argstate = ExecInitExpr((Expr *) tablesample->repeatable,
-											(PlanState *) scanstate);
-
-		fcinfo.arg[1] = ExecEvalExpr(argstate, econtext,
-									 &fcinfo.argnull[1], NULL);
-		if (fcinfo.argnull[1])
-			ereport(ERROR,
-					(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
-				errmsg("REPEATABLE clause must be NOT NULL numeric value")));
-	}
-	else
-	{
-		fcinfo.arg[1] = UInt32GetDatum(random());
-		fcinfo.argnull[1] = false;
-	}
-
-	/* Rest of the arguments come from user. */
-	i = 2;
-	foreach(arg, args)
-	{
-		Expr	   *argexpr = (Expr *) lfirst(arg);
-		ExprState  *argstate = ExecInitExpr(argexpr, (PlanState *) scanstate);
-
-		fcinfo.arg[i] = ExecEvalExpr(argstate, econtext,
-									 &fcinfo.argnull[i], NULL);
-		i++;
-	}
-	Assert(i == fcinfo.nargs);
-
-	(void) FunctionCallInvoke(&fcinfo);
-
-	return tsdesc;
-}
 
 /*
- * Get next tuple from TABLESAMPLE Method.
- */
-HeapTuple
-tablesample_getnext(TableSampleDesc *desc)
-{
-	HeapScanDesc scan = desc->heapScan;
-	HeapTuple	tuple = &(scan->rs_ctup);
-	bool		pagemode = scan->rs_pageatatime;
-	BlockNumber blockno;
-	Page		page;
-	bool		page_all_visible;
-	ItemId		itemid;
-	OffsetNumber tupoffset,
-				maxoffset;
-
-	if (!scan->rs_inited)
-	{
-		/*
-		 * return null immediately if relation is empty
-		 */
-		if (scan->rs_nblocks == 0)
-		{
-			Assert(!BufferIsValid(scan->rs_cbuf));
-			tuple->t_data = NULL;
-			return NULL;
-		}
-		blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock,
-											  PointerGetDatum(desc)));
-		if (!BlockNumberIsValid(blockno))
-		{
-			tuple->t_data = NULL;
-			return NULL;
-		}
-
-		heapgetpage(scan, blockno);
-		scan->rs_inited = true;
-	}
-	else
-	{
-		/* continue from previously returned page/tuple */
-		blockno = scan->rs_cblock;		/* current page */
-	}
-
-	/*
-	 * When pagemode is disabled, the scan will do visibility checks for each
-	 * tuple it finds so the buffer needs to be locked.
-	 */
-	if (!pagemode)
-		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
-
-	page = (Page) BufferGetPage(scan->rs_cbuf);
-	page_all_visible = PageIsAllVisible(page);
-	maxoffset = PageGetMaxOffsetNumber(page);
-
-	for (;;)
-	{
-		CHECK_FOR_INTERRUPTS();
-
-		tupoffset = DatumGetUInt16(FunctionCall3(&desc->tsmnexttuple,
-												 PointerGetDatum(desc),
-												 UInt32GetDatum(blockno),
-												 UInt16GetDatum(maxoffset)));
-
-		if (OffsetNumberIsValid(tupoffset))
-		{
-			bool		visible;
-			bool		found;
-
-			/* Skip invalid tuple pointers. */
-			itemid = PageGetItemId(page, tupoffset);
-			if (!ItemIdIsNormal(itemid))
-				continue;
-
-			tuple->t_data = (HeapTupleHeader) PageGetItem((Page) page, itemid);
-			tuple->t_len = ItemIdGetLength(itemid);
-			ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
-
-			if (page_all_visible)
-				visible = true;
-			else
-				visible = SampleTupleVisible(tuple, tupoffset, scan);
-
-			/*
-			 * Let the sampling method examine the actual tuple and decide if
-			 * we should return it.
-			 *
-			 * Note that we let it examine even invisible tuples for
-			 * statistical purposes, but not return them since user should
-			 * never see invisible tuples.
-			 */
-			if (OidIsValid(desc->tsmexaminetuple.fn_oid))
-			{
-				found = DatumGetBool(FunctionCall4(&desc->tsmexaminetuple,
-												   PointerGetDatum(desc),
-												   UInt32GetDatum(blockno),
-												   PointerGetDatum(tuple),
-												   BoolGetDatum(visible)));
-				/* Should not happen if sampling method is well written. */
-				if (found && !visible)
-					elog(ERROR, "Sampling method wanted to return invisible tuple");
-			}
-			else
-				found = visible;
-
-			/* Found visible tuple, return it. */
-			if (found)
-			{
-				if (!pagemode)
-					LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
-				break;
-			}
-			else
-			{
-				/* Try next tuple from same page. */
-				continue;
-			}
-		}
-
-
-		if (!pagemode)
-			LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
-
-		blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock,
-											  PointerGetDatum(desc)));
-
-		/*
-		 * Report our new scan position for synchronization purposes. We don't
-		 * do that when moving backwards, however. That would just mess up any
-		 * other forward-moving scanners.
-		 *
-		 * Note: we do this before checking for end of scan so that the final
-		 * state of the position hint is back at the start of the rel.  That's
-		 * not strictly necessary, but otherwise when you run the same query
-		 * multiple times the starting position would shift a little bit
-		 * backwards on every invocation, which is confusing. We don't
-		 * guarantee any specific ordering in general, though.
-		 */
-		if (scan->rs_syncscan)
-			ss_report_location(scan->rs_rd, BlockNumberIsValid(blockno) ?
-							   blockno : scan->rs_startblock);
-
-		/*
-		 * Reached end of scan.
-		 */
-		if (!BlockNumberIsValid(blockno))
-		{
-			if (BufferIsValid(scan->rs_cbuf))
-				ReleaseBuffer(scan->rs_cbuf);
-			scan->rs_cbuf = InvalidBuffer;
-			scan->rs_cblock = InvalidBlockNumber;
-			tuple->t_data = NULL;
-			scan->rs_inited = false;
-			return NULL;
-		}
-
-		heapgetpage(scan, blockno);
-
-		if (!pagemode)
-			LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
-
-		page = (Page) BufferGetPage(scan->rs_cbuf);
-		page_all_visible = PageIsAllVisible(page);
-		maxoffset = PageGetMaxOffsetNumber(page);
-	}
-
-	pgstat_count_heap_getnext(scan->rs_rd);
-
-	return &(scan->rs_ctup);
-}
-
-/*
- * Reset the sampling to starting state
- */
-void
-tablesample_reset(TableSampleDesc *desc)
-{
-	(void) FunctionCall1(&desc->tsmreset, PointerGetDatum(desc));
-}
-
-/*
- * Signal the sampling method that the scan has finished.
- */
-void
-tablesample_end(TableSampleDesc *desc)
-{
-	(void) FunctionCall1(&desc->tsmend, PointerGetDatum(desc));
-}
-
-/*
- * Check visibility of the tuple.
+ * GetTsmRoutine --- get a TsmRoutine struct by invoking the handler.
+ *
+ * This is a convenience routine that's just meant to check for errors.
  */
-static bool
-SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
+TsmRoutine *
+GetTsmRoutine(Oid tsmhandler)
 {
-	/*
-	 * If this scan is reading whole pages at a time, there is already
-	 * visibility info present in rs_vistuples so we can just search it for
-	 * the tupoffset.
-	 */
-	if (scan->rs_pageatatime)
-	{
-		int			start = 0,
-					end = scan->rs_ntuples - 1;
-
-		/*
-		 * Do the binary search over rs_vistuples, it's already sorted by
-		 * OffsetNumber so we don't need to do any sorting ourselves here.
-		 *
-		 * We could use bsearch() here but it's slower for integers because of
-		 * the function call overhead and because it needs boiler plate code
-		 * it would not save us anything code-wise anyway.
-		 */
-		while (start <= end)
-		{
-			int			mid = start + (end - start) / 2;
-			OffsetNumber curoffset = scan->rs_vistuples[mid];
-
-			if (curoffset == tupoffset)
-				return true;
-			else if (curoffset > tupoffset)
-				end = mid - 1;
-			else
-				start = mid + 1;
-		}
-
-		return false;
-	}
-	else
-	{
-		/* No pagemode, we have to check the tuple itself. */
-		Snapshot	snapshot = scan->rs_snapshot;
-		Buffer		buffer = scan->rs_cbuf;
+	Datum		datum;
+	TsmRoutine *routine;
 
-		bool		visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
+	datum = OidFunctionCall1(tsmhandler, PointerGetDatum(NULL));
+	routine = (TsmRoutine *) DatumGetPointer(datum);
 
-		CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, buffer,
-										snapshot);
+	if (routine == NULL || !IsA(routine, TsmRoutine))
+		elog(ERROR, "tablesample handler function %u did not return a TsmRoutine struct",
+			 tsmhandler);
 
-		return visible;
-	}
+	return routine;
 }