From f6d208d6e51810c73f0e02c477984a6b44627f11 Mon Sep 17 00:00:00 2001 From: Simon Riggs Date: Fri, 15 May 2015 14:37:10 -0400 Subject: TABLESAMPLE, SQL Standard and extensible Add a TABLESAMPLE clause to SELECT statements that allows user to specify random BERNOULLI sampling or block level SYSTEM sampling. Implementation allows for extensible sampling functions to be written, using a standard API. Basic version follows SQLStandard exactly. Usable concrete use cases for the sampling API follow in later commits. Petr Jelinek Reviewed by Michael Paquier and Simon Riggs --- doc/src/sgml/catalogs.sgml | 120 +++++++++++++++++++++++++++++++++++++++++++ doc/src/sgml/ref/select.sgml | 61 +++++++++++++++++++++- 2 files changed, 180 insertions(+), 1 deletion(-) (limited to 'doc/src') diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 6c51974961b..5b364876093 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -278,6 +278,11 @@ planner statistics + + pg_tablesample_method + table sampling methods + + pg_tablespace tablespaces within this database cluster @@ -6127,6 +6132,121 @@ + + <structname>pg_tabesample_method</structname> + + + pg_am + + + + The catalog pg_tablesample_method stores + information about table sampling methods which can be used in + TABLESAMPLE clause of a SELECT + statement. + + + + <structname>pg_tablesample_method</> Columns + + + + + Name + Type + References + Description + + + + + + oid + oid + + Row identifier (hidden attribute; must be explicitly selected) + + + + tsmname + name + + Name of the sampling method + + + + tsmseqscan + bool + + If true, the sampling method scans the whole table sequentially. + + + + + tsmpagemode + bool + + If true, the sampling method always reads the pages completely. + + + + + tsminit + regproc + pg_proc.oid + Initialize the sampling scan function + + + + tsmnextblock + regproc + pg_proc.oid + Get next block number function + + + + tsmnexttuple + regproc + pg_proc.oid + Get next tuple offset function + + + + tsmexaminetuple + regproc + pg_proc.oid + Function which examines the tuple contents and decides if to + return it, or zero if none + + + + tsmend + regproc + pg_proc.oid + End the sampling scan function + + + + tsmreset + regproc + pg_proc.oid + Restart the state of sampling scan function + + + + tsmcost + regproc + pg_proc.oid + Costing function + + + + +
+ +
+ + <structname>pg_tablespace</structname> diff --git a/doc/src/sgml/ref/select.sgml b/doc/src/sgml/ref/select.sgml index 2295f63c130..42e04660a19 100644 --- a/doc/src/sgml/ref/select.sgml +++ b/doc/src/sgml/ref/select.sgml @@ -49,7 +49,7 @@ SELECT [ ALL | DISTINCT [ ON ( expressionwhere from_item can be one of: - [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] + [ ONLY ] table_name [ * ] [ [ AS ] alias [ ( column_alias [, ...] ) ] ] [ TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] ] [ LATERAL ] ( select ) [ AS ] alias [ ( column_alias [, ...] ) ] with_query_name [ [ AS ] alias [ ( column_alias [, ...] ) ] ] [ LATERAL ] function_name ( [ argument [, ...] ] ) @@ -316,6 +316,50 @@ TABLE [ ONLY ] table_name [ * ] + + TABLESAMPLE sampling_method ( argument [, ...] ) [ REPEATABLE ( seed ) ] + + + Table sample clause after + table_name indicates that + a sampling_method should + be used to retrieve subset of rows in the table. + The sampling_method can be + any sampling method installed in the database. There are currently two + sampling methods available in the standard + PostgreSQL distribution: + + + SYSTEM + + + BERNOULLI + + + Both of these sampling methods currently accept only single argument + which is the percent (floating point from 0 to 100) of the rows to + be returned. + The SYSTEM sampling method does block level + sampling with each block having the same chance of being selected and + returns all rows from each selected block. + The BERNOULLI scans whole table and returns + individual rows with equal probability. Additional sampling methods + may be installed in the database via extensions. + + + The optional parameter REPEATABLE uses the seed + parameter, which can be a number or expression producing a number, as + a random seed for sampling. Note that subsequent commands may return + different results even if same REPEATABLE clause was + specified. This happens because DML statements and + maintenance operations such as VACUUM may affect physical + distribution of data. The setseed() function will not + affect the sampling result when the REPEATABLE + parameter is used. + + + + alias @@ -1927,5 +1971,20 @@ SELECT distributors.* WHERE distributors.name = 'Westward'; ROWS FROM( ... ) is an extension of the SQL standard. + + + <literal>TABLESAMPLE</literal> clause + + + The TABLESAMPLE clause is currently accepted only on physical + relations and materialized views. + + + + Additional modules allow you to install custom sampling methods and use + them instead of the SQL standard methods. + + + -- cgit v1.2.3