summaryrefslogtreecommitdiff
path: root/src/backend/statistics/mcv.c
diff options
context:
space:
mode:
authorTomas Vondra <tomas.vondra@postgresql.org>2021-03-26 23:22:01 +0100
committerTomas Vondra <tomas.vondra@postgresql.org>2021-03-27 00:01:11 +0100
commita4d75c86bf15220df22de0a92c819ecef9db3849 (patch)
treea736a68b1c3f022590a886b7bac45276f1f490a6 /src/backend/statistics/mcv.c
parent98376c18f12e562421b5c77e619248e8b7aae3c6 (diff)
Extended statistics on expressions
Allow defining extended statistics on expressions, not just just on simple column references. With this commit, expressions are supported by all existing extended statistics kinds, improving the same types of estimates. A simple example may look like this: CREATE TABLE t (a int); CREATE STATISTICS s ON mod(a,10), mod(a,20) FROM t; ANALYZE t; The collected statistics are useful e.g. to estimate queries with those expressions in WHERE or GROUP BY clauses: SELECT * FROM t WHERE mod(a,10) = 0 AND mod(a,20) = 0; SELECT 1 FROM t GROUP BY mod(a,10), mod(a,20); This introduces new internal statistics kind 'e' (expressions) which is built automatically when the statistics object definition includes any expressions. This represents single-expression statistics, as if there was an expression index (but without the index maintenance overhead). The statistics is stored in pg_statistics_ext_data as an array of composite types, which is possible thanks to 79f6a942bd. CREATE STATISTICS allows building statistics on a single expression, in which case in which case it's not possible to specify statistics kinds. A new system view pg_stats_ext_exprs can be used to display expression statistics, similarly to pg_stats and pg_stats_ext views. ALTER TABLE ... ALTER COLUMN ... TYPE now treats indexes the same way it treats indexes, i.e. it drops and recreates the statistics. This means all statistics are reset, and we no longer try to preserve at least the functional dependencies. This should not be a major issue in practice, as the functional dependencies actually rely on per-column statistics, which were always reset anyway. Author: Tomas Vondra Reviewed-by: Justin Pryzby, Dean Rasheed, Zhihong Yu Discussion: https://postgr.es/m/ad7891d2-e90c-b446-9fe2-7419143847d7%40enterprisedb.com
Diffstat (limited to 'src/backend/statistics/mcv.c')
-rw-r--r--src/backend/statistics/mcv.c369
1 files changed, 214 insertions, 155 deletions
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 8335dff2418..2a00fb48483 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -74,7 +74,7 @@
((ndims) * sizeof(DimensionInfo)) + \
((nitems) * ITEM_SIZE(ndims)))
-static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
+static MultiSortSupport build_mss(StatsBuildData *data);
static SortItem *build_distinct_groups(int numrows, SortItem *items,
MultiSortSupport mss, int *ndistinct);
@@ -181,32 +181,33 @@ get_mincount_for_mcv_list(int samplerows, double totalrows)
*
*/
MCVList *
-statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
- VacAttrStats **stats, double totalrows, int stattarget)
+statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget)
{
int i,
numattrs,
+ numrows,
ngroups,
nitems;
- AttrNumber *attnums;
double mincount;
SortItem *items;
SortItem *groups;
MCVList *mcvlist = NULL;
MultiSortSupport mss;
- attnums = build_attnums_array(attrs, &numattrs);
-
/* comparator for all the columns */
- mss = build_mss(stats, numattrs);
+ mss = build_mss(data);
/* sort the rows */
- items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
- mss, numattrs, attnums);
+ items = build_sorted_items(data, &nitems, mss,
+ data->nattnums, data->attnums);
if (!items)
return NULL;
+ /* for convenience */
+ numattrs = data->nattnums;
+ numrows = data->numrows;
+
/* transform the sorted rows into groups (sorted by frequency) */
groups = build_distinct_groups(nitems, items, mss, &ngroups);
@@ -289,7 +290,7 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
/* store info about data type OIDs */
for (i = 0; i < numattrs; i++)
- mcvlist->types[i] = stats[i]->attrtypid;
+ mcvlist->types[i] = data->stats[i]->attrtypid;
/* Copy the first chunk of groups into the result. */
for (i = 0; i < nitems; i++)
@@ -347,9 +348,10 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
* build MultiSortSupport for the attributes passed in attrs
*/
static MultiSortSupport
-build_mss(VacAttrStats **stats, int numattrs)
+build_mss(StatsBuildData *data)
{
int i;
+ int numattrs = data->nattnums;
/* Sort by multiple columns (using array of SortSupport) */
MultiSortSupport mss = multi_sort_init(numattrs);
@@ -357,7 +359,7 @@ build_mss(VacAttrStats **stats, int numattrs)
/* prepare the sort functions for all the attributes */
for (i = 0; i < numattrs; i++)
{
- VacAttrStats *colstat = stats[i];
+ VacAttrStats *colstat = data->stats[i];
TypeCacheEntry *type;
type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
@@ -1524,6 +1526,59 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
}
/*
+ * match the attribute/expression to a dimension of the statistic
+ *
+ * Match the attribute/expression to statistics dimension. Optionally
+ * determine the collation.
+ */
+static int
+mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid)
+{
+ int idx = -1;
+
+ if (IsA(expr, Var))
+ {
+ /* simple Var, so just lookup using varattno */
+ Var *var = (Var *) expr;
+
+ if (collid)
+ *collid = var->varcollid;
+
+ idx = bms_member_index(keys, var->varattno);
+
+ /* make sure the index is valid */
+ Assert((idx >= 0) && (idx <= bms_num_members(keys)));
+ }
+ else
+ {
+ ListCell *lc;
+
+ /* expressions are stored after the simple columns */
+ idx = bms_num_members(keys);
+
+ if (collid)
+ *collid = exprCollation(expr);
+
+ /* expression - lookup in stats expressions */
+ foreach(lc, exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc);
+
+ if (equal(expr, stat_expr))
+ break;
+
+ idx++;
+ }
+
+ /* make sure the index is valid */
+ Assert((idx >= bms_num_members(keys)) &&
+ (idx <= bms_num_members(keys) + list_length(exprs)));
+ }
+
+ return idx;
+}
+
+/*
* mcv_get_match_bitmap
* Evaluate clauses using the MCV list, and update the match bitmap.
*
@@ -1544,7 +1599,8 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
*/
static bool *
mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
- Bitmapset *keys, MCVList *mcvlist, bool is_or)
+ Bitmapset *keys, List *exprs,
+ MCVList *mcvlist, bool is_or)
{
int i;
ListCell *l;
@@ -1582,77 +1638,78 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
OpExpr *expr = (OpExpr *) clause;
FmgrInfo opproc;
- /* valid only after examine_clause_args returns true */
- Var *var;
+ /* valid only after examine_opclause_args returns true */
+ Node *clause_expr;
Const *cst;
- bool varonleft;
+ bool expronleft;
+ int idx;
+ Oid collid;
fmgr_info(get_opcode(expr->opno), &opproc);
- /* extract the var and const from the expression */
- if (examine_clause_args(expr->args, &var, &cst, &varonleft))
+ /* extract the var/expr and const from the expression */
+ if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
+ elog(ERROR, "incompatible clause");
+
+ /* match the attribute/expression to a dimension of the statistic */
+ idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
+
+ /*
+ * Walk through the MCV items and evaluate the current clause. We
+ * can skip items that were already ruled out, and terminate if
+ * there are no remaining MCV items that might possibly match.
+ */
+ for (i = 0; i < mcvlist->nitems; i++)
{
- int idx;
+ bool match = true;
+ MCVItem *item = &mcvlist->items[i];
- /* match the attribute to a dimension of the statistic */
- idx = bms_member_index(keys, var->varattno);
+ Assert(idx >= 0);
/*
- * Walk through the MCV items and evaluate the current clause.
- * We can skip items that were already ruled out, and
- * terminate if there are no remaining MCV items that might
- * possibly match.
+ * When the MCV item or the Const value is NULL we can treat
+ * this as a mismatch. We must not call the operator because
+ * of strictness.
*/
- for (i = 0; i < mcvlist->nitems; i++)
+ if (item->isnull[idx] || cst->constisnull)
{
- bool match = true;
- MCVItem *item = &mcvlist->items[i];
-
- /*
- * When the MCV item or the Const value is NULL we can
- * treat this as a mismatch. We must not call the operator
- * because of strictness.
- */
- if (item->isnull[idx] || cst->constisnull)
- {
- matches[i] = RESULT_MERGE(matches[i], is_or, false);
- continue;
- }
+ matches[i] = RESULT_MERGE(matches[i], is_or, false);
+ continue;
+ }
- /*
- * Skip MCV items that can't change result in the bitmap.
- * Once the value gets false for AND-lists, or true for
- * OR-lists, we don't need to look at more clauses.
- */
- if (RESULT_IS_FINAL(matches[i], is_or))
- continue;
+ /*
+ * Skip MCV items that can't change result in the bitmap. Once
+ * the value gets false for AND-lists, or true for OR-lists,
+ * we don't need to look at more clauses.
+ */
+ if (RESULT_IS_FINAL(matches[i], is_or))
+ continue;
- /*
- * First check whether the constant is below the lower
- * boundary (in that case we can skip the bucket, because
- * there's no overlap).
- *
- * We don't store collations used to build the statistics,
- * but we can use the collation for the attribute itself,
- * as stored in varcollid. We do reset the statistics
- * after a type change (including collation change), so
- * this is OK. We may need to relax this after allowing
- * extended statistics on expressions.
- */
- if (varonleft)
- match = DatumGetBool(FunctionCall2Coll(&opproc,
- var->varcollid,
- item->values[idx],
- cst->constvalue));
- else
- match = DatumGetBool(FunctionCall2Coll(&opproc,
- var->varcollid,
- cst->constvalue,
- item->values[idx]));
-
- /* update the match bitmap with the result */
- matches[i] = RESULT_MERGE(matches[i], is_or, match);
- }
+ /*
+ * First check whether the constant is below the lower
+ * boundary (in that case we can skip the bucket, because
+ * there's no overlap).
+ *
+ * We don't store collations used to build the statistics, but
+ * we can use the collation for the attribute itself, as
+ * stored in varcollid. We do reset the statistics after a
+ * type change (including collation change), so this is OK.
+ * For expressions we use the collation extracted from the
+ * expression itself.
+ */
+ if (expronleft)
+ match = DatumGetBool(FunctionCall2Coll(&opproc,
+ collid,
+ item->values[idx],
+ cst->constvalue));
+ else
+ match = DatumGetBool(FunctionCall2Coll(&opproc,
+ collid,
+ cst->constvalue,
+ item->values[idx]));
+
+ /* update the match bitmap with the result */
+ matches[i] = RESULT_MERGE(matches[i], is_or, match);
}
}
else if (IsA(clause, ScalarArrayOpExpr))
@@ -1660,115 +1717,116 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
FmgrInfo opproc;
- /* valid only after examine_clause_args returns true */
- Var *var;
+ /* valid only after examine_opclause_args returns true */
+ Node *clause_expr;
Const *cst;
- bool varonleft;
+ bool expronleft;
+ Oid collid;
+ int idx;
+
+ /* array evaluation */
+ ArrayType *arrayval;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ int num_elems;
+ Datum *elem_values;
+ bool *elem_nulls;
fmgr_info(get_opcode(expr->opno), &opproc);
- /* extract the var and const from the expression */
- if (examine_clause_args(expr->args, &var, &cst, &varonleft))
+ /* extract the var/expr and const from the expression */
+ if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
+ elog(ERROR, "incompatible clause");
+
+ /* ScalarArrayOpExpr has the Var always on the left */
+ Assert(expronleft);
+
+ /* XXX what if (cst->constisnull == NULL)? */
+ if (!cst->constisnull)
{
- int idx;
+ arrayval = DatumGetArrayTypeP(cst->constvalue);
+ get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
+ &elmlen, &elmbyval, &elmalign);
+ deconstruct_array(arrayval,
+ ARR_ELEMTYPE(arrayval),
+ elmlen, elmbyval, elmalign,
+ &elem_values, &elem_nulls, &num_elems);
+ }
- ArrayType *arrayval;
- int16 elmlen;
- bool elmbyval;
- char elmalign;
- int num_elems;
- Datum *elem_values;
- bool *elem_nulls;
+ /* match the attribute/expression to a dimension of the statistic */
+ idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
- /* ScalarArrayOpExpr has the Var always on the left */
- Assert(varonleft);
+ /*
+ * Walk through the MCV items and evaluate the current clause. We
+ * can skip items that were already ruled out, and terminate if
+ * there are no remaining MCV items that might possibly match.
+ */
+ for (i = 0; i < mcvlist->nitems; i++)
+ {
+ int j;
+ bool match = (expr->useOr ? false : true);
+ MCVItem *item = &mcvlist->items[i];
- if (!cst->constisnull)
+ /*
+ * When the MCV item or the Const value is NULL we can treat
+ * this as a mismatch. We must not call the operator because
+ * of strictness.
+ */
+ if (item->isnull[idx] || cst->constisnull)
{
- arrayval = DatumGetArrayTypeP(cst->constvalue);
- get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
- &elmlen, &elmbyval, &elmalign);
- deconstruct_array(arrayval,
- ARR_ELEMTYPE(arrayval),
- elmlen, elmbyval, elmalign,
- &elem_values, &elem_nulls, &num_elems);
+ matches[i] = RESULT_MERGE(matches[i], is_or, false);
+ continue;
}
- /* match the attribute to a dimension of the statistic */
- idx = bms_member_index(keys, var->varattno);
-
/*
- * Walk through the MCV items and evaluate the current clause.
- * We can skip items that were already ruled out, and
- * terminate if there are no remaining MCV items that might
- * possibly match.
+ * Skip MCV items that can't change result in the bitmap. Once
+ * the value gets false for AND-lists, or true for OR-lists,
+ * we don't need to look at more clauses.
*/
- for (i = 0; i < mcvlist->nitems; i++)
+ if (RESULT_IS_FINAL(matches[i], is_or))
+ continue;
+
+ for (j = 0; j < num_elems; j++)
{
- int j;
- bool match = (expr->useOr ? false : true);
- MCVItem *item = &mcvlist->items[i];
+ Datum elem_value = elem_values[j];
+ bool elem_isnull = elem_nulls[j];
+ bool elem_match;
- /*
- * When the MCV item or the Const value is NULL we can
- * treat this as a mismatch. We must not call the operator
- * because of strictness.
- */
- if (item->isnull[idx] || cst->constisnull)
+ /* NULL values always evaluate as not matching. */
+ if (elem_isnull)
{
- matches[i] = RESULT_MERGE(matches[i], is_or, false);
+ match = RESULT_MERGE(match, expr->useOr, false);
continue;
}
/*
- * Skip MCV items that can't change result in the bitmap.
- * Once the value gets false for AND-lists, or true for
- * OR-lists, we don't need to look at more clauses.
+ * Stop evaluating the array elements once we reach match
+ * value that can't change - ALL() is the same as
+ * AND-list, ANY() is the same as OR-list.
*/
- if (RESULT_IS_FINAL(matches[i], is_or))
- continue;
+ if (RESULT_IS_FINAL(match, expr->useOr))
+ break;
- for (j = 0; j < num_elems; j++)
- {
- Datum elem_value = elem_values[j];
- bool elem_isnull = elem_nulls[j];
- bool elem_match;
-
- /* NULL values always evaluate as not matching. */
- if (elem_isnull)
- {
- match = RESULT_MERGE(match, expr->useOr, false);
- continue;
- }
-
- /*
- * Stop evaluating the array elements once we reach
- * match value that can't change - ALL() is the same
- * as AND-list, ANY() is the same as OR-list.
- */
- if (RESULT_IS_FINAL(match, expr->useOr))
- break;
-
- elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
- var->varcollid,
- item->values[idx],
- elem_value));
-
- match = RESULT_MERGE(match, expr->useOr, elem_match);
- }
+ elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
+ collid,
+ item->values[idx],
+ elem_value));
- /* update the match bitmap with the result */
- matches[i] = RESULT_MERGE(matches[i], is_or, match);
+ match = RESULT_MERGE(match, expr->useOr, elem_match);
}
+
+ /* update the match bitmap with the result */
+ matches[i] = RESULT_MERGE(matches[i], is_or, match);
}
}
else if (IsA(clause, NullTest))
{
NullTest *expr = (NullTest *) clause;
- Var *var = (Var *) (expr->arg);
+ Node *clause_expr = (Node *) (expr->arg);
- /* match the attribute to a dimension of the statistic */
- int idx = bms_member_index(keys, var->varattno);
+ /* match the attribute/expression to a dimension of the statistic */
+ int idx = mcv_match_expression(clause_expr, keys, exprs, NULL);
/*
* Walk through the MCV items and evaluate the current clause. We
@@ -1811,7 +1869,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
Assert(list_length(bool_clauses) >= 2);
/* build the match bitmap for the OR-clauses */
- bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys,
+ bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, exprs,
mcvlist, is_orclause(clause));
/*
@@ -1839,7 +1897,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
Assert(list_length(not_args) == 1);
/* build the match bitmap for the NOT-clause */
- not_matches = mcv_get_match_bitmap(root, not_args, keys,
+ not_matches = mcv_get_match_bitmap(root, not_args, keys, exprs,
mcvlist, false);
/*
@@ -1982,7 +2040,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
mcv = statext_mcv_load(stat->statOid);
/* build a match bitmap for the clauses */
- matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false);
+ matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs,
+ mcv, false);
/* sum frequencies for all the matching MCV items */
*basesel = 0.0;
@@ -2056,7 +2115,7 @@ mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
/* build the match bitmap for the new clause */
new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys,
- mcv, false);
+ stat->exprs, mcv, false);
/*
* Sum the frequencies for all the MCV items matching this clause and also