diff options
Diffstat (limited to 'src/backend/commands/statscmds.c')
-rw-r--r-- | src/backend/commands/statscmds.c | 437 |
1 files changed, 273 insertions, 164 deletions
diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c index 2bae2058459..df4768952d5 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -29,6 +29,8 @@ #include "commands/comment.h" #include "commands/defrem.h" #include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/optimizer.h" #include "statistics/statistics.h" #include "utils/builtins.h" #include "utils/fmgroids.h" @@ -62,7 +64,8 @@ ObjectAddress CreateStatistics(CreateStatsStmt *stmt) { int16 attnums[STATS_MAX_DIMENSIONS]; - int numcols = 0; + int nattnums = 0; + int numcols; char *namestr; NameData stxname; Oid statoid; @@ -74,21 +77,25 @@ CreateStatistics(CreateStatsStmt *stmt) Datum datavalues[Natts_pg_statistic_ext_data]; bool datanulls[Natts_pg_statistic_ext_data]; int2vector *stxkeys; + List *stxexprs = NIL; + Datum exprsDatum; Relation statrel; Relation datarel; Relation rel = NULL; Oid relid; ObjectAddress parentobject, myself; - Datum types[3]; /* one for each possible type of statistic */ + Datum types[4]; /* one for each possible type of statistic */ int ntypes; ArrayType *stxkind; bool build_ndistinct; bool build_dependencies; bool build_mcv; + bool build_expressions; bool requested_type = false; int i; ListCell *cell; + ListCell *cell2; Assert(IsA(stmt, CreateStatsStmt)); @@ -190,101 +197,124 @@ CreateStatistics(CreateStatsStmt *stmt) } /* - * Currently, we only allow simple column references in the expression - * list. That will change someday, and again the grammar already supports - * it so we have to enforce restrictions here. For now, we can convert - * the expression list to a simple array of attnums. While at it, enforce - * some constraints. + * Make sure no more than STATS_MAX_DIMENSIONS columns are used. There + * might be duplicates and so on, but we'll deal with those later. + */ + numcols = list_length(stmt->exprs); + if (numcols > STATS_MAX_DIMENSIONS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("cannot have more than %d columns in statistics", + STATS_MAX_DIMENSIONS))); + + /* + * Convert the expression list to a simple array of attnums, but also keep + * a list of more complex expressions. While at it, enforce some + * constraints. + * + * XXX We do only the bare minimum to separate simple attribute and + * complex expressions - for example "(a)" will be treated as a complex + * expression. No matter how elaborate the check is, there'll always be a + * way around it, if the user is determined (consider e.g. "(a+0)"), so + * it's not worth protecting against it. */ foreach(cell, stmt->exprs) { Node *expr = (Node *) lfirst(cell); - ColumnRef *cref; - char *attname; + StatsElem *selem; HeapTuple atttuple; Form_pg_attribute attForm; TypeCacheEntry *type; - if (!IsA(expr, ColumnRef)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only simple column references are allowed in CREATE STATISTICS"))); - cref = (ColumnRef *) expr; - - if (list_length(cref->fields) != 1) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only simple column references are allowed in CREATE STATISTICS"))); - attname = strVal((Value *) linitial(cref->fields)); - - atttuple = SearchSysCacheAttName(relid, attname); - if (!HeapTupleIsValid(atttuple)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column \"%s\" does not exist", - attname))); - attForm = (Form_pg_attribute) GETSTRUCT(atttuple); - - /* Disallow use of system attributes in extended stats */ - if (attForm->attnum <= 0) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("statistics creation on system columns is not supported"))); - - /* Disallow data types without a less-than operator */ - type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); - if (type->lt_opr == InvalidOid) + /* + * We should not get anything else than StatsElem, given the grammar. + * But let's keep it as a safety. + */ + if (!IsA(expr, StatsElem)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class", - attname, format_type_be(attForm->atttypid)))); + errmsg("only simple column references and expressions are allowed in CREATE STATISTICS"))); - /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */ - if (numcols >= STATS_MAX_DIMENSIONS) - ereport(ERROR, - (errcode(ERRCODE_TOO_MANY_COLUMNS), - errmsg("cannot have more than %d columns in statistics", - STATS_MAX_DIMENSIONS))); + selem = (StatsElem *) expr; - attnums[numcols] = attForm->attnum; - numcols++; - ReleaseSysCache(atttuple); + if (selem->name) /* column reference */ + { + char *attname; + + attname = selem->name; + + atttuple = SearchSysCacheAttName(relid, attname); + if (!HeapTupleIsValid(atttuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" does not exist", + attname))); + attForm = (Form_pg_attribute) GETSTRUCT(atttuple); + + /* Disallow use of system attributes in extended stats */ + if (attForm->attnum <= 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("statistics creation on system columns is not supported"))); + + /* Disallow data types without a less-than operator */ + type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class", + attname, format_type_be(attForm->atttypid)))); + + attnums[nattnums] = attForm->attnum; + nattnums++; + ReleaseSysCache(atttuple); + } + else /* expression */ + { + Node *expr = selem->expr; + Oid atttype; + + Assert(expr != NULL); + + /* + * Disallow data types without a less-than operator. + * + * We ignore this for statistics on a single expression, in which + * case we'll build the regular statistics only (and that code can + * deal with such data types). + */ + if (list_length(stmt->exprs) > 1) + { + atttype = exprType(expr); + type = lookup_type_cache(atttype, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("expression cannot be used in multivariate statistics because its type %s has no default btree operator class", + format_type_be(atttype)))); + } + + stxexprs = lappend(stxexprs, expr); + } } /* - * Check that at least two columns were specified in the statement. The - * upper bound was already checked in the loop above. - */ - if (numcols < 2) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("extended statistics require at least 2 columns"))); - - /* - * Sort the attnums, which makes detecting duplicates somewhat easier, and - * it does not hurt (it does not affect the efficiency, unlike for - * indexes, for example). - */ - qsort(attnums, numcols, sizeof(int16), compare_int16); - - /* - * Check for duplicates in the list of columns. The attnums are sorted so - * just check consecutive elements. + * Parse the statistics kinds. + * + * First check that if this is the case with a single expression, there + * are no statistics kinds specified (we don't allow that for the simple + * CREATE STATISTICS form). */ - for (i = 1; i < numcols; i++) + if ((list_length(stmt->exprs) == 1) && (list_length(stxexprs) == 1)) { - if (attnums[i] == attnums[i - 1]) + /* statistics kinds not specified */ + if (list_length(stmt->stat_types) > 0) ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_COLUMN), - errmsg("duplicate column name in statistics definition"))); + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("when building statistics on a single expression, statistics kinds may not be specified"))); } - /* Form an int2vector representation of the sorted column list */ - stxkeys = buildint2vector(attnums, numcols); - - /* - * Parse the statistics kinds. - */ + /* OK, let's check that we recognize the statistics kinds. */ build_ndistinct = false; build_dependencies = false; build_mcv = false; @@ -313,14 +343,91 @@ CreateStatistics(CreateStatsStmt *stmt) errmsg("unrecognized statistics kind \"%s\"", type))); } - /* If no statistic type was specified, build them all. */ - if (!requested_type) + + /* + * If no statistic type was specified, build them all (but only when the + * statistics is defined on more than one column/expression). + */ + if ((!requested_type) && (numcols >= 2)) { build_ndistinct = true; build_dependencies = true; build_mcv = true; } + /* + * When there are non-trivial expressions, build the expression stats + * automatically. This allows calculating good estimates for stats that + * consider per-clause estimates (e.g. functional dependencies). + */ + build_expressions = (list_length(stxexprs) > 0); + + /* + * Check that at least two columns were specified in the statement, or + * that we're building statistics on a single expression. + */ + if ((numcols < 2) && (list_length(stxexprs) != 1)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("extended statistics require at least 2 columns"))); + + /* + * Sort the attnums, which makes detecting duplicates somewhat easier, and + * it does not hurt (it does not matter for the contents, unlike for + * indexes, for example). + */ + qsort(attnums, nattnums, sizeof(int16), compare_int16); + + /* + * Check for duplicates in the list of columns. The attnums are sorted so + * just check consecutive elements. + */ + for (i = 1; i < nattnums; i++) + { + if (attnums[i] == attnums[i - 1]) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_COLUMN), + errmsg("duplicate column name in statistics definition"))); + } + + /* + * Check for duplicate expressions. We do two loops, counting the + * occurrences of each expression. This is O(N^2) but we only allow small + * number of expressions and it's not executed often. + * + * XXX We don't cross-check attributes and expressions, because it does + * not seem worth it. In principle we could check that expressions don't + * contain trivial attribute references like "(a)", but the reasoning is + * similar to why we don't bother with extracting columns from + * expressions. It's either expensive or very easy to defeat for + * determined user, and there's no risk if we allow such statistics (the + * statistics is useless, but harmless). + */ + foreach(cell, stxexprs) + { + Node *expr1 = (Node *) lfirst(cell); + int cnt = 0; + + foreach(cell2, stxexprs) + { + Node *expr2 = (Node *) lfirst(cell2); + + if (equal(expr1, expr2)) + cnt += 1; + } + + /* every expression should find at least itself */ + Assert(cnt >= 1); + + if (cnt > 1) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_COLUMN), + errmsg("duplicate expression in statistics definition"))); + } + + /* Form an int2vector representation of the sorted column list */ + stxkeys = buildint2vector(attnums, nattnums); + /* construct the char array of enabled statistic types */ ntypes = 0; if (build_ndistinct) @@ -329,9 +436,23 @@ CreateStatistics(CreateStatsStmt *stmt) types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES); if (build_mcv) types[ntypes++] = CharGetDatum(STATS_EXT_MCV); + if (build_expressions) + types[ntypes++] = CharGetDatum(STATS_EXT_EXPRESSIONS); Assert(ntypes > 0 && ntypes <= lengthof(types)); stxkind = construct_array(types, ntypes, CHAROID, 1, true, TYPALIGN_CHAR); + /* convert the expressions (if any) to a text datum */ + if (stxexprs != NIL) + { + char *exprsString; + + exprsString = nodeToString(stxexprs); + exprsDatum = CStringGetTextDatum(exprsString); + pfree(exprsString); + } + else + exprsDatum = (Datum) 0; + statrel = table_open(StatisticExtRelationId, RowExclusiveLock); /* @@ -351,6 +472,10 @@ CreateStatistics(CreateStatsStmt *stmt) values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys); values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind); + values[Anum_pg_statistic_ext_stxexprs - 1] = exprsDatum; + if (exprsDatum == (Datum) 0) + nulls[Anum_pg_statistic_ext_stxexprs - 1] = true; + /* insert it into pg_statistic_ext */ htup = heap_form_tuple(statrel->rd_att, values, nulls); CatalogTupleInsert(statrel, htup); @@ -373,6 +498,7 @@ CreateStatistics(CreateStatsStmt *stmt) datanulls[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true; datanulls[Anum_pg_statistic_ext_data_stxddependencies - 1] = true; datanulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true; + datanulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = true; /* insert it into pg_statistic_ext_data */ htup = heap_form_tuple(datarel->rd_att, datavalues, datanulls); @@ -396,13 +522,42 @@ CreateStatistics(CreateStatsStmt *stmt) */ ObjectAddressSet(myself, StatisticExtRelationId, statoid); - for (i = 0; i < numcols; i++) + /* add dependencies for plain column references */ + for (i = 0; i < nattnums; i++) { ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]); recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); } /* + * If there are no dependencies on a column, give the statistics an auto + * dependency on the whole table. In most cases, this will be redundant, + * but it might not be if the statistics expressions contain no Vars + * (which might seem strange but possible). This is consistent with what + * we do for indexes in index_create. + * + * XXX We intentionally don't consider the expressions before adding this + * dependency, because recordDependencyOnSingleRelExpr may not create any + * dependencies for whole-row Vars. + */ + if (!nattnums) + { + ObjectAddressSet(parentobject, RelationRelationId, relid); + recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); + } + + /* + * Store dependencies on anything mentioned in statistics expressions, + * just like we do for index expressions. + */ + if (stxexprs) + recordDependencyOnSingleRelExpr(&myself, + (Node *) stxexprs, + relid, + DEPENDENCY_NORMAL, + DEPENDENCY_AUTO, false, true); + + /* * Also add dependencies on namespace and owner. These are required * because the stats object might have a different namespace and/or owner * than the underlying table(s). @@ -583,87 +738,6 @@ RemoveStatisticsById(Oid statsOid) } /* - * Update a statistics object for ALTER COLUMN TYPE on a source column. - * - * This could throw an error if the type change can't be supported. - * If it can be supported, but the stats must be recomputed, a likely choice - * would be to set the relevant column(s) of the pg_statistic_ext_data tuple - * to null until the next ANALYZE. (Note that the type change hasn't actually - * happened yet, so one option that's *not* on the table is to recompute - * immediately.) - * - * For both ndistinct and functional-dependencies stats, the on-disk - * representation is independent of the source column data types, and it is - * plausible to assume that the old statistic values will still be good for - * the new column contents. (Obviously, if the ALTER COLUMN TYPE has a USING - * expression that substantially alters the semantic meaning of the column - * values, this assumption could fail. But that seems like a corner case - * that doesn't justify zapping the stats in common cases.) - * - * For MCV lists that's not the case, as those statistics store the datums - * internally. In this case we simply reset the statistics value to NULL. - * - * Note that "type change" includes collation change, which means we can rely - * on the MCV list being consistent with the collation info in pg_attribute - * during estimation. - */ -void -UpdateStatisticsForTypeChange(Oid statsOid, Oid relationOid, int attnum, - Oid oldColumnType, Oid newColumnType) -{ - HeapTuple stup, - oldtup; - - Relation rel; - - Datum values[Natts_pg_statistic_ext_data]; - bool nulls[Natts_pg_statistic_ext_data]; - bool replaces[Natts_pg_statistic_ext_data]; - - oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statsOid)); - if (!HeapTupleIsValid(oldtup)) - elog(ERROR, "cache lookup failed for statistics object %u", statsOid); - - /* - * When none of the defined statistics types contain datum values from the - * table's columns then there's no need to reset the stats. Functional - * dependencies and ndistinct stats should still hold true. - */ - if (!statext_is_kind_built(oldtup, STATS_EXT_MCV)) - { - ReleaseSysCache(oldtup); - return; - } - - /* - * OK, we need to reset some statistics. So let's build the new tuple, - * replacing the affected statistics types with NULL. - */ - memset(nulls, 0, Natts_pg_statistic_ext_data * sizeof(bool)); - memset(replaces, 0, Natts_pg_statistic_ext_data * sizeof(bool)); - memset(values, 0, Natts_pg_statistic_ext_data * sizeof(Datum)); - - replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true; - nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true; - - rel = table_open(StatisticExtDataRelationId, RowExclusiveLock); - - /* replace the old tuple */ - stup = heap_modify_tuple(oldtup, - RelationGetDescr(rel), - values, - nulls, - replaces); - - ReleaseSysCache(oldtup); - CatalogTupleUpdate(rel, &stup->t_self, stup); - - heap_freetuple(stup); - - table_close(rel, RowExclusiveLock); -} - -/* * Select a nonconflicting name for a new statistics. * * name1, name2, and label are used the same way as for makeObjectName(), @@ -731,19 +805,28 @@ ChooseExtendedStatisticNameAddition(List *exprs) buf[0] = '\0'; foreach(lc, exprs) { - ColumnRef *cref = (ColumnRef *) lfirst(lc); + StatsElem *selem = (StatsElem *) lfirst(lc); const char *name; /* It should be one of these, but just skip if it happens not to be */ - if (!IsA(cref, ColumnRef)) + if (!IsA(selem, StatsElem)) continue; - name = strVal((Value *) linitial(cref->fields)); + name = selem->name; if (buflen > 0) buf[buflen++] = '_'; /* insert _ between names */ /* + * We use fixed 'expr' for expressions, which have empty column names. + * For indexes this is handled in ChooseIndexColumnNames, but we have + * no such function for stats and it does not seem worth adding. If a + * better name is needed, the user can specify it explicitly. + */ + if (!name) + name = "expr"; + + /* * At this point we have buflen <= NAMEDATALEN. name should be less * than NAMEDATALEN already, but use strlcpy for paranoia. */ @@ -754,3 +837,29 @@ ChooseExtendedStatisticNameAddition(List *exprs) } return pstrdup(buf); } + +/* + * StatisticsGetRelation: given a statistics's relation OID, get the OID of + * the relation it is an statistics on. Uses the system cache. + */ +Oid +StatisticsGetRelation(Oid statId, bool missing_ok) +{ + HeapTuple tuple; + Form_pg_statistic_ext stx; + Oid result; + + tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statId)); + if (!HeapTupleIsValid(tuple)) + { + if (missing_ok) + return InvalidOid; + elog(ERROR, "cache lookup failed for statistics object %u", statId); + } + stx = (Form_pg_statistic_ext) GETSTRUCT(tuple); + Assert(stx->oid == statId); + + result = stx->stxrelid; + ReleaseSysCache(tuple); + return result; +} |