diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/commands/analyze.c | 7 | ||||
-rw-r--r-- | src/backend/tsearch/ts_selfuncs.c | 11 | ||||
-rw-r--r-- | src/backend/tsearch/ts_typanalyze.c | 29 | ||||
-rw-r--r-- | src/backend/utils/adt/array_selfuncs.c | 23 | ||||
-rw-r--r-- | src/backend/utils/adt/array_typanalyze.c | 27 | ||||
-rw-r--r-- | src/include/catalog/pg_statistic.h | 4 |
6 files changed, 75 insertions, 26 deletions
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 8ea2913d906..12b4f3fd36e 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1711,10 +1711,9 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats) i = Anum_pg_statistic_stanumbers1 - 1; for (k = 0; k < STATISTIC_NUM_SLOTS; k++) { - int nnum = stats->numnumbers[k]; - - if (nnum > 0) + if (stats->stanumbers[k] != NULL) { + int nnum = stats->numnumbers[k]; Datum *numdatums = (Datum *) palloc(nnum * sizeof(Datum)); ArrayType *arry; @@ -1732,7 +1731,7 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats) i = Anum_pg_statistic_stavalues1 - 1; for (k = 0; k < STATISTIC_NUM_SLOTS; k++) { - if (stats->numvalues[k] > 0) + if (stats->stavalues[k] != NULL) { ArrayType *arry; diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c index 453a5e5c2ea..6a71ae6452d 100644 --- a/src/backend/tsearch/ts_selfuncs.c +++ b/src/backend/tsearch/ts_selfuncs.c @@ -239,8 +239,8 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, } /* - * Grab the lowest frequency. compute_tsvector_stats() stored it for us in - * the one before the last cell of the Numbers array. See ts_typanalyze.c + * Grab the lowest MCE frequency. compute_tsvector_stats() stored it for + * us in the one before the last cell of the Numbers array. */ minfreq = numbers[nnumbers - 2]; @@ -374,8 +374,11 @@ tsquery_opr_selec(QueryItem *item, char *operand, else { /* - * The element is not in MCELEM. Punt, but assume that the - * selectivity cannot be more than minfreq / 2. + * The element is not in MCELEM. Estimate its frequency as + * half that of the least-frequent MCE. (We know it cannot be + * more than minfreq, and it could be a great deal less. Half + * seems like a good compromise.) For probably-historical + * reasons, clamp to not more than DEFAULT_TS_MATCH_SEL. */ selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2); } diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c index c5a71331ce8..93aab00a3ca 100644 --- a/src/backend/tsearch/ts_typanalyze.c +++ b/src/backend/tsearch/ts_typanalyze.c @@ -73,7 +73,7 @@ ts_typanalyze(PG_FUNCTION_ARGS) /* * compute_tsvector_stats() -- compute statistics for a tsvector column * - * This functions computes statistics that are useful for determining @@ + * This function computes statistics that are useful for determining @@ * operations' selectivity, along with the fraction of non-null rows and * average width. * @@ -312,7 +312,7 @@ compute_tsvector_stats(VacAttrStats *stats, /* * Construct an array of the interesting hashtable items, that is, * those meeting the cutoff frequency (s - epsilon)*N. Also identify - * the minimum and maximum frequencies among these items. + * the maximum frequency among these items. * * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff * frequency is 9*N / bucket_width. @@ -324,14 +324,12 @@ compute_tsvector_stats(VacAttrStats *stats, hash_seq_init(&scan_status, lexemes_tab); track_len = 0; - minfreq = lexeme_no; maxfreq = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { if (item->frequency > cutoff_freq) { sort_table[track_len++] = item; - minfreq = Min(minfreq, item->frequency); maxfreq = Max(maxfreq, item->frequency); } } @@ -346,19 +344,38 @@ compute_tsvector_stats(VacAttrStats *stats, * If we obtained more lexemes than we really want, get rid of those * with least frequencies. The easiest way is to qsort the array into * descending frequency order and truncate the array. + * + * If we did not find more elements than we want, then it is safe to + * assume that the stored MCE array will contain every element with + * frequency above the cutoff. In that case, rather than storing the + * smallest frequency we are keeping, we want to store the minimum + * frequency that would have been accepted as a valid MCE. The + * selectivity functions can assume that that is an upper bound on the + * frequency of elements not present in the array. + * + * If we found no candidate MCEs at all, we still want to record the + * cutoff frequency, since it's still valid to assume that no element + * has frequency more than that. */ if (num_mcelem < track_len) { qsort_interruptible(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc, NULL); - /* reset minfreq to the smallest frequency we're keeping */ + /* set minfreq to the smallest frequency we're keeping */ minfreq = sort_table[num_mcelem - 1]->frequency; } else + { num_mcelem = track_len; + /* set minfreq to the minimum frequency above the cutoff */ + minfreq = cutoff_freq + 1; + /* ensure maxfreq is nonzero, too */ + if (track_len == 0) + maxfreq = minfreq; + } /* Generate MCELEM slot entry */ - if (num_mcelem > 0) + if (num_mcelem >= 0) { MemoryContext old_context; Datum *mcelem_values; diff --git a/src/backend/utils/adt/array_selfuncs.c b/src/backend/utils/adt/array_selfuncs.c index a69a84c2aee..cf6fbf8652c 100644 --- a/src/backend/utils/adt/array_selfuncs.c +++ b/src/backend/utils/adt/array_selfuncs.c @@ -544,12 +544,15 @@ mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, if (numbers) { - /* Grab the lowest observed frequency */ + /* Grab the minimal MCE frequency */ minfreq = numbers[nmcelem]; } else { - /* Without statistics make some default assumptions */ + /* + * Without statistics, use DEFAULT_CONTAIN_SEL (the factor of 2 will + * be removed again below). + */ minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL; } @@ -621,8 +624,11 @@ mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, else { /* - * The element is not in MCELEM. Punt, but assume that the - * selectivity cannot be more than minfreq / 2. + * The element is not in MCELEM. Estimate its frequency as half + * that of the least-frequent MCE. (We know it cannot be more + * than minfreq, and it could be a great deal less. Half seems + * like a good compromise.) For probably-historical reasons, + * clamp to not more than DEFAULT_CONTAIN_SEL. */ elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2); } @@ -728,7 +734,7 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem, /* * Grab some of the summary statistics that compute_array_stats() stores: - * lowest frequency, frequency of null elements, and average distinct + * lowest MCE frequency, frequency of null elements, and average distinct * element count. */ minfreq = numbers[nmcelem]; @@ -802,8 +808,11 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem, else { /* - * The element is not in MCELEM. Punt, but assume that the - * selectivity cannot be more than minfreq / 2. + * The element is not in MCELEM. Estimate its frequency as half + * that of the least-frequent MCE. (We know it cannot be more + * than minfreq, and it could be a great deal less. Half seems + * like a good compromise.) For probably-historical reasons, + * clamp to not more than DEFAULT_CONTAIN_SEL. */ elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL, minfreq / 2); diff --git a/src/backend/utils/adt/array_typanalyze.c b/src/backend/utils/adt/array_typanalyze.c index 6f61629b977..560b27f3ca7 100644 --- a/src/backend/utils/adt/array_typanalyze.c +++ b/src/backend/utils/adt/array_typanalyze.c @@ -461,7 +461,7 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, /* * Construct an array of the interesting hashtable items, that is, * those meeting the cutoff frequency (s - epsilon)*N. Also identify - * the minimum and maximum frequencies among these items. + * the maximum frequency among these items. * * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff * frequency is 9*N / bucket_width. @@ -473,14 +473,12 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, hash_seq_init(&scan_status, elements_tab); track_len = 0; - minfreq = element_no; maxfreq = 0; while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL) { if (item->frequency > cutoff_freq) { sort_table[track_len++] = item; - minfreq = Min(minfreq, item->frequency); maxfreq = Max(maxfreq, item->frequency); } } @@ -497,19 +495,38 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, * If we obtained more elements than we really want, get rid of those * with least frequencies. The easiest way is to qsort the array into * descending frequency order and truncate the array. + * + * If we did not find more elements than we want, then it is safe to + * assume that the stored MCE array will contain every element with + * frequency above the cutoff. In that case, rather than storing the + * smallest frequency we are keeping, we want to store the minimum + * frequency that would have been accepted as a valid MCE. The + * selectivity functions can assume that that is an upper bound on the + * frequency of elements not present in the array. + * + * If we found no candidate MCEs at all, we still want to record the + * cutoff frequency, since it's still valid to assume that no element + * has frequency more than that. */ if (num_mcelem < track_len) { qsort_interruptible(sort_table, track_len, sizeof(TrackItem *), trackitem_compare_frequencies_desc, NULL); - /* reset minfreq to the smallest frequency we're keeping */ + /* set minfreq to the smallest frequency we're keeping */ minfreq = sort_table[num_mcelem - 1]->frequency; } else + { num_mcelem = track_len; + /* set minfreq to the minimum frequency above the cutoff */ + minfreq = cutoff_freq + 1; + /* ensure maxfreq is nonzero, too */ + if (track_len == 0) + maxfreq = minfreq; + } /* Generate MCELEM slot entry */ - if (num_mcelem > 0) + if (num_mcelem >= 0) { MemoryContext old_context; Datum *mcelem_values; diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h index 4216e27a8a4..444dc27dcad 100644 --- a/src/include/catalog/pg_statistic.h +++ b/src/include/catalog/pg_statistic.h @@ -240,6 +240,10 @@ DECLARE_FOREIGN_KEY((starelid, staattnum), pg_attribute, (attrelid, attnum)); * the fraction of non-null rows that contain at least one null element). If * this member is omitted, the column is presumed to contain no null elements. * + * Starting in v19, the first extra member can be smaller than the smallest + * frequency of any stored MCE, indicating that it's known that no element + * not present in the MCE array has frequency greater than that value. + * * Note: in current usage for tsvector columns, the stavalues elements are of * type text, even though their representation within tsvector is not * exactly text. |