summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Tom Lane <tgl@sss.pgh.pa.us> 2011-02-17 19:00:54 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2011-02-17 19:00:54 -0500
commit: 848cd3289e4d08f9a3c78f654ceb6e3f754e1dd3 (patch)
tree: f2d2a2420fde8b6d8a4f8259b0fc7f7df749a680 /src
parent: 7422e0081d04ee4373a822392c729eb892a9d25e (diff)
Fix tsmatchsel() to account properly for null rows.
ts_typanalyze.c computes MCE (most-common-element) statistics as fractions of the non-null rows, which seems fairly reasonable, and anyway changing it in released versions wouldn't be a good idea. But then ts_selfuncs.c has to account for that. Failure to do so results in overestimates in columns with a significant fraction of null documents.

Back-patch to 8.4 where this stuff was introduced.

Jesper Krogh
Diffstat (limited to 'src')
-rw-r--r-- src/backend/tsearch/ts_selfuncs.c 6
-rw-r--r-- src/include/catalog/pg_statistic.h 2
2 files changed, 8 insertions, 0 deletions
diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
index e7194ce66e2..b679b7544a3 100644
--- a/src/backend/tsearch/ts_selfuncs.c
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -189,11 +189,17 @@ tsquerysel(VariableStatData *vardata, Datum constval)
/* No most-common-elements info, so do without */
selec = tsquery_opr_selec_no_stats(query);
}
+
+ /*
+ * MCE stats count only non-null rows, so adjust for null rows.
+ */
+ selec *= (1.0 - stats->stanullfrac);
}
else
{
/* No stats at all, so do without */
selec = tsquery_opr_selec_no_stats(query);
+ /* we assume no nulls here, so no stanullfrac correction */
}
return selec;
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h
index 797774339d1..f5965b56ec6 100644
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -246,6 +246,8 @@ typedef FormData_pg_statistic *Form_pg_statistic;
* type with identifiable elements (for instance, tsvector). staop contains
* the equality operator appropriate to the element type. stavalues contains
* the most common element values, and stanumbers their frequencies. Unlike
+ * MCV slots, frequencies are measured as the fraction of non-null rows the
+ * element value appears in, not the frequency of all rows. Also unlike
* MCV slots, the values are sorted into order (to support binary search
* for a particular value). Since this puts the minimum and maximum
* frequencies at unpredictable spots in stanumbers, there are two extra