1 files changed, 50 insertions, 58 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index a74d4491553..7c1567a2cc3 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -139,11 +139,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
 						   FmgrInfo *opproc, bool isgt,
 						   Datum constval, Oid consttype);
 static double eqjoinsel_inner(Oid operator,
-				VariableStatData *vardata1, VariableStatData *vardata2,
-				RelOptInfo *rel1, RelOptInfo *rel2);
+				VariableStatData *vardata1, VariableStatData *vardata2);
 static double eqjoinsel_semi(Oid operator,
 			   VariableStatData *vardata1, VariableStatData *vardata2,
-			   RelOptInfo *rel1, RelOptInfo *rel2);
+			   RelOptInfo *inner_rel);
 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 				  Datum lobound, Datum hibound, Oid boundstypid,
 				  double *scaledlobound, double *scaledhibound);
@@ -1993,47 +1992,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	VariableStatData vardata1;
 	VariableStatData vardata2;
 	bool		join_is_reversed;
-	RelOptInfo *rel1;
-	RelOptInfo *rel2;
+	RelOptInfo *inner_rel;
 
 	get_join_variables(root, args, sjinfo,
 					   &vardata1, &vardata2, &join_is_reversed);
 
-	/*
-	 * Identify the join's direct input relations.  We use the min lefthand
-	 * and min righthand as the inputs, even though the join might actually
-	 * get done with larger input relations.  The min inputs are guaranteed to
-	 * have been formed by now, though, and always using them ensures
-	 * consistency of estimates.
-	 */
-	if (!join_is_reversed)
-	{
-		rel1 = find_join_input_rel(root, sjinfo->min_lefthand);
-		rel2 = find_join_input_rel(root, sjinfo->min_righthand);
-	}
-	else
-	{
-		rel1 = find_join_input_rel(root, sjinfo->min_righthand);
-		rel2 = find_join_input_rel(root, sjinfo->min_lefthand);
-	}
-
 	switch (sjinfo->jointype)
 	{
 		case JOIN_INNER:
 		case JOIN_LEFT:
 		case JOIN_FULL:
-			selec = eqjoinsel_inner(operator, &vardata1, &vardata2,
-									rel1, rel2);
+			selec = eqjoinsel_inner(operator, &vardata1, &vardata2);
 			break;
 		case JOIN_SEMI:
 		case JOIN_ANTI:
+			/*
+			 * Look up the join's inner relation.  min_righthand is sufficient
+			 * information because neither SEMI nor ANTI joins permit any
+			 * reassociation into or out of their RHS, so the righthand will
+			 * always be exactly that set of rels.
+			 */
+			inner_rel = find_join_input_rel(root, sjinfo->min_righthand);
+
 			if (!join_is_reversed)
 				selec = eqjoinsel_semi(operator, &vardata1, &vardata2,
-									   rel1, rel2);
+									   inner_rel);
 			else
 				selec = eqjoinsel_semi(get_commutator(operator),
 									   &vardata2, &vardata1,
-									   rel2, rel1);
+									   inner_rel);
 			break;
 		default:
 			/* other values not expected here */
@@ -2059,8 +2046,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
  */
 static double
 eqjoinsel_inner(Oid operator,
-				VariableStatData *vardata1, VariableStatData *vardata2,
-				RelOptInfo *rel1, RelOptInfo *rel2)
+				VariableStatData *vardata1, VariableStatData *vardata2)
 {
 	double		selec;
 	double		nd1;
@@ -2254,26 +2240,10 @@ eqjoinsel_inner(Oid operator,
 		 * XXX Can we be smarter if we have an MCV list for just one side? It
 		 * seems that if we assume equal distribution for the other side, we
 		 * end up with the same answer anyway.
-		 *
-		 * An additional hack we use here is to clamp the nd1 and nd2 values
-		 * to not more than what we are estimating the input relation sizes to
-		 * be, providing a crude correction for the selectivity of restriction
-		 * clauses on those relations.	(We don't do that in the other path
-		 * since there we are comparing the nd values to stats for the whole
-		 * relations.)  We can apply this clamp both with respect to the base
-		 * relations from which the join variables come, and to the immediate
-		 * input relations of the current join.
 		 */
 		double		nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
 		double		nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
 
-		if (vardata1->rel)
-			nd1 = Min(nd1, vardata1->rel->rows);
-		nd1 = Min(nd1, rel1->rows);
-		if (vardata2->rel)
-			nd2 = Min(nd2, vardata2->rel->rows);
-		nd2 = Min(nd2, rel2->rows);
-
 		selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
 		if (nd1 > nd2)
 			selec /= nd1;
@@ -2300,7 +2270,7 @@ eqjoinsel_inner(Oid operator,
 static double
 eqjoinsel_semi(Oid operator,
 			   VariableStatData *vardata1, VariableStatData *vardata2,
-			   RelOptInfo *rel1, RelOptInfo *rel2)
+			   RelOptInfo *inner_rel)
 {
 	double		selec;
 	double		nd1;
@@ -2321,6 +2291,25 @@ eqjoinsel_semi(Oid operator,
 	nd1 = get_variable_numdistinct(vardata1);
 	nd2 = get_variable_numdistinct(vardata2);
 
+	/*
+	 * We clamp nd2 to be not more than what we estimate the inner relation's
+	 * size to be.  This is intuitively somewhat reasonable since obviously
+	 * there can't be more than that many distinct values coming from the
+	 * inner rel.  The reason for the asymmetry (ie, that we don't clamp nd1
+	 * likewise) is that this is the only pathway by which restriction clauses
+	 * applied to the inner rel will affect the join result size estimate,
+	 * since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
+	 * only the outer rel's size.  If we clamped nd1 we'd be double-counting
+	 * the selectivity of outer-rel restrictions.
+	 *
+	 * We can apply this clamping both with respect to the base relation from
+	 * which the join variable comes (if there is just one), and to the
+	 * immediate inner input relation of the current join.
+	 */
+	if (vardata2->rel)
+		nd2 = Min(nd2, vardata2->rel->rows);
+	nd2 = Min(nd2, inner_rel->rows);
+
 	if (HeapTupleIsValid(vardata1->statsTuple))
 	{
 		stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
@@ -2365,11 +2354,21 @@ eqjoinsel_semi(Oid operator,
 					uncertainfrac,
 					uncertain;
 		int			i,
-					nmatches;
+					nmatches,
+					clamped_nvalues2;
+
+		/*
+		 * The clamping above could have resulted in nd2 being less than
+		 * nvalues2; in which case, we assume that precisely the nd2 most
+		 * common values in the relation will appear in the join input, and so
+		 * compare to only the first nd2 members of the MCV list.  Of course
+		 * this is frequently wrong, but it's the best bet we can make.
+		 */
+		clamped_nvalues2 = Min(nvalues2, nd2);
 
 		fmgr_info(get_opcode(operator), &eqproc);
 		hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
-		hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
+		hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool));
 
 		/*
 		 * Note we assume that each MCV will match at most one member of the
@@ -2382,7 +2381,7 @@ eqjoinsel_semi(Oid operator,
 		{
 			int			j;
 
-			for (j = 0; j < nvalues2; j++)
+			for (j = 0; j < clamped_nvalues2; j++)
 			{
 				if (hasmatch2[j])
 					continue;
@@ -2426,7 +2425,7 @@ eqjoinsel_semi(Oid operator,
 		{
 			nd1 -= nmatches;
 			nd2 -= nmatches;
-			if (nd1 <= nd2 || nd2 <= 0)
+			if (nd1 <= nd2 || nd2 < 0)
 				uncertainfrac = 1.0;
 			else
 				uncertainfrac = nd2 / nd1;
@@ -2447,14 +2446,7 @@ eqjoinsel_semi(Oid operator,
 
 		if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT)
 		{
-			if (vardata1->rel)
-				nd1 = Min(nd1, vardata1->rel->rows);
-			nd1 = Min(nd1, rel1->rows);
-			if (vardata2->rel)
-				nd2 = Min(nd2, vardata2->rel->rows);
-			nd2 = Min(nd2, rel2->rows);
-
-			if (nd1 <= nd2 || nd2 <= 0)
+			if (nd1 <= nd2 || nd2 < 0)
 				selec = 1.0 - nullfrac1;
 			else
 				selec = (nd2 / nd1) * (1.0 - nullfrac1);