Improve GiST range-contained-by searches by adding a flag for empty ranges.

In the original implementation, a range-contained-by search had to scan the entire index because an empty range could be lurking anywhere. Improve that by adding a flag to upper GiST entries that says whether the represented subtree contains any empty ranges. Also, make a simple mod to the penalty function to discourage empty ranges from getting pushed into subtrees without any. This needs more work, and the picksplit function should be taught about it too, but that code can be improved without causing an on-disk compatibility break; so we'll leave it for another day. Since we're breaking on-disk compatibility of range values anyway, I took the opportunity to reorganize the range flags bits; the unused RANGE_xB_NULL bits are now adjacent, which might open the door for using them in some other way later. In passing, remove the GiST range opclass entry for <>, which doesn't seem like it can really be indexed usefully. Alexander Korotkov, with some editorializing by Tom
author: Tom Lane <tgl@sss.pgh.pa.us> 2011-11-27 16:50:37 -0500
committer: Tom Lane <tgl@sss.pgh.pa.us> 2011-11-27 16:51:29 -0500
commit: c66e4f138b04d749a713ad075e16f3d60975f5ad (patch)
tree: 6e784a8191049cb7583c3f2c9b3649bc2c8624a0 /src/backend/utils/adt
parent: 08da2d282f1c3cbff141ecd218d737990cf6d234 (diff)
2 files changed, 103 insertions, 44 deletions
diff --git a/src/backend/utils/adt/rangetypes.c b/src/backend/utils/adt/rangetypes.c
index d0f890c28aa..2b9e7b320ab 100644
--- a/src/backend/utils/adt/rangetypes.c
+++ b/src/backend/utils/adt/rangetypes.c
@@ -1623,6 +1623,24 @@ range_get_flags(RangeType *range)
 }
 
 /*
+ * range_set_contain_empty: set the RANGE_CONTAIN_EMPTY bit in the value.
+ *
+ * This is only needed in GiST operations, so we don't include a provision
+ * for setting it in range_serialize; rather, this function must be applied
+ * afterwards.
+ */
+void
+range_set_contain_empty(RangeType *range)
+{
+	char	   *flagsp;
+
+	/* flag byte is datum's last byte */
+	flagsp = (char *) range + VARSIZE(range) - 1;
+
+	*flagsp |= RANGE_CONTAIN_EMPTY;
+}
+
+/*
  * This both serializes and canonicalizes (if applicable) the range.
  * This should be used by most callers.
  */
diff --git a/src/backend/utils/adt/rangetypes_gist.c b/src/backend/utils/adt/rangetypes_gist.c
index be59a5c4a38..b6839588289 100644
--- a/src/backend/utils/adt/rangetypes_gist.c
+++ b/src/backend/utils/adt/rangetypes_gist.c
@@ -17,6 +17,7 @@
 #include "access/gist.h"
 #include "access/skey.h"
 #include "utils/builtins.h"
+#include "utils/datum.h"
 #include "utils/rangetypes.h"
 
 
@@ -32,7 +33,11 @@
 #define RANGESTRAT_CONTAINED_BY			8
 #define RANGESTRAT_CONTAINS_ELEM		16
 #define RANGESTRAT_EQ					18
-#define RANGESTRAT_NE					19
+
+/* Copy a RangeType datum (hardwires typbyval and typlen for ranges...) */
+#define rangeCopy(r) \
+	((RangeType *) DatumGetPointer(datumCopy(PointerGetDatum(r), \
+											 false, -1)))
 
 /*
  * Auxiliary structure for picksplit method.
@@ -146,6 +151,16 @@ range_gist_penalty(PG_FUNCTION_ARGS)
 	subtype_diff = &typcache->rng_subdiff_finfo;
 
 	/*
+	 * If new is or contains empty, and orig doesn't, apply infinite penalty.
+	 * We really don't want to pollute an empty-free subtree with empties.
+	 */
+	if (RangeIsOrContainsEmpty(new) && !RangeIsOrContainsEmpty(orig))
+	{
+		*penalty = get_float4_infinity();
+		PG_RETURN_POINTER(penalty);
+	}
+
+	/*
 	 * We want to compare the size of "orig" to size of "orig union new".
 	 * The penalty will be the sum of the reduction in the lower bound plus
 	 * the increase in the upper bound.
@@ -163,30 +178,9 @@ range_gist_penalty(PG_FUNCTION_ARGS)
 	}
 	else if (empty1)
 	{
-		if (lower2.infinite || upper2.infinite)
-		{
-			/* from empty to infinite */
-			*penalty = get_float4_infinity();
-			PG_RETURN_POINTER(penalty);
-		}
-		else if (OidIsValid(subtype_diff->fn_oid))
-		{
-			/* from empty to upper2-lower2 */
-			*penalty = DatumGetFloat8(FunctionCall2Coll(subtype_diff,
-														typcache->rng_collation,
-														upper2.val,
-														lower2.val));
-			/* upper2 must be >= lower2 */
-			if (*penalty < 0)
-				*penalty = 0;		/* subtype_diff is broken */
-			PG_RETURN_POINTER(penalty);
-		}
-		else
-		{
-			/* wild guess */
-			*penalty = 1.0;
-			PG_RETURN_POINTER(penalty);
-		}
+		/* infinite penalty for pushing non-empty into all-empty subtree */
+		*penalty = get_float4_infinity();
+		PG_RETURN_POINTER(penalty);
 	}
 
 	/* if orig isn't empty, s_union can't be either */
@@ -334,15 +328,27 @@ range_gist_picksplit(PG_FUNCTION_ARGS)
 Datum
 range_gist_same(PG_FUNCTION_ARGS)
 {
-	/* Datum r1 = PG_GETARG_DATUM(0); */
-	/* Datum r2 = PG_GETARG_DATUM(1); */
+	RangeType  *r1 = PG_GETARG_RANGE(0);
+	RangeType  *r2 = PG_GETARG_RANGE(1);
 	bool	   *result = (bool *) PG_GETARG_POINTER(2);
 
 	/*
-	 * We can safely call range_eq using our fcinfo directly; it won't notice
-	 * the third argument.  This allows it to use fn_extra for caching.
+	 * range_eq will ignore the RANGE_CONTAIN_EMPTY flag, so we have to
+	 * check that for ourselves.  More generally, if the entries have been
+	 * properly normalized, then unequal flags bytes must mean unequal ranges
+	 * ... so let's just test all the flag bits at once.
 	 */
-	*result = DatumGetBool(range_eq(fcinfo));
+	if (range_get_flags(r1) != range_get_flags(r2))
+		*result = false;
+	else
+	{
+		/*
+		 * We can safely call range_eq using our fcinfo directly; it won't
+		 * notice the third argument.  This allows it to use fn_extra for
+		 * caching.
+		 */
+		*result = DatumGetBool(range_eq(fcinfo));
+	}
 
 	PG_RETURN_POINTER(result);
 }
@@ -356,27 +362,53 @@ range_gist_same(PG_FUNCTION_ARGS)
 /*
  * Return the smallest range that contains r1 and r2
  *
- * XXX would it be better to redefine range_union as working this way?
+ * This differs from regular range_union in two critical ways:
+ * 1. It won't throw an error for non-adjacent r1 and r2, but just absorb
+ * the intervening values into the result range.
+ * 2. We track whether any empty range has been union'd into the result,
+ * so that contained_by searches can be indexed.  Note that this means
+ * that *all* unions formed within the GiST index must go through here.
  */
 static RangeType *
 range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2)
 {
+	RangeType  *result;
 	RangeBound	lower1,
 				lower2;
 	RangeBound	upper1,
 				upper2;
 	bool		empty1,
 				empty2;
+	char		flags1,
+				flags2;
 	RangeBound *result_lower;
 	RangeBound *result_upper;
 
 	range_deserialize(typcache, r1, &lower1, &upper1, &empty1);
 	range_deserialize(typcache, r2, &lower2, &upper2, &empty2);
+	flags1 = range_get_flags(r1);
+	flags2 = range_get_flags(r2);
 
 	if (empty1)
+	{
+		/* We can return r2 as-is if it already is or contains empty */
+		if (flags2 & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY))
+			return r2;
+		/* Else we'd better copy it (modify-in-place isn't safe) */
+		r2 = rangeCopy(r2);
+		range_set_contain_empty(r2);
 		return r2;
+	}
 	if (empty2)
+	{
+		/* We can return r1 as-is if it already is or contains empty */
+		if (flags1 & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY))
+			return r1;
+		/* Else we'd better copy it (modify-in-place isn't safe) */
+		r1 = rangeCopy(r1);
+		range_set_contain_empty(r1);
 		return r1;
+	}
 
 	if (range_cmp_bounds(typcache, &lower1, &lower2) <= 0)
 		result_lower = &lower1;
@@ -389,12 +421,19 @@ range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2)
 		result_upper = &upper2;
 
 	/* optimization to avoid constructing a new range */
-	if (result_lower == &lower1 && result_upper == &upper1)
+	if (result_lower == &lower1 && result_upper == &upper1 &&
+		((flags1 & RANGE_CONTAIN_EMPTY) || !(flags2 & RANGE_CONTAIN_EMPTY)))
 		return r1;
-	if (result_lower == &lower2 && result_upper == &upper2)
+	if (result_lower == &lower2 && result_upper == &upper2 &&
+		((flags2 & RANGE_CONTAIN_EMPTY) || !(flags1 & RANGE_CONTAIN_EMPTY)))
 		return r2;
 
-	return make_range(typcache, result_lower, result_upper, false);
+	result = make_range(typcache, result_lower, result_upper, false);
+
+	if ((flags1 & RANGE_CONTAIN_EMPTY) || (flags2 & RANGE_CONTAIN_EMPTY))
+		range_set_contain_empty(result);
+
+	return result;
 }
 
 /*
@@ -484,21 +523,26 @@ range_gist_consistent_int(FmgrInfo *flinfo, StrategyNumber strategy,
 			break;
 		case RANGESTRAT_CONTAINED_BY:
 			/*
-			 * Ideally we'd apply range_overlaps here, but at present it
-			 * might fail to find empty ranges in the index, which should
-			 * be reported as being contained by anything.  This needs work.
+			 * Empty ranges are contained by anything, so if key is or
+			 * contains any empty ranges, we must descend into it.  Otherwise,
+			 * descend only if key overlaps the query.
 			 */
-			return true;
+			if (RangeIsOrContainsEmpty(key))
+				return true;
+			proc = range_overlaps;
 			break;
 		case RANGESTRAT_CONTAINS_ELEM:
 			proc = range_contains_elem;
 			break;
 		case RANGESTRAT_EQ:
+			/*
+			 * If query is empty, descend only if the key is or contains any
+			 * empty ranges.  Otherwise, descend if key contains query.
+			 */
+			if (RangeIsEmpty(DatumGetRangeType(query)))
+				return RangeIsOrContainsEmpty(key);
 			proc = range_contains;
 			break;
-		case RANGESTRAT_NE:
-			return true;
-			break;
 		default:
 			elog(ERROR, "unrecognized range strategy: %d", strategy);
 			proc = NULL;		/* keep compiler quiet */
@@ -555,9 +599,6 @@ range_gist_consistent_leaf(FmgrInfo *flinfo, StrategyNumber strategy,
 		case RANGESTRAT_EQ:
 			proc = range_eq;
 			break;
-		case RANGESTRAT_NE:
-			proc = range_ne;
-			break;
 		default:
 			elog(ERROR, "unrecognized range strategy: %d", strategy);
 			proc = NULL;		/* keep compiler quiet */
author	Tom Lane <tgl@sss.pgh.pa.us>	2011-11-27 16:50:37 -0500
committer	Tom Lane <tgl@sss.pgh.pa.us>	2011-11-27 16:51:29 -0500
commit	c66e4f138b04d749a713ad075e16f3d60975f5ad (patch)
tree	6e784a8191049cb7583c3f2c9b3649bc2c8624a0 /src/backend/utils/adt
parent	08da2d282f1c3cbff141ecd218d737990cf6d234 (diff)