summaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/tsrank.c
diff options
context:
space:
mode:
authorTeodor Sigaev <teodor@sigaev.ru>2016-04-07 18:44:18 +0300
committerTeodor Sigaev <teodor@sigaev.ru>2016-04-07 18:44:18 +0300
commitbb140506df605fab58f48926ee1db1f80bdafb59 (patch)
tree581f9aeb71e3596000af3b4904e0c62a372d77b3 /src/backend/utils/adt/tsrank.c
parent015e88942aa50f0d419ddac00e63bb06d6e62e86 (diff)
Phrase full text search.
Patch introduces new text search operator (<-> or <DISTANCE>) into tsquery. On-disk and binary in/out format of tsquery are backward compatible. It has two side effect: - change order for tsquery, so, users, who has a btree index over tsquery, should reindex it - less number of parenthesis in tsquery output, and tsquery becomes more readable Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov Reviewers: Alexander Korotkov, Artur Zakirov
Diffstat (limited to 'src/backend/utils/adt/tsrank.c')
-rw-r--r--src/backend/utils/adt/tsrank.c263
1 files changed, 185 insertions, 78 deletions
diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c
index 53f678a3bfb..ab47b763eeb 100644
--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@@ -364,8 +364,10 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
return 0.0;
/* XXX: What about NOT? */
- res = (item->type == QI_OPR && item->qoperator.oper == OP_AND) ?
- calc_rank_and(w, t, q) : calc_rank_or(w, t, q);
+ res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND ||
+ item->qoperator.oper == OP_PHRASE)) ?
+ calc_rank_and(w, t, q) :
+ calc_rank_or(w, t, q);
if (res < 0)
res = 1e-20f;
@@ -496,10 +498,17 @@ ts_rank_tt(PG_FUNCTION_ARGS)
typedef struct
{
- QueryItem **item;
- int16 nitem;
- uint8 wclass;
- int32 pos;
+ union {
+ struct { /* compiled doc representation */
+ QueryItem **items;
+ int16 nitem;
+ } query;
+ struct { /* struct is used for preparing doc representation */
+ QueryItem *item;
+ WordEntry *entry;
+ } map;
+ } data;
+ WordEntryPos pos;
} DocRepresentation;
static int
@@ -508,26 +517,59 @@ compareDocR(const void *va, const void *vb)
const DocRepresentation *a = (const DocRepresentation *) va;
const DocRepresentation *b = (const DocRepresentation *) vb;
- if (a->pos == b->pos)
- return 0;
- return (a->pos > b->pos) ? 1 : -1;
+ if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos))
+ {
+ if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
+ {
+ if (a->data.map.entry == b->data.map.entry)
+ return 0;
+
+ return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
+ }
+
+ return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
+ }
+
+ return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1;
}
+#define MAXQROPOS MAXENTRYPOS
+typedef struct
+{
+ bool operandexists;
+ bool reverseinsert; /* indicates insert order,
+ true means descending order */
+ uint32 npos;
+ WordEntryPos pos[MAXQROPOS];
+} QueryRepresentationOperand;
+
typedef struct
{
- TSQuery query;
- bool *operandexist;
+ TSQuery query;
+ QueryRepresentationOperand *operandData;
} QueryRepresentation;
-#define QR_GET_OPERAND_EXISTS(q, v) ( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] )
-#define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true
+#define QR_GET_OPERAND_DATA(q, v) \
+ ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) )
static bool
-checkcondition_QueryOperand(void *checkval, QueryOperand *val)
+checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data)
{
- QueryRepresentation *qr = (QueryRepresentation *) checkval;
+ QueryRepresentation *qr = (QueryRepresentation *) checkval;
+ QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val);
- return QR_GET_OPERAND_EXISTS(qr, val);
+ if (!opData->operandexists)
+ return false;
+
+ if (data)
+ {
+ data->npos = opData->npos;
+ data->pos = opData->pos;
+ if (opData->reverseinsert)
+ data->pos += MAXQROPOS - opData->npos;
+ }
+
+ return true;
}
typedef struct
@@ -539,14 +581,65 @@ typedef struct
DocRepresentation *end;
} CoverExt;
+static void
+resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert)
+{
+ int i;
+
+ for(i = 0; i < qr->query->size; i++)
+ {
+ qr->operandData[i].operandexists = false;
+ qr->operandData[i].reverseinsert = reverseinsert;
+ qr->operandData[i].npos = 0;
+ }
+}
+
+static void
+fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry)
+{
+ int i;
+ int lastPos;
+ QueryRepresentationOperand *opData;
+
+ for (i = 0; i < entry->data.query.nitem; i++)
+ {
+ if (entry->data.query.items[i]->type != QI_VAL)
+ continue;
+
+ opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]);
+
+ opData->operandexists = true;
+
+ if (opData->npos == 0)
+ {
+ lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0;
+ opData->pos[lastPos] = entry->pos;
+ opData->npos++;
+ continue;
+ }
+
+ lastPos = opData->reverseinsert ?
+ (MAXQROPOS - opData->npos) :
+ (opData->npos - 1);
+
+ if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos))
+ {
+ lastPos = opData->reverseinsert ?
+ (MAXQROPOS - 1 - opData->npos) :
+ (opData->npos);
+
+ opData->pos[lastPos] = entry->pos;
+ opData->npos++;
+ }
+ }
+}
static bool
Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
{
- DocRepresentation *ptr;
- int lastpos = ext->pos;
- int i;
- bool found = false;
+ DocRepresentation *ptr;
+ int lastpos = ext->pos;
+ bool found = false;
/*
* since this function recurses, it could be driven to stack overflow.
@@ -554,7 +647,7 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
*/
check_stack_depth();
- memset(qr->operandexist, 0, sizeof(bool) * qr->query->size);
+ resetQueryRepresentation(qr, false);
ext->p = INT_MAX;
ext->q = 0;
@@ -563,16 +656,13 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
/* find upper bound of cover from current position, move up */
while (ptr - doc < len)
{
- for (i = 0; i < ptr->nitem; i++)
- {
- if (ptr->item[i]->type == QI_VAL)
- QR_SET_OPERAND_EXISTS(qr, ptr->item[i]);
- }
+ fillQueryRepresentationData(qr, ptr);
+
if (TS_execute(GETQUERY(qr->query), (void *) qr, false, checkcondition_QueryOperand))
{
- if (ptr->pos > ext->q)
+ if (WEP_GETPOS(ptr->pos) > ext->q)
{
- ext->q = ptr->pos;
+ ext->q = WEP_GETPOS(ptr->pos);
ext->end = ptr;
lastpos = ptr - doc;
found = true;
@@ -585,22 +675,24 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
if (!found)
return false;
- memset(qr->operandexist, 0, sizeof(bool) * qr->query->size);
+ resetQueryRepresentation(qr, true);
ptr = doc + lastpos;
/* find lower bound of cover from found upper bound, move down */
while (ptr >= doc + ext->pos)
{
- for (i = 0; i < ptr->nitem; i++)
- if (ptr->item[i]->type == QI_VAL)
- QR_SET_OPERAND_EXISTS(qr, ptr->item[i]);
+ /*
+ * we scan doc from right to left, so pos info in reverse order!
+ */
+ fillQueryRepresentationData(qr, ptr);
+
if (TS_execute(GETQUERY(qr->query), (void *) qr, true, checkcondition_QueryOperand))
{
- if (ptr->pos < ext->p)
+ if (WEP_GETPOS(ptr->pos) < ext->p)
{
ext->begin = ptr;
- ext->p = ptr->pos;
+ ext->p = WEP_GETPOS(ptr->pos);
}
break;
}
@@ -628,18 +720,20 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
WordEntry *entry,
*firstentry;
WordEntryPos *post;
- int32 dimt,
+ int32 dimt, /* number of 'post' items */
j,
i,
nitem;
int len = qr->query->size * 4,
cur = 0;
DocRepresentation *doc;
- char *operand;
doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
- operand = GETOPERAND(qr->query);
+ /*
+ * Iterate through query to make DocRepresentaion for words and it's entries
+ * satisfied by query
+ */
for (i = 0; i < qr->query->size; i++)
{
QueryOperand *curoperand;
@@ -649,13 +743,11 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
curoperand = &item[i].qoperand;
- if (QR_GET_OPERAND_EXISTS(qr, &item[i]))
- continue;
-
firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
if (!entry)
continue;
+ /* iterations over entries in tsvector */
while (entry - firstentry < nitem)
{
if (entry->haspos)
@@ -676,53 +768,67 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
}
+ /* iterations over entry's positions */
for (j = 0; j < dimt; j++)
{
- if (j == 0)
- {
- int k;
-
- doc[cur].nitem = 0;
- doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size);
-
- for (k = 0; k < qr->query->size; k++)
- {
- QueryOperand *kptr = &item[k].qoperand;
- QueryOperand *iptr = &item[i].qoperand;
-
- if (k == i ||
- (item[k].type == QI_VAL &&
- compareQueryOperand(&kptr, &iptr, operand) == 0))
- {
- /*
- * if k == i, we've already checked above that
- * it's type == Q_VAL
- */
- doc[cur].item[doc[cur].nitem] = item + k;
- doc[cur].nitem++;
- QR_SET_OPERAND_EXISTS(qr, item + k);
- }
- }
- }
- else
+ if (curoperand->weight == 0 ||
+ curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
{
- doc[cur].nitem = doc[cur - 1].nitem;
- doc[cur].item = doc[cur - 1].item;
+ doc[cur].pos = post[j];
+ doc[cur].data.map.entry = entry;
+ doc[cur].data.map.item = (QueryItem *) curoperand;
+ cur++;
}
- doc[cur].pos = WEP_GETPOS(post[j]);
- doc[cur].wclass = WEP_GETWEIGHT(post[j]);
- cur++;
}
entry++;
}
}
- *doclen = cur;
-
if (cur > 0)
{
+ DocRepresentation *rptr = doc + 1,
+ *wptr = doc,
+ storage;
+
+ /*
+ * Sort representation in ascending order by pos and entry
+ */
qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
+
+ /*
+ * Join QueryItem per WordEntry and it's position
+ */
+ storage.pos = doc->pos;
+ storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
+ storage.data.query.items[0] = doc->data.map.item;
+ storage.data.query.nitem = 1;
+
+ while (rptr - doc < cur)
+ {
+ if (rptr->pos == (rptr-1)->pos &&
+ rptr->data.map.entry == (rptr-1)->data.map.entry)
+ {
+ storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
+ storage.data.query.nitem++;
+ }
+ else
+ {
+ *wptr = storage;
+ wptr++;
+ storage.pos = rptr->pos;
+ storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
+ storage.data.query.items[0] = rptr->data.map.item;
+ storage.data.query.nitem = 1;
+ }
+
+ rptr++;
+ }
+
+ *wptr = storage;
+ wptr++;
+
+ *doclen = wptr - doc;
return doc;
}
@@ -758,12 +864,13 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
}
qr.query = query;
- qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size);
+ qr.operandData = (QueryRepresentationOperand *)
+ palloc0(sizeof(QueryRepresentationOperand) * query->size);
doc = get_docrep(txt, &qr, &doclen);
if (!doc)
{
- pfree(qr.operandexist);
+ pfree(qr.operandData);
return 0.0;
}
@@ -777,7 +884,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
while (ptr <= ext.end)
{
- InvSum += invws[ptr->wclass];
+ InvSum += invws[WEP_GETWEIGHT(ptr->pos)];
ptr++;
}
@@ -827,7 +934,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
pfree(doc);
- pfree(qr.operandexist);
+ pfree(qr.operandData);
return (float4) Wdoc;
}