diff options
author | Teodor Sigaev <teodor@sigaev.ru> | 2016-04-07 18:44:18 +0300 |
---|---|---|
committer | Teodor Sigaev <teodor@sigaev.ru> | 2016-04-07 18:44:18 +0300 |
commit | bb140506df605fab58f48926ee1db1f80bdafb59 (patch) | |
tree | 581f9aeb71e3596000af3b4904e0c62a372d77b3 /src/backend/utils/adt/tsrank.c | |
parent | 015e88942aa50f0d419ddac00e63bb06d6e62e86 (diff) |
Phrase full text search.
Patch introduces new text search operator (<-> or <DISTANCE>) into tsquery.
On-disk and binary in/out format of tsquery are backward compatible.
It has two side effect:
- change order for tsquery, so, users, who has a btree index over tsquery,
should reindex it
- less number of parenthesis in tsquery output, and tsquery becomes more
readable
Authors: Teodor Sigaev, Oleg Bartunov, Dmitry Ivanov
Reviewers: Alexander Korotkov, Artur Zakirov
Diffstat (limited to 'src/backend/utils/adt/tsrank.c')
-rw-r--r-- | src/backend/utils/adt/tsrank.c | 263 |
1 files changed, 185 insertions, 78 deletions
diff --git a/src/backend/utils/adt/tsrank.c b/src/backend/utils/adt/tsrank.c index 53f678a3bfb..ab47b763eeb 100644 --- a/src/backend/utils/adt/tsrank.c +++ b/src/backend/utils/adt/tsrank.c @@ -364,8 +364,10 @@ calc_rank(const float *w, TSVector t, TSQuery q, int32 method) return 0.0; /* XXX: What about NOT? */ - res = (item->type == QI_OPR && item->qoperator.oper == OP_AND) ? - calc_rank_and(w, t, q) : calc_rank_or(w, t, q); + res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND || + item->qoperator.oper == OP_PHRASE)) ? + calc_rank_and(w, t, q) : + calc_rank_or(w, t, q); if (res < 0) res = 1e-20f; @@ -496,10 +498,17 @@ ts_rank_tt(PG_FUNCTION_ARGS) typedef struct { - QueryItem **item; - int16 nitem; - uint8 wclass; - int32 pos; + union { + struct { /* compiled doc representation */ + QueryItem **items; + int16 nitem; + } query; + struct { /* struct is used for preparing doc representation */ + QueryItem *item; + WordEntry *entry; + } map; + } data; + WordEntryPos pos; } DocRepresentation; static int @@ -508,26 +517,59 @@ compareDocR(const void *va, const void *vb) const DocRepresentation *a = (const DocRepresentation *) va; const DocRepresentation *b = (const DocRepresentation *) vb; - if (a->pos == b->pos) - return 0; - return (a->pos > b->pos) ? 1 : -1; + if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos)) + { + if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos)) + { + if (a->data.map.entry == b->data.map.entry) + return 0; + + return (a->data.map.entry > b->data.map.entry) ? 1 : -1; + } + + return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1; + } + + return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1; } +#define MAXQROPOS MAXENTRYPOS +typedef struct +{ + bool operandexists; + bool reverseinsert; /* indicates insert order, + true means descending order */ + uint32 npos; + WordEntryPos pos[MAXQROPOS]; +} QueryRepresentationOperand; + typedef struct { - TSQuery query; - bool *operandexist; + TSQuery query; + QueryRepresentationOperand *operandData; } QueryRepresentation; -#define QR_GET_OPERAND_EXISTS(q, v) ( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] ) -#define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true +#define QR_GET_OPERAND_DATA(q, v) \ + ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) ) static bool -checkcondition_QueryOperand(void *checkval, QueryOperand *val) +checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data) { - QueryRepresentation *qr = (QueryRepresentation *) checkval; + QueryRepresentation *qr = (QueryRepresentation *) checkval; + QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val); - return QR_GET_OPERAND_EXISTS(qr, val); + if (!opData->operandexists) + return false; + + if (data) + { + data->npos = opData->npos; + data->pos = opData->pos; + if (opData->reverseinsert) + data->pos += MAXQROPOS - opData->npos; + } + + return true; } typedef struct @@ -539,14 +581,65 @@ typedef struct DocRepresentation *end; } CoverExt; +static void +resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert) +{ + int i; + + for(i = 0; i < qr->query->size; i++) + { + qr->operandData[i].operandexists = false; + qr->operandData[i].reverseinsert = reverseinsert; + qr->operandData[i].npos = 0; + } +} + +static void +fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry) +{ + int i; + int lastPos; + QueryRepresentationOperand *opData; + + for (i = 0; i < entry->data.query.nitem; i++) + { + if (entry->data.query.items[i]->type != QI_VAL) + continue; + + opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]); + + opData->operandexists = true; + + if (opData->npos == 0) + { + lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0; + opData->pos[lastPos] = entry->pos; + opData->npos++; + continue; + } + + lastPos = opData->reverseinsert ? + (MAXQROPOS - opData->npos) : + (opData->npos - 1); + + if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos)) + { + lastPos = opData->reverseinsert ? + (MAXQROPOS - 1 - opData->npos) : + (opData->npos); + + opData->pos[lastPos] = entry->pos; + opData->npos++; + } + } +} static bool Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) { - DocRepresentation *ptr; - int lastpos = ext->pos; - int i; - bool found = false; + DocRepresentation *ptr; + int lastpos = ext->pos; + bool found = false; /* * since this function recurses, it could be driven to stack overflow. @@ -554,7 +647,7 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) */ check_stack_depth(); - memset(qr->operandexist, 0, sizeof(bool) * qr->query->size); + resetQueryRepresentation(qr, false); ext->p = INT_MAX; ext->q = 0; @@ -563,16 +656,13 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) /* find upper bound of cover from current position, move up */ while (ptr - doc < len) { - for (i = 0; i < ptr->nitem; i++) - { - if (ptr->item[i]->type == QI_VAL) - QR_SET_OPERAND_EXISTS(qr, ptr->item[i]); - } + fillQueryRepresentationData(qr, ptr); + if (TS_execute(GETQUERY(qr->query), (void *) qr, false, checkcondition_QueryOperand)) { - if (ptr->pos > ext->q) + if (WEP_GETPOS(ptr->pos) > ext->q) { - ext->q = ptr->pos; + ext->q = WEP_GETPOS(ptr->pos); ext->end = ptr; lastpos = ptr - doc; found = true; @@ -585,22 +675,24 @@ Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) if (!found) return false; - memset(qr->operandexist, 0, sizeof(bool) * qr->query->size); + resetQueryRepresentation(qr, true); ptr = doc + lastpos; /* find lower bound of cover from found upper bound, move down */ while (ptr >= doc + ext->pos) { - for (i = 0; i < ptr->nitem; i++) - if (ptr->item[i]->type == QI_VAL) - QR_SET_OPERAND_EXISTS(qr, ptr->item[i]); + /* + * we scan doc from right to left, so pos info in reverse order! + */ + fillQueryRepresentationData(qr, ptr); + if (TS_execute(GETQUERY(qr->query), (void *) qr, true, checkcondition_QueryOperand)) { - if (ptr->pos < ext->p) + if (WEP_GETPOS(ptr->pos) < ext->p) { ext->begin = ptr; - ext->p = ptr->pos; + ext->p = WEP_GETPOS(ptr->pos); } break; } @@ -628,18 +720,20 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) WordEntry *entry, *firstentry; WordEntryPos *post; - int32 dimt, + int32 dimt, /* number of 'post' items */ j, i, nitem; int len = qr->query->size * 4, cur = 0; DocRepresentation *doc; - char *operand; doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); - operand = GETOPERAND(qr->query); + /* + * Iterate through query to make DocRepresentaion for words and it's entries + * satisfied by query + */ for (i = 0; i < qr->query->size; i++) { QueryOperand *curoperand; @@ -649,13 +743,11 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) curoperand = &item[i].qoperand; - if (QR_GET_OPERAND_EXISTS(qr, &item[i])) - continue; - firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem); if (!entry) continue; + /* iterations over entries in tsvector */ while (entry - firstentry < nitem) { if (entry->haspos) @@ -676,53 +768,67 @@ get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len); } + /* iterations over entry's positions */ for (j = 0; j < dimt; j++) { - if (j == 0) - { - int k; - - doc[cur].nitem = 0; - doc[cur].item = (QueryItem **) palloc(sizeof(QueryItem *) * qr->query->size); - - for (k = 0; k < qr->query->size; k++) - { - QueryOperand *kptr = &item[k].qoperand; - QueryOperand *iptr = &item[i].qoperand; - - if (k == i || - (item[k].type == QI_VAL && - compareQueryOperand(&kptr, &iptr, operand) == 0)) - { - /* - * if k == i, we've already checked above that - * it's type == Q_VAL - */ - doc[cur].item[doc[cur].nitem] = item + k; - doc[cur].nitem++; - QR_SET_OPERAND_EXISTS(qr, item + k); - } - } - } - else + if (curoperand->weight == 0 || + curoperand->weight & (1 << WEP_GETWEIGHT(post[j]))) { - doc[cur].nitem = doc[cur - 1].nitem; - doc[cur].item = doc[cur - 1].item; + doc[cur].pos = post[j]; + doc[cur].data.map.entry = entry; + doc[cur].data.map.item = (QueryItem *) curoperand; + cur++; } - doc[cur].pos = WEP_GETPOS(post[j]); - doc[cur].wclass = WEP_GETWEIGHT(post[j]); - cur++; } entry++; } } - *doclen = cur; - if (cur > 0) { + DocRepresentation *rptr = doc + 1, + *wptr = doc, + storage; + + /* + * Sort representation in ascending order by pos and entry + */ qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR); + + /* + * Join QueryItem per WordEntry and it's position + */ + storage.pos = doc->pos; + storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); + storage.data.query.items[0] = doc->data.map.item; + storage.data.query.nitem = 1; + + while (rptr - doc < cur) + { + if (rptr->pos == (rptr-1)->pos && + rptr->data.map.entry == (rptr-1)->data.map.entry) + { + storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item; + storage.data.query.nitem++; + } + else + { + *wptr = storage; + wptr++; + storage.pos = rptr->pos; + storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); + storage.data.query.items[0] = rptr->data.map.item; + storage.data.query.nitem = 1; + } + + rptr++; + } + + *wptr = storage; + wptr++; + + *doclen = wptr - doc; return doc; } @@ -758,12 +864,13 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) } qr.query = query; - qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size); + qr.operandData = (QueryRepresentationOperand *) + palloc0(sizeof(QueryRepresentationOperand) * query->size); doc = get_docrep(txt, &qr, &doclen); if (!doc) { - pfree(qr.operandexist); + pfree(qr.operandData); return 0.0; } @@ -777,7 +884,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) while (ptr <= ext.end) { - InvSum += invws[ptr->wclass]; + InvSum += invws[WEP_GETWEIGHT(ptr->pos)]; ptr++; } @@ -827,7 +934,7 @@ calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) pfree(doc); - pfree(qr.operandexist); + pfree(qr.operandData); return (float4) Wdoc; } |