From c52795d18a698d25b9cd7cd1ca9318a42b08fdb9 Mon Sep 17 00:00:00 2001
From: Teodor Sigaev
Date: Mon, 21 Nov 2005 12:27:57 +0000
Subject: Text parser rewritten:
 - supports multibyte encodings
 - more strict rules for lexemes
 - flex isn't used

 Add:
 - tsquery plainto_tsquery(text)
   Function makes tsquery from plain text.
 - &&, ||, !! operation for tsquery for combining
   tsquery from it's parts: 'foo & bar' || 'asd' => 'foo & bar | asd'
---
 contrib/tsearch2/wordparser/parser.c | 1028 ++++++++++++++++++++++++++++++++++
 1 file changed, 1028 insertions(+)
 create mode 100644 contrib/tsearch2/wordparser/parser.c

(limited to 'contrib/tsearch2/wordparser/parser.c')

diff --git a/contrib/tsearch2/wordparser/parser.c b/contrib/tsearch2/wordparser/parser.c
new file mode 100644
index 00000000000..e414a865ffd
--- /dev/null
+++ b/contrib/tsearch2/wordparser/parser.c
@@ -0,0 +1,1028 @@
+#include "postgres.h"
+
+#include "utils/builtins.h"
+#include "utils/pg_locale.h"
+#include "mb/pg_wchar.h"
+
+#include "deflex.h"
+#include "parser.h"
+#include "ts_locale.h"
+
+
+/*
+ * Allocate a new position record.
+ *
+ * When prev is given the new record starts out as a byte-wise copy of it,
+ * otherwise it is zero-filled.  In both cases the record is then linked to
+ * prev and its pushedAtAction is reset to NULL.
+ */
+static TParserPosition*
+newTParserPosition(TParserPosition *prev) {
+	TParserPosition *res = (TParserPosition*)palloc(sizeof(TParserPosition));
+
+	if ( prev )
+		memcpy(res, prev, sizeof(TParserPosition));
+	else
+		memset(res, 0, sizeof(TParserPosition));
+
+	res->prev = prev;
+
+	res->pushedAtAction = NULL;
+
+	return res;
+}
+
+/*
+ * Create a parser over the buffer str of len bytes.
+ *
+ * The buffer is referenced, not copied.  The returned parser is palloc'ed,
+ * starts in state TPS_Base and has to be released with TParserClose().
+ */
+TParser*
+TParserInit( char *str, int len ) {
+	TParser *prs = (TParser*)palloc0( sizeof(TParser) );
+
+	prs->charmaxlen = pg_database_encoding_max_length();
+	prs->str = str;
+	prs->lenstr = len;
+
+#ifdef TS_USE_WIDE
+	/*
+	 * Use wide char code only when max encoding length > 1 and ctype != C.
+	 * Some operating systems fail with multi-byte encodings and a C locale.
+	 * Also, for a C locale there is no need to process as multibyte.
+	 * From backend/utils/adt/oracle_compat.c Teodor
+	 */
+
+	if ( prs->charmaxlen > 1 && !lc_ctype_is_c() ) {
+		prs->usewide=true;
+		/*
+		 * NOTE(review): the buffer holds exactly lenstr wide characters and
+		 * leaves no room for a terminator - confirm that char2wchar() never
+		 * stores a trailing L'\0'.
+		 */
+		prs->wstr = (wchar_t*)palloc( sizeof(wchar_t) * prs->lenstr );
+		prs->lenwstr = char2wchar( prs->wstr, prs->str, prs->lenstr );
+	} else
+#endif
+		prs->usewide=false;
+
+	prs->state = newTParserPosition(NULL);
+	prs->state->state = TPS_Base;
+
+	return prs;
+}
+
+/*
+ * Release a parser made by TParserInit(): every position record reachable
+ * through the prev links, the wide-character copy if one was allocated, and
+ * the parser itself.  The input string is not freed here.
+ */
+void
+TParserClose( TParser* prs ) {
+	while( prs->state ) {
+		TParserPosition *ptr = prs->state->prev;
+		pfree( prs->state );
+		prs->state = ptr;
+	}
+
+	/* wstr stays NULL (palloc0) unless the wide-character path was taken */
+	if ( prs->wstr )
+		pfree( prs->wstr );
+	pfree( prs );
+}
+
+/*
+ * Support functions equivalent to the is*() macros, but
+ * working with any possible encoding and locale
+ */
+
+#ifdef TS_USE_WIDE
+
+/*
+ * p_iswhat(type) generates p_is<type>() and p_isnot<type>().
+ * p_is<type>() classifies the current character: with isw<type>() on the
+ * wide copy (indexed by poschar) when usewide is set, otherwise with
+ * is<type>() on the byte at posbyte.
+ */
+#define p_iswhat(type) \
+static int \
+p_is##type(TParser *prs) { \
+	Assert( prs->state ); \
+	return ( ( prs->usewide ) ? isw##type( (wint_t)*( prs->wstr + prs->state->poschar ) ) : \
+		is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ) ); \
+} \
+	\
+static int \
+p_isnot##type(TParser *prs) { \
+	return !p_is##type(prs); \
+}
+
+
+
+/* p_iseq should be used only for ascii symbols */
+
+/* Returns 1 when the current character is one byte long and equals c */
+static int
+p_iseq(TParser *prs, char c) {
+	Assert( prs->state );
+	return ( ( prs->state->charlen==1 && *( prs->str + prs->state->posbyte ) == c ) ) ? 1 : 0;
+}
+
+#else /* TS_USE_WIDE */
+
+/*
+ * Single-byte variant: p_is<type>() applies is<type>() to the byte at
+ * posbyte, p_isnot<type>() is its negation.
+ */
+#define p_iswhat(type) \
+static int \
+p_is##type(TParser *prs) { \
+	Assert( prs->state ); \
+	return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
+} \
+	\
+static int \
+p_isnot##type(TParser *prs) { \
+	return !p_is##type(prs); \
+}
+
+
+/* Returns 1 when the byte at the current position equals c */
+static int
+p_iseq(TParser *prs, char c) {
+	Assert( prs->state );
+	return ( *( prs->str + prs->state->posbyte ) == c ) ? 1 : 0;
+}
+
+#endif /* TS_USE_WIDE */
+
+p_iswhat(alnum)
+p_iswhat(alpha)
+p_iswhat(digit)
+p_iswhat(lower)
+p_iswhat(print)
+p_iswhat(punct)
+p_iswhat(space)
+p_iswhat(upper)
+p_iswhat(xdigit)
+
+/*
+ * Returns 1 when the byte position has reached the end of the input or the
+ * current character length is 0.
+ */
+static int
+p_isEOF(TParser *prs) {
+	Assert( prs->state );
+	return (prs->state->posbyte == prs->lenstr || prs->state->charlen==0) ? 1 : 0;
+}
+
+/* Returns 1 when the current character equals the parser's prs->c */
+static int
+p_iseqC(TParser *prs) {
+	return p_iseq(prs, prs->c);
+}
+
+/* Negation of p_iseqC() */
+static int
+p_isneC(TParser *prs) {
+	return !p_iseq(prs, prs->c);
+}
+
+/* Returns 1 when the current character is a single byte accepted by isascii() */
+static int
+p_isascii(TParser *prs) {
+	return ( prs->state->charlen==1 && isascii( (unsigned char) *( prs->str + prs->state->posbyte ) ) ) ? 1 : 0;
+}
+
+/* Alphabetic and ascii */
+static int
+p_islatin(TParser *prs) {
+	return ( p_isalpha(prs) && p_isascii(prs) ) ? 1 : 0;
+}
+
+/* Alphabetic but not ascii */
+static int
+p_isnonlatin(TParser *prs) {
+	return ( p_isalpha(prs) && !p_isascii(prs) ) ? 1 : 0;
+}
+
+/*
+ * References every generated predicate; judging by the name this is there
+ * only to keep the compiler from complaining about unused static functions.
+ * Do not call it: each callee dereferences the NULL argument it is handed.
+ */
+void _make_compiler_happy(void);
+void
+_make_compiler_happy(void) {
+	p_isalnum(NULL); p_isnotalnum(NULL);
+	p_isalpha(NULL); p_isnotalpha(NULL);
+	p_isdigit(NULL); p_isnotdigit(NULL);
+	p_islower(NULL); p_isnotlower(NULL);
+	p_isprint(NULL); p_isnotprint(NULL);
+	p_ispunct(NULL); p_isnotpunct(NULL);
+	p_isspace(NULL); p_isnotspace(NULL);
+	p_isupper(NULL); p_isnotupper(NULL);
+	p_isxdigit(NULL); p_isnotxdigit(NULL);
+	p_isEOF(NULL);
+	p_iseqC(NULL); p_isneC(NULL);
+}
+
+
+static void
+SpecialTags(TParser *prs) {
+	switch( prs->state->lencharlexeme ) {
+		case 8: /* lexeme, "ignore = false;
+			break;
+		case 7: /*