diff options
Diffstat (limited to 'contrib/fuzzystrmatch/fuzzystrmatch.c')
-rw-r--r-- | contrib/fuzzystrmatch/fuzzystrmatch.c | 731 |
1 files changed, 0 insertions, 731 deletions
diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c deleted file mode 100644 index fbb16f66b33..00000000000 --- a/contrib/fuzzystrmatch/fuzzystrmatch.c +++ /dev/null @@ -1,731 +0,0 @@ -/* - * fuzzystrmatch.c - * - * Functions for "fuzzy" comparison of strings - * - * Copyright (c) Joseph Conway <joseph.conway@home.com>, 2001; - * - * levenshtein() - * ------------- - * Written based on a description of the algorithm by Michael Gilleland - * found at http://www.merriampark.com/ld.htm - * Also looked at levenshtein.c in the PHP 4.0.6 distribution for - * inspiration. - * - * metaphone() - * ----------- - * Modified for PostgreSQL by Joe Conway. - * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com> - * Code slightly modified for use as PostgreSQL function (palloc, elog, etc). - * Metaphone was originally created by Lawrence Philips and presented in article - * in "Computer Language" December 1990 issue. - * - * Permission to use, copy, modify, and distribute this software and its - * documentation for any purpose, without fee, and without a written agreement - * is hereby granted, provided that the above copyright notice and this - * paragraph and the following two paragraphs appear in all copies. - * - * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR - * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING - * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS - * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, - * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY - * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS - * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO - * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - * - */ - -#include "fuzzystrmatch.h" - -/* - * Calculates Levenshtein Distance between two strings. - * Uses simplest and fastest cost model only, i.e. assumes a cost of 1 for - * each deletion, substitution, or insertion. - */ -PG_FUNCTION_INFO_V1(levenshtein); -Datum -levenshtein(PG_FUNCTION_ARGS) -{ - char *str_s; - char *str_s0; - char *str_t; - int cols = 0; - int rows = 0; - int *u_cells; - int *l_cells; - int *tmp; - int i; - int j; - - /* - * Fetch the arguments. str_s is referred to as the "source" cols = - * length of source + 1 to allow for the initialization column str_t - * is referred to as the "target", rows = length of target + 1 rows = - * length of target + 1 to allow for the initialization row - */ - str_s = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0)))); - str_t = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(1)))); - - cols = strlen(str_s) + 1; - rows = strlen(str_t) + 1; - - /* - * Restrict the length of the strings being compared to something - * reasonable because we will have to perform rows * cols - * calcualtions. If longer strings need to be compared, increase - * MAX_LEVENSHTEIN_STRLEN to suit (but within your tolerance for speed - * and memory usage). - */ - if ((cols > MAX_LEVENSHTEIN_STRLEN + 1) || (rows > MAX_LEVENSHTEIN_STRLEN + 1)) - elog(ERROR, "levenshtein: Arguments may not exceed %d characters in length", MAX_LEVENSHTEIN_STRLEN); - - /* - * If either rows or cols is 0, the answer is the other value. This - * makes sense since it would take that many insertions the build a - * matching string - */ - - if (cols == 0) - PG_RETURN_INT32(rows); - - if (rows == 0) - PG_RETURN_INT32(cols); - - /* - * Allocate two vectors of integers. One will be used for the "upper" - * row, the other for the "lower" row. Initialize the "upper" row to - * 0..cols - */ - u_cells = palloc(sizeof(int) * cols); - for (i = 0; i < cols; i++) - u_cells[i] = i; - - l_cells = palloc(sizeof(int) * cols); - - /* - * Use str_s0 to "rewind" the pointer to str_s in the nested for loop - * below - */ - str_s0 = str_s; - - /* - * Loop throught the rows, starting at row 1. Row 0 is used for the - * initial "upper" row. - */ - for (j = 1; j < rows; j++) - { - /* - * We'll always start with col 1, and initialize lower row col 0 - * to j - */ - l_cells[0] = j; - - for (i = 1; i < cols; i++) - { - int c = 0; - int c1 = 0; - int c2 = 0; - int c3 = 0; - - /* - * The "cost" value is 0 if the character at the current col - * position in the source string, matches the character at the - * current row position in the target string; cost is 1 - * otherwise. - */ - c = ((CHAREQ(str_s, str_t)) ? 0 : 1); - - /* - * c1 is upper right cell plus 1 - */ - c1 = u_cells[i] + 1; - - /* - * c2 is lower left cell plus 1 - */ - c2 = l_cells[i - 1] + 1; - - /* - * c3 is cell diagonally above to the left plus "cost" - */ - c3 = u_cells[i - 1] + c; - - /* - * The lower right cell is set to the minimum of c1, c2, c3 - */ - l_cells[i] = (c1 < c2 ? c1 : c2) < c3 ? (c1 < c2 ? c1 : c2) : c3; - - /* - * Increment the pointer to str_s - */ - NextChar(str_s); - } - - /* - * Lower row now becomes the upper row, and the upper row gets - * reused as the new lower row. - */ - tmp = u_cells; - u_cells = l_cells; - l_cells = tmp; - - /* - * Increment the pointer to str_t - */ - NextChar(str_t); - - /* - * Rewind the pointer to str_s - */ - str_s = str_s0; - } - - /* - * Because the final value (at position row, col) was swapped from the - * lower row to the upper row, that's where we'll find it. - */ - PG_RETURN_INT32(u_cells[cols - 1]); -} - -/* - * Calculates the metaphone of an input string. - * Returns number of characters requested - * (suggested value is 4) - */ -PG_FUNCTION_INFO_V1(metaphone); -Datum -metaphone(PG_FUNCTION_ARGS) -{ - int reqlen; - char *str_i; - size_t str_i_len; - char *metaph; - text *result_text; - int retval; - - str_i = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(PG_GETARG_TEXT_P(0)))); - str_i_len = strlen(str_i); - - if (str_i_len > MAX_METAPHONE_STRLEN) - elog(ERROR, "metaphone: Input string must not exceed %d characters", MAX_METAPHONE_STRLEN); - if (!(str_i_len > 0)) - elog(ERROR, "metaphone: Input string length must be > 0"); - - reqlen = PG_GETARG_INT32(1); - if (reqlen > MAX_METAPHONE_STRLEN) - elog(ERROR, "metaphone: Requested Metaphone output length must not exceed %d characters", MAX_METAPHONE_STRLEN); - if (!(reqlen > 0)) - elog(ERROR, "metaphone: Requested Metaphone output length must be > 0"); - - metaph = palloc(reqlen); - memset(metaph, '\0', reqlen); - - retval = _metaphone(str_i, reqlen, &metaph); - if (retval == META_SUCCESS) - { - result_text = DatumGetTextP(DirectFunctionCall1(textin, CStringGetDatum(metaph))); - PG_RETURN_TEXT_P(result_text); - } - else - { - elog(ERROR, "metaphone: failure"); - - /* - * Keep the compiler quiet - */ - PG_RETURN_NULL(); - } -} - - -/* - * Original code by Michael G Schwern starts here. - * Code slightly modified for use as PostgreSQL - * function (palloc, etc). Original includes - * are rolled into fuzzystrmatch.h - *------------------------------------------------------------------*/ - -/* I suppose I could have been using a character pointer instead of - * accesssing the array directly... */ - -/* Look at the next letter in the word */ -#define Next_Letter (toupper((unsigned char) word[w_idx+1])) -/* Look at the current letter in the word */ -#define Curr_Letter (toupper((unsigned char) word[w_idx])) -/* Go N letters back. */ -#define Look_Back_Letter(n) \ - (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0') -/* Previous letter. I dunno, should this return null on failure? */ -#define Prev_Letter (Look_Back_Letter(1)) -/* Look two letters down. It makes sure you don't walk off the string. */ -#define After_Next_Letter \ - (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0') -#define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n)) - - -/* Allows us to safely look ahead an arbitrary # of letters */ -/* I probably could have just used strlen... */ -char -Lookahead(char *word, int how_far) -{ - char letter_ahead = '\0'; /* null by default */ - int idx; - - for (idx = 0; word[idx] != '\0' && idx < how_far; idx++); - /* Edge forward in the string... */ - - letter_ahead = word[idx]; /* idx will be either == to how_far or at - * the end of the string */ - return letter_ahead; -} - - -/* phonize one letter */ -#define Phonize(c) do {(*phoned_word)[p_idx++] = c;} while (0) -/* Slap a null character on the end of the phoned word */ -#define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0) -/* How long is the phoned word? */ -#define Phone_Len (p_idx) - -/* Note is a letter is a 'break' in the word */ -#define Isbreak(c) (!isalpha((unsigned char) (c))) - - -int -_metaphone( - /* IN */ - char *word, - int max_phonemes, - /* OUT */ - char **phoned_word -) -{ - int w_idx = 0; /* point in the phonization we're at. */ - int p_idx = 0; /* end of the phoned phrase */ - - /*-- Parameter checks --*/ - - /* - * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001 - */ - - /* Negative phoneme length is meaningless */ - if (!(max_phonemes > 0)) - elog(ERROR, "metaphone: Requested output length must be > 0"); - - /* Empty/null string is meaningless */ - if ((word == NULL) || !(strlen(word) > 0)) - elog(ERROR, "metaphone: Input string length must be > 0"); - - /*-- Allocate memory for our phoned_phrase --*/ - if (max_phonemes == 0) - { /* Assume largest possible */ - *phoned_word = palloc(sizeof(char) * strlen(word) +1); - if (!*phoned_word) - return META_ERROR; - } - else - { - *phoned_word = palloc(sizeof(char) * max_phonemes + 1); - if (!*phoned_word) - return META_ERROR; - } - - /*-- The first phoneme has to be processed specially. --*/ - /* Find our first letter */ - for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++) - { - /* On the off chance we were given nothing but crap... */ - if (Curr_Letter == '\0') - { - End_Phoned_Word; - return META_SUCCESS; /* For testing */ - } - } - - switch (Curr_Letter) - { - /* AE becomes E */ - case 'A': - if (Next_Letter == 'E') - { - Phonize('E'); - w_idx += 2; - } - /* Remember, preserve vowels at the beginning */ - else - { - Phonize('A'); - w_idx++; - } - break; - /* [GKP]N becomes N */ - case 'G': - case 'K': - case 'P': - if (Next_Letter == 'N') - { - Phonize('N'); - w_idx += 2; - } - break; - - /* - * WH becomes H, WR becomes R W if followed by a vowel - */ - case 'W': - if (Next_Letter == 'H' || - Next_Letter == 'R') - { - Phonize(Next_Letter); - w_idx += 2; - } - else if (isvowel(Next_Letter)) - { - Phonize('W'); - w_idx += 2; - } - /* else ignore */ - break; - /* X becomes S */ - case 'X': - Phonize('S'); - w_idx++; - break; - /* Vowels are kept */ - - /* - * We did A already case 'A': case 'a': - */ - case 'E': - case 'I': - case 'O': - case 'U': - Phonize(Curr_Letter); - w_idx++; - break; - default: - /* do nothing */ - break; - } - - - - /* On to the metaphoning */ - for (; Curr_Letter != '\0' && - (max_phonemes == 0 || Phone_Len < max_phonemes); - w_idx++) - { - /* - * How many letters to skip because an eariler encoding handled - * multiple letters - */ - unsigned short int skip_letter = 0; - - - /* - * THOUGHT: It would be nice if, rather than having things - * like... well, SCI. For SCI you encode the S, then have to - * remember to skip the C. So the phonome SCI invades both S and - * C. It would be better, IMHO, to skip the C from the S part of - * the encoding. Hell, I'm trying it. - */ - - /* Ignore non-alphas */ - if (!isalpha((unsigned char) (Curr_Letter))) - continue; - - /* Drop duplicates, except CC */ - if (Curr_Letter == Prev_Letter && - Curr_Letter != 'C') - continue; - - switch (Curr_Letter) - { - /* B -> B unless in MB */ - case 'B': - if (Prev_Letter != 'M') - Phonize('B'); - break; - - /* - * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW - * is handled in S) S if -CI-, -CE- or -CY- dropped if - * -SCI-, SCE-, -SCY- (handed in S) else K - */ - case 'C': - if (MAKESOFT(Next_Letter)) - { /* C[IEY] */ - if (After_Next_Letter == 'A' && - Next_Letter == 'I') - { /* CIA */ - Phonize(SH); - } - /* SC[IEY] */ - else if (Prev_Letter == 'S') - { - /* Dropped */ - } - else - Phonize('S'); - } - else if (Next_Letter == 'H') - { -#ifndef USE_TRADITIONAL_METAPHONE - if (After_Next_Letter == 'R' || - Prev_Letter == 'S') - { /* Christ, School */ - Phonize('K'); - } - else - Phonize(SH); -#else - Phonize(SH); -#endif - skip_letter++; - } - else - Phonize('K'); - break; - - /* - * J if in -DGE-, -DGI- or -DGY- else T - */ - case 'D': - if (Next_Letter == 'G' && - MAKESOFT(After_Next_Letter)) - { - Phonize('J'); - skip_letter++; - } - else - Phonize('T'); - break; - - /* - * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else - * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or - * -DGY- (handled in D) else J if in -GE-, -GI, -GY and - * not GG else K - */ - case 'G': - if (Next_Letter == 'H') - { - if (!(NOGHTOF(Look_Back_Letter(3)) || - Look_Back_Letter(4) == 'H')) - { - Phonize('F'); - skip_letter++; - } - else - { - /* silent */ - } - } - else if (Next_Letter == 'N') - { - if (Isbreak(After_Next_Letter) || - (After_Next_Letter == 'E' && - Look_Ahead_Letter(3) == 'D')) - { - /* dropped */ - } - else - Phonize('K'); - } - else if (MAKESOFT(Next_Letter) && - Prev_Letter != 'G') - Phonize('J'); - else - Phonize('K'); - break; - /* H if before a vowel and not after C,G,P,S,T */ - case 'H': - if (isvowel(Next_Letter) && - !AFFECTH(Prev_Letter)) - Phonize('H'); - break; - - /* - * dropped if after C else K - */ - case 'K': - if (Prev_Letter != 'C') - Phonize('K'); - break; - - /* - * F if before H else P - */ - case 'P': - if (Next_Letter == 'H') - Phonize('F'); - else - Phonize('P'); - break; - - /* - * K - */ - case 'Q': - Phonize('K'); - break; - - /* - * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S - */ - case 'S': - if (Next_Letter == 'I' && - (After_Next_Letter == 'O' || - After_Next_Letter == 'A')) - Phonize(SH); - else if (Next_Letter == 'H') - { - Phonize(SH); - skip_letter++; - } -#ifndef USE_TRADITIONAL_METAPHONE - else if (Next_Letter == 'C' && - Look_Ahead_Letter(2) == 'H' && - Look_Ahead_Letter(3) == 'W') - { - Phonize(SH); - skip_letter += 2; - } -#endif - else - Phonize('S'); - break; - - /* - * 'sh' in -TIA- or -TIO- else 'th' before H else T - */ - case 'T': - if (Next_Letter == 'I' && - (After_Next_Letter == 'O' || - After_Next_Letter == 'A')) - Phonize(SH); - else if (Next_Letter == 'H') - { - Phonize(TH); - skip_letter++; - } - else - Phonize('T'); - break; - /* F */ - case 'V': - Phonize('F'); - break; - /* W before a vowel, else dropped */ - case 'W': - if (isvowel(Next_Letter)) - Phonize('W'); - break; - /* KS */ - case 'X': - Phonize('K'); - Phonize('S'); - break; - /* Y if followed by a vowel */ - case 'Y': - if (isvowel(Next_Letter)) - Phonize('Y'); - break; - /* S */ - case 'Z': - Phonize('S'); - break; - /* No transformation */ - case 'F': - case 'J': - case 'L': - case 'M': - case 'N': - case 'R': - Phonize(Curr_Letter); - break; - default: - /* nothing */ - break; - } /* END SWITCH */ - - w_idx += skip_letter; - } /* END FOR */ - - End_Phoned_Word; - - return (META_SUCCESS); -} /* END metaphone */ - - -/* - * SQL function: soundex(text) returns text - */ -PG_FUNCTION_INFO_V1(soundex); - -Datum -soundex(PG_FUNCTION_ARGS) -{ - char outstr[SOUNDEX_LEN + 1]; - char *arg; - - arg = _textout(PG_GETARG_TEXT_P(0)); - - _soundex(arg, outstr); - - PG_RETURN_TEXT_P(_textin(outstr)); -} - -static void -_soundex(const char *instr, char *outstr) -{ - int count; - - AssertArg(instr); - AssertArg(outstr); - - outstr[SOUNDEX_LEN] = '\0'; - - /* Skip leading non-alphabetic characters */ - while (!isalpha((unsigned char) instr[0]) && instr[0]) - ++instr; - - /* No string left */ - if (!instr[0]) - { - outstr[0] = (char) 0; - return; - } - - /* Take the first letter as is */ - *outstr++ = (char) toupper((unsigned char) *instr++); - - count = 1; - while (*instr && count < SOUNDEX_LEN) - { - if (isalpha((unsigned char) *instr) && - soundex_code(*instr) != soundex_code(*(instr - 1))) - { - *outstr = soundex_code(instr[0]); - if (*outstr != '0') - { - ++outstr; - ++count; - } - } - ++instr; - } - - /* Fill with 0's */ - while (count < SOUNDEX_LEN) - { - *outstr = '0'; - ++outstr; - ++count; - } -} |