LCOV - differential code coverage report
Current view: top level - contrib/fuzzystrmatch - fuzzystrmatch.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 44.1 % 279 123 9 67 80 1 84 2 36 75 77 1
Current Date: 2023-04-08 15:15:32 Functions: 90.0 % 20 18 2 18 2 18
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*
       2                 :  * fuzzystrmatch.c
       3                 :  *
       4                 :  * Functions for "fuzzy" comparison of strings
       5                 :  *
       6                 :  * Joe Conway <mail@joeconway.com>
       7                 :  *
       8                 :  * contrib/fuzzystrmatch/fuzzystrmatch.c
       9                 :  * Copyright (c) 2001-2023, PostgreSQL Global Development Group
      10                 :  * ALL RIGHTS RESERVED;
      11                 :  *
      12                 :  * metaphone()
      13                 :  * -----------
      14                 :  * Modified for PostgreSQL by Joe Conway.
      15                 :  * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
      16                 :  * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
      17                 :  * Metaphone was originally created by Lawrence Philips and presented in article
      18                 :  * in "Computer Language" December 1990 issue.
      19                 :  *
      20                 :  * Permission to use, copy, modify, and distribute this software and its
      21                 :  * documentation for any purpose, without fee, and without a written agreement
      22                 :  * is hereby granted, provided that the above copyright notice and this
      23                 :  * paragraph and the following two paragraphs appear in all copies.
      24                 :  *
      25                 :  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
      26                 :  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
      27                 :  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
      28                 :  * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
      29                 :  * POSSIBILITY OF SUCH DAMAGE.
      30                 :  *
      31                 :  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
      32                 :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      33                 :  * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
      34                 :  * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
      35                 :  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
      36                 :  *
      37                 :  */
      38                 : 
      39                 : #include "postgres.h"
      40                 : 
      41                 : #include <ctype.h>
      42                 : 
      43                 : #include "mb/pg_wchar.h"
      44                 : #include "utils/builtins.h"
      45                 : #include "utils/varlena.h"
      46                 : #include "varatt.h"
      47                 : 
      48 GIC           2 : PG_MODULE_MAGIC;
      49 ECB             : 
      50                 : /*
      51                 :  * Soundex
      52                 :  */
      53                 : static void _soundex(const char *instr, char *outstr);
      54                 : 
      55                 : #define SOUNDEX_LEN 4
      56                 : 
      57                 : /*                                  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
      58                 : static const char *soundex_table = "01230120022455012623010202";
      59                 : 
      60                 : static char
      61 GIC         127 : soundex_code(char letter)
      62 ECB             : {
      63 GIC         127 :     letter = toupper((unsigned char) letter);
      64 ECB             :     /* Defend against non-ASCII letters */
      65 GIC         127 :     if (letter >= 'A' && letter <= 'Z')
      66 CBC         126 :         return soundex_table[letter - 'A'];
      67               1 :     return letter;
      68 ECB             : }
      69                 : 
      70                 : /*
      71                 :  * Metaphone
      72                 :  */
      73                 : #define MAX_METAPHONE_STRLEN        255
      74                 : 
      75                 : /*
      76                 :  * Original code by Michael G Schwern starts here.
      77                 :  * Code slightly modified for use as PostgreSQL function.
      78                 :  */
      79                 : 
      80                 : 
      81                 : /**************************************************************************
      82                 :     metaphone -- Breaks English phrases down into their phonemes.
      83                 : 
      84                 :     Input
      85                 :         word            --  An English word to be phonized
      86                 :         max_phonemes    --  How many phonemes to calculate.  Must be > 0;
      87                 :                             _metaphone() rejects 0 ("entire phrase") as an error.
      88                 :         phoned_word     --  The final phonized word.  (We'll allocate the
      89                 :                             memory.)
      90                 :     Output
      91                 :         error   --  A simple error flag in the original; here _metaphone() returns void
      92                 : 
      93                 :     NOTES:  ALL non-alpha characters are ignored, this includes whitespace,
      94                 :     although non-alpha characters will break up phonemes.
      95                 : ****************************************************************************/
      96                 : 
      97                 : 
      98                 : /*  I add modifications to the traditional metaphone algorithm that you
      99                 :     might find in books.  Define this if you want metaphone to behave
     100                 :     traditionally */
     101                 : #undef USE_TRADITIONAL_METAPHONE
     102                 : 
     103                 : /* Special encodings */
     104                 : #define  SH     'X'
     105                 : #define  TH     '0'
     106                 : 
     107                 : static char Lookahead(char *word, int how_far);
     108                 : static void _metaphone(char *word, int max_phonemes, char **phoned_word);
     109                 : 
     110                 : /* Metachar.h ... little bits about characters for metaphone */
     111                 : 
     112                 : 
     113                 : /*-- Character encoding array & accessing macros --*/
     114                 : /* Stolen directly out of the book... */
     115                 : static const char _codes[26] = {
     116                 :     1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
     117                 : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
     118                 : };
     119                 : 
     120                 : static int
     121 GIC           1 : getcode(char c)
     122 ECB             : {
     123 GIC           1 :     if (isalpha((unsigned char) c))
     124 ECB             :     {
     125 GIC           1 :         c = toupper((unsigned char) c);
     126 ECB             :         /* Defend against non-ASCII letters */
     127 GIC           1 :         if (c >= 'A' && c <= 'Z')
     128 CBC           1 :             return _codes[c - 'A'];
     129 ECB             :     }
     130 UIC           0 :     return 0;
     131 EUB             : }
     132                 : 
     133                 : #define isvowel(c)  (getcode(c) & 1)    /* AEIOU */
     134                 : 
     135                 : /* These letters are passed through unchanged */
     136                 : #define NOCHANGE(c) (getcode(c) & 2)    /* FJMNR */
     137                 : 
     138                 : /* These form diphthongs when preceding H */
     139                 : #define AFFECTH(c)  (getcode(c) & 4)    /* CGPST */
     140                 : 
     141                 : /* These make C and G soft */
     142                 : #define MAKESOFT(c) (getcode(c) & 8)    /* EIY */
     143                 : 
     144                 : /* These prevent GH from becoming F */
     145                 : #define NOGHTOF(c)  (getcode(c) & 16)   /* BDH */
     146                 : 
     147 GIC           2 : PG_FUNCTION_INFO_V1(levenshtein_with_costs);
     148 ECB             : Datum
     149 GIC           1 : levenshtein_with_costs(PG_FUNCTION_ARGS)
     150 ECB             : {
     151 GIC           1 :     text       *src = PG_GETARG_TEXT_PP(0);
     152 CBC           1 :     text       *dst = PG_GETARG_TEXT_PP(1);
     153               1 :     int         ins_c = PG_GETARG_INT32(2);
     154               1 :     int         del_c = PG_GETARG_INT32(3);
     155               1 :     int         sub_c = PG_GETARG_INT32(4);
     156 ECB             :     const char *s_data;
     157                 :     const char *t_data;
     158                 :     int         s_bytes,
     159                 :                 t_bytes;
     160                 : 
     161                 :     /* Extract a pointer to the actual character data */
     162 GIC           1 :     s_data = VARDATA_ANY(src);
     163 CBC           1 :     t_data = VARDATA_ANY(dst);
     164 ECB             :     /* Determine length of each string in bytes */
     165 GIC           1 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     166 CBC           1 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     167 ECB             : 
     168 GIC           1 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     169 ECB             :                                        ins_c, del_c, sub_c, false));
     170                 : }
     171                 : 
     172                 : 
     173 GIC           2 : PG_FUNCTION_INFO_V1(levenshtein);
     174 ECB             : Datum
     175 GIC           1 : levenshtein(PG_FUNCTION_ARGS)
     176 ECB             : {
     177 GIC           1 :     text       *src = PG_GETARG_TEXT_PP(0);
     178 CBC           1 :     text       *dst = PG_GETARG_TEXT_PP(1);
     179 ECB             :     const char *s_data;
     180                 :     const char *t_data;
     181                 :     int         s_bytes,
     182                 :                 t_bytes;
     183                 : 
     184                 :     /* Extract a pointer to the actual character data */
     185 GIC           1 :     s_data = VARDATA_ANY(src);
     186 CBC           1 :     t_data = VARDATA_ANY(dst);
     187 ECB             :     /* Determine length of each string in bytes */
     188 GIC           1 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     189 CBC           1 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     190 ECB             : 
     191 GIC           1 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
     192 ECB             :                                        1, 1, 1, false));
     193                 : }
     194                 : 
     195                 : 
     196 GIC           1 : PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
     197 ECB             : Datum
     198 UIC           0 : levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
     199 EUB             : {
     200 UIC           0 :     text       *src = PG_GETARG_TEXT_PP(0);
     201 UBC           0 :     text       *dst = PG_GETARG_TEXT_PP(1);
     202               0 :     int         ins_c = PG_GETARG_INT32(2);
     203               0 :     int         del_c = PG_GETARG_INT32(3);
     204               0 :     int         sub_c = PG_GETARG_INT32(4);
     205               0 :     int         max_d = PG_GETARG_INT32(5);
     206 EUB             :     const char *s_data;
     207                 :     const char *t_data;
     208                 :     int         s_bytes,
     209                 :                 t_bytes;
     210                 : 
     211                 :     /* Extract a pointer to the actual character data */
     212 UIC           0 :     s_data = VARDATA_ANY(src);
     213 UBC           0 :     t_data = VARDATA_ANY(dst);
     214 EUB             :     /* Determine length of each string in bytes */
     215 UIC           0 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     216 UBC           0 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     217 EUB             : 
     218 UIC           0 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     219 EUB             :                                                   t_data, t_bytes,
     220                 :                                                   ins_c, del_c, sub_c,
     221                 :                                                   max_d, false));
     222                 : }
     223                 : 
     224                 : 
     225 GIC           2 : PG_FUNCTION_INFO_V1(levenshtein_less_equal);
     226 ECB             : Datum
     227 GIC           2 : levenshtein_less_equal(PG_FUNCTION_ARGS)
     228 ECB             : {
     229 GIC           2 :     text       *src = PG_GETARG_TEXT_PP(0);
     230 CBC           2 :     text       *dst = PG_GETARG_TEXT_PP(1);
     231               2 :     int         max_d = PG_GETARG_INT32(2);
     232 ECB             :     const char *s_data;
     233                 :     const char *t_data;
     234                 :     int         s_bytes,
     235                 :                 t_bytes;
     236                 : 
     237                 :     /* Extract a pointer to the actual character data */
     238 GIC           2 :     s_data = VARDATA_ANY(src);
     239 CBC           2 :     t_data = VARDATA_ANY(dst);
     240 ECB             :     /* Determine length of each string in bytes */
     241 GIC           2 :     s_bytes = VARSIZE_ANY_EXHDR(src);
     242 CBC           2 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
     243 ECB             : 
     244 GIC           2 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
     245 ECB             :                                                   t_data, t_bytes,
     246                 :                                                   1, 1, 1,
     247                 :                                                   max_d, false));
     248                 : }
     249                 : 
     250                 : 
     251                 : /*
     252                 :  * Calculates the metaphone of an input string.
     253                 :  * Returns number of characters requested
     254                 :  * (suggested value is 4)
     255                 :  */
     256 GIC           2 : PG_FUNCTION_INFO_V1(metaphone);
     257 ECB             : Datum
     258 GIC           1 : metaphone(PG_FUNCTION_ARGS)
     259 ECB             : {
     260 GIC           1 :     char       *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
     261 CBC           1 :     size_t      str_i_len = strlen(str_i);
     262 ECB             :     int         reqlen;
     263                 :     char       *metaph;
     264                 : 
     265                 :     /* return an empty string if we receive one */
     266 GIC           1 :     if (!(str_i_len > 0))
     267 LBC           0 :         PG_RETURN_TEXT_P(cstring_to_text(""));
     268 EUB             : 
     269 GIC           1 :     if (str_i_len > MAX_METAPHONE_STRLEN)
     270 LBC           0 :         ereport(ERROR,
     271 EUB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     272                 :                  errmsg("argument exceeds the maximum length of %d bytes",
     273                 :                         MAX_METAPHONE_STRLEN)));
     274                 : 
     275 GIC           1 :     reqlen = PG_GETARG_INT32(1);
     276 CBC           1 :     if (reqlen > MAX_METAPHONE_STRLEN)
     277 LBC           0 :         ereport(ERROR,
     278 EUB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     279                 :                  errmsg("output exceeds the maximum length of %d bytes",
     280                 :                         MAX_METAPHONE_STRLEN)));
     281                 : 
     282 GIC           1 :     if (!(reqlen > 0))
     283 LBC           0 :         ereport(ERROR,
     284 EUB             :                 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
     285                 :                  errmsg("output cannot be empty string")));
     286                 : 
     287 GIC           1 :     _metaphone(str_i, reqlen, &metaph);
     288 CBC           1 :     PG_RETURN_TEXT_P(cstring_to_text(metaph));
     289 ECB             : }
     290                 : 
     291                 : 
     292                 : /*
     293                 :  * Original code by Michael G Schwern starts here.
     294                 :  * Code slightly modified for use as PostgreSQL
     295                 :  * function (palloc, etc).
     296                 :  */
     297                 : 
     298                 : /* I suppose I could have been using a character pointer instead of
     299                 :  * accessing the array directly... */
     300                 : 
     301                 : /* Look at the next letter in the word */
     302                 : #define Next_Letter (toupper((unsigned char) word[w_idx+1]))
     303                 : /* Look at the current letter in the word */
     304                 : #define Curr_Letter (toupper((unsigned char) word[w_idx]))
     305                 : /* Go N letters back. */
     306                 : #define Look_Back_Letter(n) \
     307                 :     (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
     308                 : /* Previous letter.  I dunno, should this return null on failure? */
     309                 : #define Prev_Letter (Look_Back_Letter(1))
     310                 : /* Look two letters down.  It makes sure you don't walk off the string. */
     311                 : #define After_Next_Letter \
     312                 :     (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
     313                 : #define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
     314                 : 
     315                 : 
     316                 : /* Allows us to safely look ahead an arbitrary # of letters */
     317                 : /* I probably could have just used strlen... */
     318                 : static char
     319 UIC           0 : Lookahead(char *word, int how_far)
     320 EUB             : {
     321 UIC           0 :     char        letter_ahead = '\0';    /* null by default */
     322 EUB             :     int         idx;
     323                 : 
     324 UIC           0 :     for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
     325 EUB             :     /* Edge forward in the string... */
     326                 : 
     327 UIC           0 :     letter_ahead = word[idx];   /* idx will be either == to how_far or at the
     328 EUB             :                                  * end of the string */
     329 UIC           0 :     return letter_ahead;
     330 EUB             : }
     331                 : 
     332                 : 
     333                 : /* phonize one letter */
     334                 : #define Phonize(c)  do {(*phoned_word)[p_idx++] = c;} while (0)
     335                 : /* Slap a null character on the end of the phoned word */
     336                 : #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
     337                 : /* How long is the phoned word? */
     338                 : #define Phone_Len   (p_idx)
     339                 : 
     340                 : /* Note if a letter is a 'break' in the word */
     341                 : #define Isbreak(c)  (!isalpha((unsigned char) (c)))
     342                 : 
     343                 : 
     344                 : static void
     345 GIC           1 : _metaphone(char *word,          /* IN */
     346 ECB             :            int max_phonemes,
     347                 :            char **phoned_word)  /* OUT */
     348                 : {
     349 GIC           1 :     int         w_idx = 0;      /* point in the phonization we're at. */
     350 CBC           1 :     int         p_idx = 0;      /* end of the phoned phrase */
     351 ECB             : 
     352                 :     /*-- Parameter checks --*/
     353                 : 
     354                 :     /*
     355                 :      * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
     356                 :      */
     357                 : 
     358                 :     /* Negative phoneme length is meaningless */
     359 GIC           1 :     if (!(max_phonemes > 0))
     360 ECB             :         /* internal error */
     361 UIC           0 :         elog(ERROR, "metaphone: Requested output length must be > 0");
     362 EUB             : 
     363                 :     /* Empty/null string is meaningless */
     364 GIC           1 :     if ((word == NULL) || !(strlen(word) > 0))
     365 ECB             :         /* internal error */
     366 UIC           0 :         elog(ERROR, "metaphone: Input string length must be > 0");
     367 EUB             : 
     368                 :     /*-- Allocate memory for our phoned_phrase --*/
     369 GIC           1 :     if (max_phonemes == 0)
     370 ECB             :     {                           /* Assume largest possible */
     371 UIC           0 :         *phoned_word = palloc(sizeof(char) * strlen(word) + 1);
     372 EUB             :     }
     373                 :     else
     374                 :     {
     375 GIC           1 :         *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
     376 ECB             :     }
     377                 : 
     378                 :     /*-- The first phoneme has to be processed specially. --*/
     379                 :     /* Find our first letter */
     380 GIC           1 :     for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
     381 ECB             :     {
     382                 :         /* On the off chance we were given nothing but crap... */
     383 UIC           0 :         if (Curr_Letter == '\0')
     384 EUB             :         {
     385 UIC           0 :             End_Phoned_Word;
     386 UBC           0 :             return;
     387 EUB             :         }
     388                 :     }
     389                 : 
     390 GIC           1 :     switch (Curr_Letter)
     391 ECB             :     {
     392                 :             /* AE becomes E */
     393 UIC           0 :         case 'A':
     394 UBC           0 :             if (Next_Letter == 'E')
     395 EUB             :             {
     396 UIC           0 :                 Phonize('E');
     397 UBC           0 :                 w_idx += 2;
     398 EUB             :             }
     399                 :             /* Remember, preserve vowels at the beginning */
     400                 :             else
     401                 :             {
     402 UIC           0 :                 Phonize('A');
     403 UBC           0 :                 w_idx++;
     404 EUB             :             }
     405 UIC           0 :             break;
     406 EUB             :             /* [GKP]N becomes N */
     407 GIC           1 :         case 'G':
     408 ECB             :         case 'K':
     409                 :         case 'P':
     410 GIC           1 :             if (Next_Letter == 'N')
     411 ECB             :             {
     412 UIC           0 :                 Phonize('N');
     413 UBC           0 :                 w_idx += 2;
     414 EUB             :             }
     415 GIC           1 :             break;
     416 ECB             : 
     417                 :             /*
     418                 :              * WH becomes H, WR becomes R; W is kept if followed by a vowel
     419                 :              */
     420 UIC           0 :         case 'W':
     421 UBC           0 :             if (Next_Letter == 'H' ||
     422               0 :                 Next_Letter == 'R')
     423 EUB             :             {
     424 UIC           0 :                 Phonize(Next_Letter);
     425 UBC           0 :                 w_idx += 2;
     426 EUB             :             }
     427 UIC           0 :             else if (isvowel(Next_Letter))
     428 EUB             :             {
     429 UIC           0 :                 Phonize('W');
     430 UBC           0 :                 w_idx += 2;
     431 EUB             :             }
     432                 :             /* else ignore */
     433 UIC           0 :             break;
     434 EUB             :             /* X becomes S */
     435 UIC           0 :         case 'X':
     436 UBC           0 :             Phonize('S');
     437               0 :             w_idx++;
     438               0 :             break;
     439 EUB             :             /* Vowels are kept */
     440                 : 
     441                 :             /*
     442                 :              * 'A' was already handled above, so it is not repeated here.
     443                 :              */
     444 UIC           0 :         case 'E':
     445 EUB             :         case 'I':
     446                 :         case 'O':
     447                 :         case 'U':
     448 UIC           0 :             Phonize(Curr_Letter);
     449 UBC           0 :             w_idx++;
     450               0 :             break;
     451               0 :         default:
     452 EUB             :             /* do nothing */
     453 UIC           0 :             break;
     454 EUB             :     }
     455                 : 
     456                 : 
     457                 : 
     458                 :     /* On to the metaphoning */
     459 GIC           6 :     for (; Curr_Letter != '\0' &&
     460 CBC           5 :          (max_phonemes == 0 || Phone_Len < max_phonemes);
     461               5 :          w_idx++)
     462 ECB             :     {
     463                 :         /*
     464                 :          * How many letters to skip because an earlier encoding handled
     465                 :          * multiple letters
     466                 :          */
     467 GIC           5 :         unsigned short int skip_letter = 0;
     468 ECB             : 
     469                 : 
     470                 :         /*
     471                 :          * THOUGHT:  It would be nice if, rather than having things like...
     472                 :          * well, SCI.  For SCI you encode the S, then have to remember to skip
     473                 :          * the C.  So the phoneme SCI invades both S and C.  It would be
     474                 :          * better, IMHO, to skip the C from the S part of the encoding. Hell,
     475                 :          * I'm trying it.
     476                 :          */
     477                 : 
     478                 :         /* Ignore non-alphas */
     479 GIC           5 :         if (!isalpha((unsigned char) (Curr_Letter)))
     480 LBC           0 :             continue;
     481 EUB             : 
     482                 :         /* Drop duplicates, except CC */
     483 GIC           5 :         if (Curr_Letter == Prev_Letter &&
     484 LBC           0 :             Curr_Letter != 'C')
     485 UBC           0 :             continue;
     486 EUB             : 
     487 GIC           5 :         switch (Curr_Letter)
     488 ECB             :         {
     489                 :                 /* B -> B unless in MB */
     490 GIC           1 :             case 'B':
     491 CBC           1 :                 if (Prev_Letter != 'M')
     492 LBC           0 :                     Phonize('B');
     493 GBC           1 :                 break;
     494 ECB             : 
     495                 :                 /*
     496                 :                  * 'sh' if -CIA- or -CH, but not SCH, except SCHW (SCHW is
     497                 :                  * handled in S); S if -CI-, -CE- or -CY-; dropped if -SCI-,
     498                 :                  * -SCE-, -SCY- (handled in S); else K
     499                 :                  */
     500 UIC           0 :             case 'C':
     501 UBC           0 :                 if (MAKESOFT(Next_Letter))
     502 EUB             :                 {               /* C[IEY] */
     503 UIC           0 :                     if (After_Next_Letter == 'A' &&
     504 UBC           0 :                         Next_Letter == 'I')
     505 EUB             :                     {           /* CIA */
     506 UIC           0 :                         Phonize(SH);
     507 EUB             :                     }
     508                 :                     /* SC[IEY] */
     509 UIC           0 :                     else if (Prev_Letter == 'S')
     510 EUB             :                     {
     511                 :                         /* Dropped */
     512                 :                     }
     513                 :                     else
     514 UIC           0 :                         Phonize('S');
     515 EUB             :                 }
     516 UIC           0 :                 else if (Next_Letter == 'H')
     517 EUB             :                 {
     518                 : #ifndef USE_TRADITIONAL_METAPHONE
     519 UIC           0 :                     if (After_Next_Letter == 'R' ||
     520 UBC           0 :                         Prev_Letter == 'S')
     521 EUB             :                     {           /* Christ, School */
     522 UIC           0 :                         Phonize('K');
     523 EUB             :                     }
     524                 :                     else
     525 UIC           0 :                         Phonize(SH);
     526 EUB             : #else
     527                 :                     Phonize(SH);
     528                 : #endif
     529 UIC           0 :                     skip_letter++;
     530 EUB             :                 }
     531                 :                 else
     532 UIC           0 :                     Phonize('K');
     533 UBC           0 :                 break;
     534 EUB             : 
     535                 :                 /*
     536                 :                  * J if in -DGE-, -DGI- or -DGY- else T
     537                 :                  */
     538 UIC           0 :             case 'D':
     539 UBC           0 :                 if (Next_Letter == 'G' &&
     540               0 :                     MAKESOFT(After_Next_Letter))
     541 EUB             :                 {
     542 UIC           0 :                     Phonize('J');
     543 UBC           0 :                     skip_letter++;
     544 EUB             :                 }
     545                 :                 else
     546 UIC           0 :                     Phonize('T');
     547 UBC           0 :                 break;
     548 EUB             : 
     549                 :                 /*
     550                 :                  * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
     551                 :                  * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
     552                 :                  * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
     553                 :                  * else K
     554                 :                  */
     555 GIC           1 :             case 'G':
     556 CBC           1 :                 if (Next_Letter == 'H')
     557 ECB             :                 {
     558 UIC           0 :                     if (!(NOGHTOF(Look_Back_Letter(3)) ||
     559 UBC           0 :                           Look_Back_Letter(4) == 'H'))
     560 EUB             :                     {
     561 UIC           0 :                         Phonize('F');
     562 UBC           0 :                         skip_letter++;
     563 EUB             :                     }
     564                 :                     else
     565                 :                     {
     566                 :                         /* silent */
     567                 :                     }
     568                 :                 }
     569 GIC           1 :                 else if (Next_Letter == 'N')
     570 ECB             :                 {
     571 UIC           0 :                     if (Isbreak(After_Next_Letter) ||
     572 UBC           0 :                         (After_Next_Letter == 'E' &&
     573               0 :                          Look_Ahead_Letter(3) == 'D'))
     574 EUB             :                     {
     575                 :                         /* dropped */
     576                 :                     }
     577                 :                     else
     578 UIC           0 :                         Phonize('K');
     579 EUB             :                 }
     580 GIC           1 :                 else if (MAKESOFT(Next_Letter) &&
     581 LBC           0 :                          Prev_Letter != 'G')
     582 UBC           0 :                     Phonize('J');
     583 EUB             :                 else
     584 GIC           1 :                     Phonize('K');
     585 CBC           1 :                 break;
     586 ECB             :                 /* H if before a vowel and not after C,G,P,S,T */
     587 UIC           0 :             case 'H':
     588 UBC           0 :                 if (isvowel(Next_Letter) &&
     589               0 :                     !AFFECTH(Prev_Letter))
     590               0 :                     Phonize('H');
     591               0 :                 break;
     592 EUB             : 
     593                 :                 /*
     594                 :                  * dropped if after C else K
     595                 :                  */
     596 UIC           0 :             case 'K':
     597 UBC           0 :                 if (Prev_Letter != 'C')
     598               0 :                     Phonize('K');
     599               0 :                 break;
     600 EUB             : 
     601                 :                 /*
     602                 :                  * F if before H else P
     603                 :                  */
     604 UIC           0 :             case 'P':
     605 UBC           0 :                 if (Next_Letter == 'H')
     606               0 :                     Phonize('F');
     607 EUB             :                 else
     608 UIC           0 :                     Phonize('P');
     609 UBC           0 :                 break;
     610 EUB             : 
     611                 :                 /*
     612                 :                  * K
     613                 :                  */
     614 UIC           0 :             case 'Q':
     615 UBC           0 :                 Phonize('K');
     616               0 :                 break;
     617 EUB             : 
     618                 :                 /*
     619                 :                  * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
     620                 :                  */
     621 UIC           0 :             case 'S':
     622 UBC           0 :                 if (Next_Letter == 'I' &&
     623               0 :                     (After_Next_Letter == 'O' ||
     624               0 :                      After_Next_Letter == 'A'))
     625               0 :                     Phonize(SH);
     626               0 :                 else if (Next_Letter == 'H')
     627 EUB             :                 {
     628 UIC           0 :                     Phonize(SH);
     629 UBC           0 :                     skip_letter++;
     630 EUB             :                 }
     631                 : #ifndef USE_TRADITIONAL_METAPHONE
     632 UIC           0 :                 else if (Next_Letter == 'C' &&
     633 UBC           0 :                          Look_Ahead_Letter(2) == 'H' &&
     634               0 :                          Look_Ahead_Letter(3) == 'W')
     635 EUB             :                 {
     636 UIC           0 :                     Phonize(SH);
     637 UBC           0 :                     skip_letter += 2;
     638 EUB             :                 }
     639                 : #endif
     640                 :                 else
     641 UIC           0 :                     Phonize('S');
     642 UBC           0 :                 break;
     643 EUB             : 
     644                 :                 /*
     645                 :                  * 'sh' in -TIA- or -TIO- else 'th' before H else T
     646                 :                  */
     647 UIC           0 :             case 'T':
     648 UBC           0 :                 if (Next_Letter == 'I' &&
     649               0 :                     (After_Next_Letter == 'O' ||
     650               0 :                      After_Next_Letter == 'A'))
     651               0 :                     Phonize(SH);
     652               0 :                 else if (Next_Letter == 'H')
     653 EUB             :                 {
     654 UIC           0 :                     Phonize(TH);
     655 UBC           0 :                     skip_letter++;
     656 EUB             :                 }
     657                 :                 else
     658 UIC           0 :                     Phonize('T');
     659 UBC           0 :                 break;
     660 EUB             :                 /* F */
     661 UIC           0 :             case 'V':
     662 UBC           0 :                 Phonize('F');
     663               0 :                 break;
     664 EUB             :                 /* W before a vowel, else dropped */
     665 UIC           0 :             case 'W':
     666 UBC           0 :                 if (isvowel(Next_Letter))
     667               0 :                     Phonize('W');
     668               0 :                 break;
     669 EUB             :                 /* KS */
     670 UIC           0 :             case 'X':
     671 UBC           0 :                 Phonize('K');
     672               0 :                 if (max_phonemes == 0 || Phone_Len < max_phonemes)
     673               0 :                     Phonize('S');
     674               0 :                 break;
     675 EUB             :                 /* Y if followed by a vowel */
     676 UIC           0 :             case 'Y':
     677 UBC           0 :                 if (isvowel(Next_Letter))
     678               0 :                     Phonize('Y');
     679               0 :                 break;
     680 EUB             :                 /* S */
     681 UIC           0 :             case 'Z':
     682 UBC           0 :                 Phonize('S');
     683               0 :                 break;
     684 EUB             :                 /* No transformation */
     685 GIC           1 :             case 'F':
     686 ECB             :             case 'J':
     687                 :             case 'L':
     688                 :             case 'M':
     689                 :             case 'N':
     690                 :             case 'R':
     691 GIC           1 :                 Phonize(Curr_Letter);
     692 CBC           1 :                 break;
     693               2 :             default:
     694 ECB             :                 /* nothing */
     695 GIC           2 :                 break;
     696 ECB             :         }                       /* END SWITCH */
     697                 : 
     698 GIC           5 :         w_idx += skip_letter;
     699 ECB             :     }                           /* END FOR */
     700                 : 
     701 GIC           1 :     End_Phoned_Word;
     702 ECB             : }                               /* END metaphone */
     703                 : 
     704                 : 
     705                 : /*
     706                 :  * SQL function: soundex(text) returns text
     707                 :  */
     708 GIC           3 : PG_FUNCTION_INFO_V1(soundex);
     709 ECB             : 
     710                 : Datum
     711 GIC           7 : soundex(PG_FUNCTION_ARGS)
     712 ECB             : {
     713                 :     char        outstr[SOUNDEX_LEN + 1];
     714                 :     char       *arg;
     715                 : 
     716 GIC           7 :     arg = text_to_cstring(PG_GETARG_TEXT_PP(0));
     717 ECB             : 
     718 GIC           7 :     _soundex(arg, outstr);
     719 ECB             : 
     720 GIC           7 :     PG_RETURN_TEXT_P(cstring_to_text(outstr));
     721 ECB             : }
     722                 : 
     723                 : static void
     724 GIC          13 : _soundex(const char *instr, char *outstr)
     725 ECB             : {
     726                 :     int         count;
     727                 : 
     728 GNC          13 :     Assert(instr);
     729              13 :     Assert(outstr);
     730 ECB             : 
     731 GIC          13 :     outstr[SOUNDEX_LEN] = '\0';
     732 ECB             : 
     733                 :     /* Skip leading non-alphabetic characters */
     734 GIC          13 :     while (!isalpha((unsigned char) instr[0]) && instr[0])
     735 LBC           0 :         ++instr;
     736 EUB             : 
     737                 :     /* No string left */
     738 GIC          13 :     if (!instr[0])
     739 ECB             :     {
     740 UIC           0 :         outstr[0] = (char) 0;
     741 UBC           0 :         return;
     742 EUB             :     }
     743                 : 
     744                 :     /* Take the first letter as is */
     745 GIC          13 :     *outstr++ = (char) toupper((unsigned char) *instr++);
     746 ECB             : 
     747 GIC          13 :     count = 1;
     748 CBC          60 :     while (*instr && count < SOUNDEX_LEN)
     749 ECB             :     {
     750 GIC          93 :         if (isalpha((unsigned char) *instr) &&
     751 CBC          46 :             soundex_code(*instr) != soundex_code(*(instr - 1)))
     752 ECB             :         {
     753 GIC          35 :             *outstr = soundex_code(instr[0]);
     754 CBC          35 :             if (*outstr != '0')
     755 ECB             :             {
     756 GIC          23 :                 ++outstr;
     757 CBC          23 :                 ++count;
     758 ECB             :             }
     759                 :         }
     760 GIC          47 :         ++instr;
     761 ECB             :     }
     762                 : 
     763                 :     /* Fill with 0's */
     764 GIC          29 :     while (count < SOUNDEX_LEN)
     765 ECB             :     {
     766 GIC          16 :         *outstr = '0';
     767 CBC          16 :         ++outstr;
     768              16 :         ++count;
     769 ECB             :     }
     770                 : }
     771                 : 
     772 GIC           2 : PG_FUNCTION_INFO_V1(difference);
     773 ECB             : 
     774                 : Datum
     775 GIC           3 : difference(PG_FUNCTION_ARGS)
     776 ECB             : {
     777                 :     char        sndx1[SOUNDEX_LEN + 1],
     778                 :                 sndx2[SOUNDEX_LEN + 1];
     779                 :     int         i,
     780                 :                 result;
     781                 : 
     782 GIC           3 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1);
     783 CBC           3 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2);
     784 ECB             : 
     785 GIC           3 :     result = 0;
     786 CBC          15 :     for (i = 0; i < SOUNDEX_LEN; i++)
     787 ECB             :     {
     788 GIC          12 :         if (sndx1[i] == sndx2[i])
     789 CBC           6 :             result++;
     790 ECB             :     }
     791                 : 
     792 GIC           3 :     PG_RETURN_INT32(result);
     793 ECB             : }
        

Generated by: LCOV version v1.16-55-g56c0a2a