LCOV - differential code coverage report
Current view: top level - contrib/fuzzystrmatch - fuzzystrmatch.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 44.1 % 279 123 9 67 80 1 84 2 36 75 77 1
Current Date: 2023-04-08 17:13:01 Functions: 90.0 % 20 18 2 18 2 18
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (120,180] days: 100.0 % 2 2 2
Legend: Lines: hit not hit (240..) days: 43.7 % 277 121 9 67 80 1 84 36 75 77
Function coverage date bins:
(240..) days: 45.0 % 40 18 2 18 2 18

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*
                                  2                 :  * fuzzystrmatch.c
                                  3                 :  *
                                  4                 :  * Functions for "fuzzy" comparison of strings
                                  5                 :  *
                                  6                 :  * Joe Conway <mail@joeconway.com>
                                  7                 :  *
                                  8                 :  * contrib/fuzzystrmatch/fuzzystrmatch.c
                                  9                 :  * Copyright (c) 2001-2023, PostgreSQL Global Development Group
                                 10                 :  * ALL RIGHTS RESERVED;
                                 11                 :  *
                                 12                 :  * metaphone()
                                 13                 :  * -----------
                                 14                 :  * Modified for PostgreSQL by Joe Conway.
                                 15                 :  * Based on CPAN's "Text-Metaphone-1.96" by Michael G Schwern <schwern@pobox.com>
                                 16                 :  * Code slightly modified for use as PostgreSQL function (palloc, elog, etc).
                                 17                 :  * Metaphone was originally created by Lawrence Philips and presented in an
                                 18                 :  * article in the December 1990 issue of "Computer Language".
                                 19                 :  *
                                 20                 :  * Permission to use, copy, modify, and distribute this software and its
                                 21                 :  * documentation for any purpose, without fee, and without a written agreement
                                 22                 :  * is hereby granted, provided that the above copyright notice and this
                                 23                 :  * paragraph and the following two paragraphs appear in all copies.
                                 24                 :  *
                                 25                 :  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR
                                 26                 :  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
                                 27                 :  * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
                                 28                 :  * DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE
                                 29                 :  * POSSIBILITY OF SUCH DAMAGE.
                                 30                 :  *
                                 31                 :  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
                                 32                 :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
                                 33                 :  * AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
                                 34                 :  * ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO
                                 35                 :  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
                                 36                 :  *
                                 37                 :  */
                                 38                 : 
                                 39                 : #include "postgres.h"
                                 40                 : 
                                 41                 : #include <ctype.h>
                                 42                 : 
                                 43                 : #include "mb/pg_wchar.h"
                                 44                 : #include "utils/builtins.h"
                                 45                 : #include "utils/varlena.h"
                                 46                 : #include "varatt.h"
                                 47                 : 
 6158 tgl                        48 GIC           2 : PG_MODULE_MAGIC;
 6158 tgl                        49 ECB             : 
                                 50                 : /*
                                 51                 :  * Soundex
                                 52                 :  */
                                 53                 : static void _soundex(const char *instr, char *outstr);
                                 54                 : 
                                 55                 : #define SOUNDEX_LEN 4
                                 56                 : 
                                 57                 : /*                                  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
                                 58                 : static const char *soundex_table = "01230120022455012623010202";
                                 59                 : 
                                 60                 : static char
 5115 tgl                        61 GIC         127 : soundex_code(char letter)
 5115 tgl                        62 ECB             : {
 5115 tgl                        63 GIC         127 :     letter = toupper((unsigned char) letter);
 5115 tgl                        64 ECB             :     /* Defend against non-ASCII letters */
 5115 tgl                        65 GIC         127 :     if (letter >= 'A' && letter <= 'Z')
 5115 tgl                        66 CBC         126 :         return soundex_table[letter - 'A'];
                                 67               1 :     return letter;
 5115 tgl                        68 ECB             : }
                                 69                 : 
                                 70                 : /*
                                 71                 :  * Metaphone
                                 72                 :  */
                                 73                 : #define MAX_METAPHONE_STRLEN        255
                                 74                 : 
                                 75                 : /*
                                 76                 :  * Original code by Michael G Schwern starts here.
                                 77                 :  * Code slightly modified for use as PostgreSQL function.
                                 78                 :  */
                                 79                 : 
                                 80                 : 
                                 81                 : /**************************************************************************
                                 82                 :     metaphone -- Breaks English phrases down into their phonemes.
                                 83                 : 
                                 84                 :     Input
                                 85                 :         word            --  An English word to be phonized
                                 86                 :         max_phonemes    --  How many phonemes to calculate.  If 0, then it
                                 87                 :                             will phonize the entire phrase.
                                 88                 :         phoned_word     --  The final phonized word.  (We'll allocate the
                                 89                 :                             memory.)
                                 90                 :     Output
                                 91                 :         error   --  A simple error flag, returns true or false
                                 92                 : 
                                 93                 :     NOTES:  ALL non-alpha characters are ignored, this includes whitespace,
                                 94                 :     although non-alpha characters will break up phonemes.
                                 95                 : ****************************************************************************/
                                 96                 : 
                                 97                 : 
                                 98                 : /*  I add modifications to the traditional metaphone algorithm that you
                                 99                 :     might find in books.  Define this if you want metaphone to behave
                                100                 :     traditionally */
                                101                 : #undef USE_TRADITIONAL_METAPHONE
                                102                 : 
                                103                 : /* Special encodings */
                                104                 : #define  SH     'X'
                                105                 : #define  TH     '0'
                                106                 : 
                                107                 : static char Lookahead(char *word, int how_far);
                                108                 : static void _metaphone(char *word, int max_phonemes, char **phoned_word);
                                109                 : 
                                110                 : /* Metachar.h ... little bits about characters for metaphone */
                                111                 : 
                                112                 : 
                                113                 : /*-- Character encoding array & accessing macros --*/
                                114                 : /* Stolen directly out of the book... */
                                115                 : static const char _codes[26] = {
                                116                 :     1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0, 2, 4, 4, 1, 0, 0, 0, 8, 0
                                117                 : /*  a  b c  d e f g  h i j k l m n o p q r s t u v w x y z */
                                118                 : };
                                119                 : 
                                120                 : static int
 5115 tgl                       121 GIC           1 : getcode(char c)
 5115 tgl                       122 ECB             : {
 5115 tgl                       123 GIC           1 :     if (isalpha((unsigned char) c))
 5115 tgl                       124 ECB             :     {
 5115 tgl                       125 GIC           1 :         c = toupper((unsigned char) c);
 5115 tgl                       126 ECB             :         /* Defend against non-ASCII letters */
 5115 tgl                       127 GIC           1 :         if (c >= 'A' && c <= 'Z')
 5115 tgl                       128 CBC           1 :             return _codes[c - 'A'];
 5115 tgl                       129 ECB             :     }
 5115 tgl                       130 UIC           0 :     return 0;
 5115 tgl                       131 EUB             : }
                                132                 : 
                                133                 : #define isvowel(c)  (getcode(c) & 1)    /* AEIOU */
                                134                 : 
                                135                 : /* These letters are passed through unchanged */
                                136                 : #define NOCHANGE(c) (getcode(c) & 2)    /* FJMNR */
                                137                 : 
                                138                 : /* These form diphthongs when preceding H */
                                139                 : #define AFFECTH(c)  (getcode(c) & 4)    /* CGPST */
                                140                 : 
                                141                 : /* These make C and G soft */
                                142                 : #define MAKESOFT(c) (getcode(c) & 8)    /* EIY */
                                143                 : 
                                144                 : /* These prevent GH from becoming F */
                                145                 : #define NOGHTOF(c)  (getcode(c) & 16)   /* BDH */
                                146                 : 
 5484 tgl                       147 GIC           2 : PG_FUNCTION_INFO_V1(levenshtein_with_costs);
 5484 tgl                       148 ECB             : Datum
 5484 tgl                       149 GIC           1 : levenshtein_with_costs(PG_FUNCTION_ARGS)
 5484 tgl                       150 ECB             : {
 4637 rhaas                     151 GIC           1 :     text       *src = PG_GETARG_TEXT_PP(0);
 4637 rhaas                     152 CBC           1 :     text       *dst = PG_GETARG_TEXT_PP(1);
 5050 bruce                     153               1 :     int         ins_c = PG_GETARG_INT32(2);
                                154               1 :     int         del_c = PG_GETARG_INT32(3);
                                155               1 :     int         sub_c = PG_GETARG_INT32(4);
 3069 rhaas                     156 ECB             :     const char *s_data;
                                157                 :     const char *t_data;
                                158                 :     int         s_bytes,
                                159                 :                 t_bytes;
                                160                 : 
                                161                 :     /* Extract a pointer to the actual character data */
 3069 rhaas                     162 GIC           1 :     s_data = VARDATA_ANY(src);
 3069 rhaas                     163 CBC           1 :     t_data = VARDATA_ANY(dst);
 2634 tgl                       164 ECB             :     /* Determine length of each string in bytes */
 3069 rhaas                     165 GIC           1 :     s_bytes = VARSIZE_ANY_EXHDR(src);
 3069 rhaas                     166 CBC           1 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
 3069 rhaas                     167 ECB             : 
 2634 tgl                       168 GIC           1 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
 2634 tgl                       169 ECB             :                                        ins_c, del_c, sub_c, false));
                                170                 : }
                                171                 : 
                                172                 : 
 5484 tgl                       173 GIC           2 : PG_FUNCTION_INFO_V1(levenshtein);
 5484 tgl                       174 ECB             : Datum
 5484 tgl                       175 GIC           1 : levenshtein(PG_FUNCTION_ARGS)
 5484 tgl                       176 ECB             : {
 4637 rhaas                     177 GIC           1 :     text       *src = PG_GETARG_TEXT_PP(0);
 4637 rhaas                     178 CBC           1 :     text       *dst = PG_GETARG_TEXT_PP(1);
 3069 rhaas                     179 ECB             :     const char *s_data;
                                180                 :     const char *t_data;
                                181                 :     int         s_bytes,
                                182                 :                 t_bytes;
                                183                 : 
                                184                 :     /* Extract a pointer to the actual character data */
 3069 rhaas                     185 GIC           1 :     s_data = VARDATA_ANY(src);
 3069 rhaas                     186 CBC           1 :     t_data = VARDATA_ANY(dst);
 2634 tgl                       187 ECB             :     /* Determine length of each string in bytes */
 3069 rhaas                     188 GIC           1 :     s_bytes = VARSIZE_ANY_EXHDR(src);
 3069 rhaas                     189 CBC           1 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
 3069 rhaas                     190 ECB             : 
 2634 tgl                       191 GIC           1 :     PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
 2634 tgl                       192 ECB             :                                        1, 1, 1, false));
                                193                 : }
                                194                 : 
                                195                 : 
 4555 rhaas                     196 GIC           1 : PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
 4555 rhaas                     197 ECB             : Datum
 4555 rhaas                     198 UIC           0 : levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
 4555 rhaas                     199 EUB             : {
 4555 rhaas                     200 UIC           0 :     text       *src = PG_GETARG_TEXT_PP(0);
 4555 rhaas                     201 UBC           0 :     text       *dst = PG_GETARG_TEXT_PP(1);
                                202               0 :     int         ins_c = PG_GETARG_INT32(2);
                                203               0 :     int         del_c = PG_GETARG_INT32(3);
                                204               0 :     int         sub_c = PG_GETARG_INT32(4);
                                205               0 :     int         max_d = PG_GETARG_INT32(5);
 3069 rhaas                     206 EUB             :     const char *s_data;
                                207                 :     const char *t_data;
                                208                 :     int         s_bytes,
                                209                 :                 t_bytes;
                                210                 : 
                                211                 :     /* Extract a pointer to the actual character data */
 3069 rhaas                     212 UIC           0 :     s_data = VARDATA_ANY(src);
 3069 rhaas                     213 UBC           0 :     t_data = VARDATA_ANY(dst);
 2634 tgl                       214 EUB             :     /* Determine length of each string in bytes */
 3069 rhaas                     215 UIC           0 :     s_bytes = VARSIZE_ANY_EXHDR(src);
 3069 rhaas                     216 UBC           0 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
 3069 rhaas                     217 EUB             : 
 2634 tgl                       218 UIC           0 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
 2634 tgl                       219 EUB             :                                                   t_data, t_bytes,
                                220                 :                                                   ins_c, del_c, sub_c,
                                221                 :                                                   max_d, false));
                                222                 : }
                                223                 : 
                                224                 : 
 4555 rhaas                     225 GIC           2 : PG_FUNCTION_INFO_V1(levenshtein_less_equal);
 4555 rhaas                     226 ECB             : Datum
 4555 rhaas                     227 GIC           2 : levenshtein_less_equal(PG_FUNCTION_ARGS)
 4555 rhaas                     228 ECB             : {
 4555 rhaas                     229 GIC           2 :     text       *src = PG_GETARG_TEXT_PP(0);
 4555 rhaas                     230 CBC           2 :     text       *dst = PG_GETARG_TEXT_PP(1);
                                231               2 :     int         max_d = PG_GETARG_INT32(2);
 3069 rhaas                     232 ECB             :     const char *s_data;
                                233                 :     const char *t_data;
                                234                 :     int         s_bytes,
                                235                 :                 t_bytes;
                                236                 : 
                                237                 :     /* Extract a pointer to the actual character data */
 3069 rhaas                     238 GIC           2 :     s_data = VARDATA_ANY(src);
 3069 rhaas                     239 CBC           2 :     t_data = VARDATA_ANY(dst);
 2634 tgl                       240 ECB             :     /* Determine length of each string in bytes */
 3069 rhaas                     241 GIC           2 :     s_bytes = VARSIZE_ANY_EXHDR(src);
 3069 rhaas                     242 CBC           2 :     t_bytes = VARSIZE_ANY_EXHDR(dst);
 3069 rhaas                     243 ECB             : 
 2634 tgl                       244 GIC           2 :     PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
 2634 tgl                       245 ECB             :                                                   t_data, t_bytes,
                                246                 :                                                   1, 1, 1,
                                247                 :                                                   max_d, false));
                                248                 : }
                                249                 : 
                                250                 : 
                                251                 : /*
                                252                 :  * Calculates the metaphone of an input string.
                                253                 :  * Returns number of characters requested
                                254                 :  * (suggested value is 4)
                                255                 :  */
 7915 bruce                     256 GIC           2 : PG_FUNCTION_INFO_V1(metaphone);
 7915 bruce                     257 ECB             : Datum
 7915 bruce                     258 GIC           1 : metaphone(PG_FUNCTION_ARGS)
 7915 bruce                     259 ECB             : {
 5493 tgl                       260 GIC           1 :     char       *str_i = TextDatumGetCString(PG_GETARG_DATUM(0));
 5493 tgl                       261 CBC           1 :     size_t      str_i_len = strlen(str_i);
 7915 bruce                     262 ECB             :     int         reqlen;
                                263                 :     char       *metaph;
                                264                 : 
                                265                 :     /* return an empty string if we receive one */
 6856 mail                      266 GIC           1 :     if (!(str_i_len > 0))
 5493 tgl                       267 LBC           0 :         PG_RETURN_TEXT_P(cstring_to_text(""));
 6856 mail                      268 EUB             : 
 7915 bruce                     269 GIC           1 :     if (str_i_len > MAX_METAPHONE_STRLEN)
 7199 tgl                       270 LBC           0 :         ereport(ERROR,
 7199 tgl                       271 EUB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                272                 :                  errmsg("argument exceeds the maximum length of %d bytes",
                                273                 :                         MAX_METAPHONE_STRLEN)));
                                274                 : 
 7915 bruce                     275 GIC           1 :     reqlen = PG_GETARG_INT32(1);
 7915 bruce                     276 CBC           1 :     if (reqlen > MAX_METAPHONE_STRLEN)
 7199 tgl                       277 LBC           0 :         ereport(ERROR,
 7199 tgl                       278 EUB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                279                 :                  errmsg("output exceeds the maximum length of %d bytes",
                                280                 :                         MAX_METAPHONE_STRLEN)));
                                281                 : 
 7915 bruce                     282 GIC           1 :     if (!(reqlen > 0))
 7199 tgl                       283 LBC           0 :         ereport(ERROR,
 7199 tgl                       284 EUB             :                 (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
                                285                 :                  errmsg("output cannot be empty string")));
                                286                 : 
 2061 peter_e                   287 GIC           1 :     _metaphone(str_i, reqlen, &metaph);
 2061 peter_e                   288 CBC           1 :     PG_RETURN_TEXT_P(cstring_to_text(metaph));
 7915 bruce                     289 ECB             : }
                                290                 : 
                                291                 : 
                                292                 : /*
                                293                 :  * Original code by Michael G Schwern starts here.
                                294                 :  * Code slightly modified for use as PostgreSQL
                                295                 :  * function (palloc, etc).
                                296                 :  */
                                297                 : 
                                298                 : /* I suppose I could have been using a character pointer instead of
                                299                 :  * accessing the array directly... */
                                300                 : 
                                301                 : /* Look at the next letter in the word */
                                302                 : #define Next_Letter (toupper((unsigned char) word[w_idx+1]))
                                303                 : /* Look at the current letter in the word */
                                304                 : #define Curr_Letter (toupper((unsigned char) word[w_idx]))
                                305                 : /* Go N letters back. */
                                306                 : #define Look_Back_Letter(n) \
                                307                 :     (w_idx >= (n) ? toupper((unsigned char) word[w_idx-(n)]) : '\0')
                                308                 : /* Previous letter.  I dunno, should this return null on failure? */
                                309                 : #define Prev_Letter (Look_Back_Letter(1))
                                310                 : /* Look two letters down.  It makes sure you don't walk off the string. */
                                311                 : #define After_Next_Letter \
                                312                 :     (Next_Letter != '\0' ? toupper((unsigned char) word[w_idx+2]) : '\0')
                                313                 : #define Look_Ahead_Letter(n) toupper((unsigned char) Lookahead(word+w_idx, n))
                                314                 : 
                                315                 : 
                                316                 : /* Allows us to safely look ahead an arbitrary # of letters */
                                317                 : /* I probably could have just used strlen... */
                                318                 : static char
 7836 bruce                     319 UIC           0 : Lookahead(char *word, int how_far)
 7836 bruce                     320 EUB             : {
 7836 bruce                     321 UIC           0 :     char        letter_ahead = '\0';    /* null by default */
 7836 bruce                     322 EUB             :     int         idx;
                                323                 : 
 7836 bruce                     324 UIC           0 :     for (idx = 0; word[idx] != '\0' && idx < how_far; idx++);
 7836 bruce                     325 EUB             :     /* Edge forward in the string... */
                                326                 : 
 6385 bruce                     327 UIC           0 :     letter_ahead = word[idx];   /* idx will be either == to how_far or at the
 6385 bruce                     328 EUB             :                                  * end of the string */
 7915 bruce                     329 UIC           0 :     return letter_ahead;
 7915 bruce                     330 EUB             : }
                                331                 : 
                                332                 : 
                                333                 : /* phonize one letter */
                                334                 : #define Phonize(c)  do {(*phoned_word)[p_idx++] = c;} while (0)
                                335                 : /* Slap a null character on the end of the phoned word */
                                336                 : #define End_Phoned_Word do {(*phoned_word)[p_idx] = '\0';} while (0)
                                337                 : /* How long is the phoned word? */
                                338                 : #define Phone_Len   (p_idx)
                                339                 : 
                                340                 : /* Note if a letter is a 'break' in the word */
                                341                 : #define Isbreak(c)  (!isalpha((unsigned char) (c)))
                                342                 : 
                                343                 : 
                                344                 : static void
 5050 bruce                     345 GIC           1 : _metaphone(char *word,          /* IN */
 7836 bruce                     346 ECB             :            int max_phonemes,
                                347                 :            char **phoned_word)  /* OUT */
                                348                 : {
 7836 bruce                     349 GIC           1 :     int         w_idx = 0;      /* point in the phonization we're at. */
 7836 bruce                     350 CBC           1 :     int         p_idx = 0;      /* end of the phoned phrase */
 7836 bruce                     351 ECB             : 
                                352                 :     /*-- Parameter checks --*/
                                353                 : 
                                354                 :     /*
                                355                 :      * Shouldn't be necessary, but left these here anyway jec Aug 3, 2001
                                356                 :      */
                                357                 : 
                                358                 :     /* Negative phoneme length is meaningless */
 7915 bruce                     359 GIC           1 :     if (!(max_phonemes > 0))
 7199 tgl                       360 ECB             :         /* internal error */
 7915 bruce                     361 UIC           0 :         elog(ERROR, "metaphone: Requested output length must be > 0");
 7915 bruce                     362 EUB             : 
                                363                 :     /* Empty/null string is meaningless */
 7915 bruce                     364 GIC           1 :     if ((word == NULL) || !(strlen(word) > 0))
 7199 tgl                       365 ECB             :         /* internal error */
 7915 bruce                     366 UIC           0 :         elog(ERROR, "metaphone: Input string length must be > 0");
 7836 bruce                     367 EUB             : 
                                368                 :     /*-- Allocate memory for our phoned_phrase --*/
 7836 bruce                     369 GIC           1 :     if (max_phonemes == 0)
 7836 bruce                     370 ECB             :     {                           /* Assume largest possible */
 2118 tgl                       371 UIC           0 :         *phoned_word = palloc(sizeof(char) * strlen(word) + 1);
 7836 bruce                     372 EUB             :     }
                                373                 :     else
                                374                 :     {
 7915 bruce                     375 GIC           1 :         *phoned_word = palloc(sizeof(char) * max_phonemes + 1);
 7915 bruce                     376 ECB             :     }
                                377                 : 
                                378                 :     /*-- The first phoneme has to be processed specially. --*/
                                379                 :     /* Find our first letter */
 7770 tgl                       380 GIC           1 :     for (; !isalpha((unsigned char) (Curr_Letter)); w_idx++)
 7836 bruce                     381 ECB             :     {
                                382                 :         /* On the off chance we were given nothing but crap... */
 7836 bruce                     383 UIC           0 :         if (Curr_Letter == '\0')
 7836 bruce                     384 EUB             :         {
 7832 bruce                     385 UIC           0 :             End_Phoned_Word;
 2061 peter_e                   386 UBC           0 :             return;
 7915 bruce                     387 EUB             :         }
                                388                 :     }
                                389                 : 
 7836 bruce                     390 GIC           1 :     switch (Curr_Letter)
 7836 bruce                     391 ECB             :     {
                                392                 :             /* AE becomes E */
 7915 bruce                     393 UIC           0 :         case 'A':
 7836 bruce                     394 UBC           0 :             if (Next_Letter == 'E')
 7836 bruce                     395 EUB             :             {
 7915 bruce                     396 UIC           0 :                 Phonize('E');
 7836 bruce                     397 UBC           0 :                 w_idx += 2;
 7915 bruce                     398 EUB             :             }
                                399                 :             /* Remember, preserve vowels at the beginning */
                                400                 :             else
                                401                 :             {
 7915 bruce                     402 UIC           0 :                 Phonize('A');
 7915 bruce                     403 UBC           0 :                 w_idx++;
 7915 bruce                     404 EUB             :             }
 7915 bruce                     405 UIC           0 :             break;
 7836 bruce                     406 EUB             :             /* [GKP]N becomes N */
 7915 bruce                     407 GIC           1 :         case 'G':
 7915 bruce                     408 ECB             :         case 'K':
                                409                 :         case 'P':
 7836 bruce                     410 GIC           1 :             if (Next_Letter == 'N')
 7836 bruce                     411 ECB             :             {
 7915 bruce                     412 UIC           0 :                 Phonize('N');
 7836 bruce                     413 UBC           0 :                 w_idx += 2;
 7915 bruce                     414 EUB             :             }
 7915 bruce                     415 GIC           1 :             break;
 7836 bruce                     416 ECB             : 
                                417                 :             /*
                                418                 :              * WH becomes H, WR becomes R W if followed by a vowel
                                419                 :              */
 7915 bruce                     420 UIC           0 :         case 'W':
 7836 bruce                     421 UBC           0 :             if (Next_Letter == 'H' ||
                                422               0 :                 Next_Letter == 'R')
 7915 bruce                     423 EUB             :             {
 7836 bruce                     424 UIC           0 :                 Phonize(Next_Letter);
 7836 bruce                     425 UBC           0 :                 w_idx += 2;
 7915 bruce                     426 EUB             :             }
 7836 bruce                     427 UIC           0 :             else if (isvowel(Next_Letter))
 7836 bruce                     428 EUB             :             {
 7836 bruce                     429 UIC           0 :                 Phonize('W');
 7836 bruce                     430 UBC           0 :                 w_idx += 2;
 7915 bruce                     431 EUB             :             }
                                432                 :             /* else ignore */
 7915 bruce                     433 UIC           0 :             break;
 7836 bruce                     434 EUB             :             /* X becomes S */
 7915 bruce                     435 UIC           0 :         case 'X':
 7915 bruce                     436 UBC           0 :             Phonize('S');
                                437               0 :             w_idx++;
                                438               0 :             break;
 7836 bruce                     439 EUB             :             /* Vowels are kept */
                                440                 : 
                                441                 :             /*
                                442                 :              * We did A already case 'A': case 'a':
                                443                 :              */
 7915 bruce                     444 UIC           0 :         case 'E':
 7915 bruce                     445 EUB             :         case 'I':
                                446                 :         case 'O':
                                447                 :         case 'U':
 7915 bruce                     448 UIC           0 :             Phonize(Curr_Letter);
 7915 bruce                     449 UBC           0 :             w_idx++;
                                450               0 :             break;
                                451               0 :         default:
 7915 bruce                     452 EUB             :             /* do nothing */
 7915 bruce                     453 UIC           0 :             break;
 7915 bruce                     454 EUB             :     }
                                455                 : 
                                456                 : 
                                457                 : 
                                458                 :     /* On to the metaphoning */
 7836 bruce                     459 GIC           6 :     for (; Curr_Letter != '\0' &&
 7836 bruce                     460 CBC           5 :          (max_phonemes == 0 || Phone_Len < max_phonemes);
                                461               5 :          w_idx++)
 7836 bruce                     462 ECB             :     {
                                463                 :         /*
                                464                 :          * How many letters to skip because an earlier encoding handled
                                465                 :          * multiple letters
                                466                 :          */
 7836 bruce                     467 GIC           5 :         unsigned short int skip_letter = 0;
 7836 bruce                     468 ECB             : 
                                469                 : 
                                470                 :         /*
                                471                 :          * THOUGHT:  It would be nice if, rather than having things like...
                                472                 :          * well, SCI.  For SCI you encode the S, then have to remember to skip
                                473                 :          * the C.  So the phonome SCI invades both S and C.  It would be
                                474                 :          * better, IMHO, to skip the C from the S part of the encoding. Hell,
                                475                 :          * I'm trying it.
                                476                 :          */
                                477                 : 
                                478                 :         /* Ignore non-alphas */
 7770 tgl                       479 GIC           5 :         if (!isalpha((unsigned char) (Curr_Letter)))
 7915 bruce                     480 LBC           0 :             continue;
 7836 bruce                     481 EUB             : 
                                482                 :         /* Drop duplicates, except CC */
 7836 bruce                     483 GIC           5 :         if (Curr_Letter == Prev_Letter &&
 7836 bruce                     484 LBC           0 :             Curr_Letter != 'C')
 7915 bruce                     485 UBC           0 :             continue;
 7836 bruce                     486 EUB             : 
 7836 bruce                     487 GIC           5 :         switch (Curr_Letter)
 7836 bruce                     488 ECB             :         {
                                489                 :                 /* B -> B unless in MB */
 7915 bruce                     490 GIC           1 :             case 'B':
 7836 bruce                     491 CBC           1 :                 if (Prev_Letter != 'M')
 7915 bruce                     492 LBC           0 :                     Phonize('B');
 7915 bruce                     493 GBC           1 :                 break;
 7836 bruce                     494 ECB             : 
                                495                 :                 /*
                                496                 :                  * 'sh' if -CIA- or -CH, but not SCH, except SCHW. (SCHW is
                                497                 :                  * handled in S) S if -CI-, -CE- or -CY- dropped if -SCI-,
                                498                 :                  * SCE-, -SCY- (handed in S) else K
                                499                 :                  */
 7915 bruce                     500 UIC           0 :             case 'C':
 7836 bruce                     501 UBC           0 :                 if (MAKESOFT(Next_Letter))
 7836 bruce                     502 EUB             :                 {               /* C[IEY] */
 7836 bruce                     503 UIC           0 :                     if (After_Next_Letter == 'A' &&
 7836 bruce                     504 UBC           0 :                         Next_Letter == 'I')
 7836 bruce                     505 EUB             :                     {           /* CIA */
 7915 bruce                     506 UIC           0 :                         Phonize(SH);
 7915 bruce                     507 EUB             :                     }
                                508                 :                     /* SC[IEY] */
 7836 bruce                     509 UIC           0 :                     else if (Prev_Letter == 'S')
 7836 bruce                     510 EUB             :                     {
                                511                 :                         /* Dropped */
                                512                 :                     }
                                513                 :                     else
 7836 bruce                     514 UIC           0 :                         Phonize('S');
 7915 bruce                     515 EUB             :                 }
 7836 bruce                     516 UIC           0 :                 else if (Next_Letter == 'H')
 7836 bruce                     517 EUB             :                 {
                                518                 : #ifndef USE_TRADITIONAL_METAPHONE
 7836 bruce                     519 UIC           0 :                     if (After_Next_Letter == 'R' ||
 7836 bruce                     520 UBC           0 :                         Prev_Letter == 'S')
 7836 bruce                     521 EUB             :                     {           /* Christ, School */
 7915 bruce                     522 UIC           0 :                         Phonize('K');
 7915 bruce                     523 EUB             :                     }
                                524                 :                     else
 7915 bruce                     525 UIC           0 :                         Phonize(SH);
 7915 bruce                     526 EUB             : #else
                                527                 :                     Phonize(SH);
                                528                 : #endif
 7915 bruce                     529 UIC           0 :                     skip_letter++;
 7915 bruce                     530 EUB             :                 }
                                531                 :                 else
 7915 bruce                     532 UIC           0 :                     Phonize('K');
 7915 bruce                     533 UBC           0 :                 break;
 7836 bruce                     534 EUB             : 
                                535                 :                 /*
                                536                 :                  * J if in -DGE-, -DGI- or -DGY- else T
                                537                 :                  */
 7915 bruce                     538 UIC           0 :             case 'D':
 7836 bruce                     539 UBC           0 :                 if (Next_Letter == 'G' &&
                                540               0 :                     MAKESOFT(After_Next_Letter))
 7836 bruce                     541 EUB             :                 {
 7915 bruce                     542 UIC           0 :                     Phonize('J');
 7915 bruce                     543 UBC           0 :                     skip_letter++;
 7915 bruce                     544 EUB             :                 }
                                545                 :                 else
 7915 bruce                     546 UIC           0 :                     Phonize('T');
 7915 bruce                     547 UBC           0 :                 break;
 7836 bruce                     548 EUB             : 
                                549                 :                 /*
                                550                 :                  * F if in -GH and not B--GH, D--GH, -H--GH, -H---GH else
                                551                 :                  * dropped if -GNED, -GN, else dropped if -DGE-, -DGI- or
                                552                 :                  * -DGY- (handled in D) else J if in -GE-, -GI, -GY and not GG
                                553                 :                  * else K
                                554                 :                  */
 7915 bruce                     555 GIC           1 :             case 'G':
 7836 bruce                     556 CBC           1 :                 if (Next_Letter == 'H')
 7836 bruce                     557 ECB             :                 {
 7836 bruce                     558 UIC           0 :                     if (!(NOGHTOF(Look_Back_Letter(3)) ||
 7836 bruce                     559 UBC           0 :                           Look_Back_Letter(4) == 'H'))
 7836 bruce                     560 EUB             :                     {
 7915 bruce                     561 UIC           0 :                         Phonize('F');
 7915 bruce                     562 UBC           0 :                         skip_letter++;
 7915 bruce                     563 EUB             :                     }
                                564                 :                     else
                                565                 :                     {
                                566                 :                         /* silent */
                                567                 :                     }
                                568                 :                 }
 7836 bruce                     569 GIC           1 :                 else if (Next_Letter == 'N')
 7836 bruce                     570 ECB             :                 {
 7836 bruce                     571 UIC           0 :                     if (Isbreak(After_Next_Letter) ||
 7836 bruce                     572 UBC           0 :                         (After_Next_Letter == 'E' &&
                                573               0 :                          Look_Ahead_Letter(3) == 'D'))
 7836 bruce                     574 EUB             :                     {
                                575                 :                         /* dropped */
                                576                 :                     }
                                577                 :                     else
 7915 bruce                     578 UIC           0 :                         Phonize('K');
 7915 bruce                     579 EUB             :                 }
 7836 bruce                     580 GIC           1 :                 else if (MAKESOFT(Next_Letter) &&
 7836 bruce                     581 LBC           0 :                          Prev_Letter != 'G')
 7915 bruce                     582 UBC           0 :                     Phonize('J');
 7836 bruce                     583 EUB             :                 else
 7915 bruce                     584 GIC           1 :                     Phonize('K');
 7915 bruce                     585 CBC           1 :                 break;
 7836 bruce                     586 ECB             :                 /* H if before a vowel and not after C,G,P,S,T */
 7915 bruce                     587 UIC           0 :             case 'H':
 7836 bruce                     588 UBC           0 :                 if (isvowel(Next_Letter) &&
                                589               0 :                     !AFFECTH(Prev_Letter))
 7915                           590               0 :                     Phonize('H');
                                591               0 :                 break;
 7836 bruce                     592 EUB             : 
                                593                 :                 /*
                                594                 :                  * dropped if after C else K
                                595                 :                  */
 7915 bruce                     596 UIC           0 :             case 'K':
 7836 bruce                     597 UBC           0 :                 if (Prev_Letter != 'C')
 7915                           598               0 :                     Phonize('K');
                                599               0 :                 break;
 7836 bruce                     600 EUB             : 
                                601                 :                 /*
                                602                 :                  * F if before H else P
                                603                 :                  */
 7915 bruce                     604 UIC           0 :             case 'P':
 7836 bruce                     605 UBC           0 :                 if (Next_Letter == 'H')
 7915                           606               0 :                     Phonize('F');
 7836 bruce                     607 EUB             :                 else
 7915 bruce                     608 UIC           0 :                     Phonize('P');
 7915 bruce                     609 UBC           0 :                 break;
 7836 bruce                     610 EUB             : 
                                611                 :                 /*
                                612                 :                  * K
                                613                 :                  */
 7915 bruce                     614 UIC           0 :             case 'Q':
 7915 bruce                     615 UBC           0 :                 Phonize('K');
                                616               0 :                 break;
 7836 bruce                     617 EUB             : 
                                618                 :                 /*
                                619                 :                  * 'sh' in -SH-, -SIO- or -SIA- or -SCHW- else S
                                620                 :                  */
 7915 bruce                     621 UIC           0 :             case 'S':
 7836 bruce                     622 UBC           0 :                 if (Next_Letter == 'I' &&
                                623               0 :                     (After_Next_Letter == 'O' ||
                                624               0 :                      After_Next_Letter == 'A'))
 7915                           625               0 :                     Phonize(SH);
 7836                           626               0 :                 else if (Next_Letter == 'H')
 7836 bruce                     627 EUB             :                 {
 7915 bruce                     628 UIC           0 :                     Phonize(SH);
 7915 bruce                     629 UBC           0 :                     skip_letter++;
 7915 bruce                     630 EUB             :                 }
                                631                 : #ifndef USE_TRADITIONAL_METAPHONE
 7836 bruce                     632 UIC           0 :                 else if (Next_Letter == 'C' &&
 7836 bruce                     633 UBC           0 :                          Look_Ahead_Letter(2) == 'H' &&
                                634               0 :                          Look_Ahead_Letter(3) == 'W')
 7836 bruce                     635 EUB             :                 {
 7915 bruce                     636 UIC           0 :                     Phonize(SH);
 7915 bruce                     637 UBC           0 :                     skip_letter += 2;
 7915 bruce                     638 EUB             :                 }
                                639                 : #endif
                                640                 :                 else
 7915 bruce                     641 UIC           0 :                     Phonize('S');
 7915 bruce                     642 UBC           0 :                 break;
 7836 bruce                     643 EUB             : 
                                644                 :                 /*
                                645                 :                  * 'sh' in -TIA- or -TIO- else 'th' before H else T
                                646                 :                  */
 7915 bruce                     647 UIC           0 :             case 'T':
 7836 bruce                     648 UBC           0 :                 if (Next_Letter == 'I' &&
                                649               0 :                     (After_Next_Letter == 'O' ||
                                650               0 :                      After_Next_Letter == 'A'))
 7915                           651               0 :                     Phonize(SH);
 7836                           652               0 :                 else if (Next_Letter == 'H')
 7836 bruce                     653 EUB             :                 {
 7915 bruce                     654 UIC           0 :                     Phonize(TH);
 7915 bruce                     655 UBC           0 :                     skip_letter++;
 7915 bruce                     656 EUB             :                 }
                                657                 :                 else
 7915 bruce                     658 UIC           0 :                     Phonize('T');
 7915 bruce                     659 UBC           0 :                 break;
 7836 bruce                     660 EUB             :                 /* F */
 7915 bruce                     661 UIC           0 :             case 'V':
 7915 bruce                     662 UBC           0 :                 Phonize('F');
                                663               0 :                 break;
 7836 bruce                     664 EUB             :                 /* W before a vowel, else dropped */
 7915 bruce                     665 UIC           0 :             case 'W':
 7836 bruce                     666 UBC           0 :                 if (isvowel(Next_Letter))
 7915                           667               0 :                     Phonize('W');
                                668               0 :                 break;
 7836 bruce                     669 EUB             :                 /* KS */
 7915 bruce                     670 UIC           0 :             case 'X':
 7915 bruce                     671 UBC           0 :                 Phonize('K');
 7229                           672               0 :                 if (max_phonemes == 0 || Phone_Len < max_phonemes)
                                673               0 :                     Phonize('S');
 7915                           674               0 :                 break;
 7836 bruce                     675 EUB             :                 /* Y if followed by a vowel */
 7915 bruce                     676 UIC           0 :             case 'Y':
 7836 bruce                     677 UBC           0 :                 if (isvowel(Next_Letter))
 7915                           678               0 :                     Phonize('Y');
                                679               0 :                 break;
 7836 bruce                     680 EUB             :                 /* S */
 7915 bruce                     681 UIC           0 :             case 'Z':
 7915 bruce                     682 UBC           0 :                 Phonize('S');
                                683               0 :                 break;
 7836 bruce                     684 EUB             :                 /* No transformation */
 7915 bruce                     685 GIC           1 :             case 'F':
 7915 bruce                     686 ECB             :             case 'J':
                                687                 :             case 'L':
                                688                 :             case 'M':
                                689                 :             case 'N':
                                690                 :             case 'R':
 7915 bruce                     691 GIC           1 :                 Phonize(Curr_Letter);
 7915 bruce                     692 CBC           1 :                 break;
                                693               2 :             default:
 7915 bruce                     694 ECB             :                 /* nothing */
 7915 bruce                     695 GIC           2 :                 break;
 7836 bruce                     696 ECB             :         }                       /* END SWITCH */
                                697                 : 
 7915 bruce                     698 GIC           5 :         w_idx += skip_letter;
 7836 bruce                     699 ECB             :     }                           /* END FOR */
                                700                 : 
 7915 bruce                     701 GIC           1 :     End_Phoned_Word;
 2118 tgl                       702 ECB             : }                               /* END metaphone */
                                703                 : 
                                704                 : 
                                705                 : /*
                                706                 :  * SQL function: soundex(text) returns text
                                707                 :  */
 7915 bruce                     708 GIC           3 : PG_FUNCTION_INFO_V1(soundex);
 7915 bruce                     709 ECB             : 
                                710                 : Datum
 7915 bruce                     711 GIC           7 : soundex(PG_FUNCTION_ARGS)
 7915 bruce                     712 ECB             : {
                                713                 :     char        outstr[SOUNDEX_LEN + 1];
                                714                 :     char       *arg;
                                715                 : 
 2219 noah                      716 GIC           7 :     arg = text_to_cstring(PG_GETARG_TEXT_PP(0));
 7915 bruce                     717 ECB             : 
 7915 bruce                     718 GIC           7 :     _soundex(arg, outstr);
 7915 bruce                     719 ECB             : 
 5493 tgl                       720 GIC           7 :     PG_RETURN_TEXT_P(cstring_to_text(outstr));
 7915 bruce                     721 ECB             : }
                                722                 : 
                                723                 : static void
 7915 bruce                     724 GIC          13 : _soundex(const char *instr, char *outstr)
 7915 bruce                     725 ECB             : {
                                726                 :     int         count;
                                727                 : 
  163 peter                     728 GNC          13 :     Assert(instr);
                                729              13 :     Assert(outstr);
 7915 bruce                     730 ECB             : 
 7915 bruce                     731 GIC          13 :     outstr[SOUNDEX_LEN] = '\0';
 7915 bruce                     732 ECB             : 
                                733                 :     /* Skip leading non-alphabetic characters */
 7915 bruce                     734 GIC          13 :     while (!isalpha((unsigned char) instr[0]) && instr[0])
 7915 bruce                     735 LBC           0 :         ++instr;
 7915 bruce                     736 EUB             : 
                                737                 :     /* No string left */
 7915 bruce                     738 GIC          13 :     if (!instr[0])
 7915 bruce                     739 ECB             :     {
 7915 bruce                     740 UIC           0 :         outstr[0] = (char) 0;
 7915 bruce                     741 UBC           0 :         return;
 7915 bruce                     742 EUB             :     }
                                743                 : 
                                744                 :     /* Take the first letter as is */
 7915 bruce                     745 GIC          13 :     *outstr++ = (char) toupper((unsigned char) *instr++);
 7915 bruce                     746 ECB             : 
 7915 bruce                     747 GIC          13 :     count = 1;
 7915 bruce                     748 CBC          60 :     while (*instr && count < SOUNDEX_LEN)
 7915 bruce                     749 ECB             :     {
 7915 bruce                     750 GIC          93 :         if (isalpha((unsigned char) *instr) &&
 7915 bruce                     751 CBC          46 :             soundex_code(*instr) != soundex_code(*(instr - 1)))
 7915 bruce                     752 ECB             :         {
 7915 bruce                     753 GIC          35 :             *outstr = soundex_code(instr[0]);
 7915 bruce                     754 CBC          35 :             if (*outstr != '0')
 7915 bruce                     755 ECB             :             {
 7915 bruce                     756 GIC          23 :                 ++outstr;
 7915 bruce                     757 CBC          23 :                 ++count;
 7915 bruce                     758 ECB             :             }
                                759                 :         }
 7915 bruce                     760 GIC          47 :         ++instr;
 7915 bruce                     761 ECB             :     }
                                762                 : 
                                763                 :     /* Fill with 0's */
 7915 bruce                     764 GIC          29 :     while (count < SOUNDEX_LEN)
 7915 bruce                     765 ECB             :     {
 7915 bruce                     766 GIC          16 :         *outstr = '0';
 7915 bruce                     767 CBC          16 :         ++outstr;
                                768              16 :         ++count;
 7915 bruce                     769 ECB             :     }
                                770                 : }
                                771                 : 
 6647 neilc                     772 GIC           2 : PG_FUNCTION_INFO_V1(difference);
 6647 neilc                     773 ECB             : 
                                774                 : Datum
 6647 neilc                     775 GIC           3 : difference(PG_FUNCTION_ARGS)
 6647 neilc                     776 ECB             : {
                                777                 :     char        sndx1[SOUNDEX_LEN + 1],
                                778                 :                 sndx2[SOUNDEX_LEN + 1];
                                779                 :     int         i,
                                780                 :                 result;
                                781                 : 
 2219 noah                      782 GIC           3 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(0)), sndx1);
 2219 noah                      783 CBC           3 :     _soundex(text_to_cstring(PG_GETARG_TEXT_PP(1)), sndx2);
 6647 neilc                     784 ECB             : 
 6647 neilc                     785 GIC           3 :     result = 0;
 6385 bruce                     786 CBC          15 :     for (i = 0; i < SOUNDEX_LEN; i++)
 6385 bruce                     787 ECB             :     {
 6647 neilc                     788 GIC          12 :         if (sndx1[i] == sndx2[i])
 6647 neilc                     789 CBC           6 :             result++;
 6647 neilc                     790 ECB             :     }
                                791                 : 
 6647 neilc                     792 GIC           3 :     PG_RETURN_INT32(result);
 6647 neilc                     793 ECB             : }
        

Generated by: LCOV version v1.16-55-g56c0a2a