LCOV - differential code coverage report
Current view: top level - src/backend/utils/adt - levenshtein.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 77.7 % 103 80 2 7 14 4 13 1 62 5 15 1
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 2 2 2
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * levenshtein.c
       4                 :  *    Levenshtein distance implementation.
       5                 :  *
       6                 :  * Original author:  Joe Conway <mail@joeconway.com>
       7                 :  *
       8                 :  * This file is included by varlena.c twice, to provide matching code for (1)
       9                 :  * Levenshtein distance with custom costings, and (2) Levenshtein distance with
      10                 :  * custom costings and a "max" value above which exact distances are not
      11                 :  * interesting.  Before the inclusion, we rely on the presence of the inline
      12                 :  * function rest_of_char_same().
      13                 :  *
      14                 :  * Written based on a description of the algorithm by Michael Gilleland found
      15                 :  * at http://www.merriampark.com/ld.htm.  Also looked at levenshtein.c in the
      16                 :  * PHP 4.0.6 distribution for inspiration.  Configurable penalty costs
      17                 :  * extension is introduced by Volkan YAZICI <volkan.yazici@gmail.com.
      18                 :  *
      19                 :  * Copyright (c) 2001-2023, PostgreSQL Global Development Group
      20                 :  *
      21                 :  * IDENTIFICATION
      22                 :  *  src/backend/utils/adt/levenshtein.c
      23                 :  *
      24                 :  *-------------------------------------------------------------------------
      25                 :  */
      26                 : #define MAX_LEVENSHTEIN_STRLEN      255
      27                 : 
      28                 : /*
      29                 :  * Calculates Levenshtein distance metric between supplied strings, which are
      30                 :  * not necessarily null-terminated.
      31                 :  *
      32                 :  * source: source string, of length slen bytes.
      33                 :  * target: target string, of length tlen bytes.
      34                 :  * ins_c, del_c, sub_c: costs to charge for character insertion, deletion,
      35                 :  *      and substitution respectively; (1, 1, 1) costs suffice for common
      36                 :  *      cases, but your mileage may vary.
      37                 :  * max_d: if provided and >= 0, maximum distance we care about; see below.
      38                 :  * trusted: caller is trusted and need not obey MAX_LEVENSHTEIN_STRLEN.
      39                 :  *
      40                 :  * One way to compute Levenshtein distance is to incrementally construct
      41                 :  * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
      42                 :  * of operations required to transform the first i characters of s into
      43                 :  * the first j characters of t.  The last column of the final row is the
      44                 :  * answer.
      45                 :  *
      46                 :  * We use that algorithm here with some modification.  In lieu of holding
      47                 :  * the entire array in memory at once, we'll just use two arrays of size
      48                 :  * m+1 for storing accumulated values. At each step one array represents
      49                 :  * the "previous" row and one is the "current" row of the notional large
      50                 :  * array.
      51                 :  *
      52                 :  * If max_d >= 0, we only need to provide an accurate answer when that answer
      53                 :  * is less than or equal to max_d.  From any cell in the matrix, there is
      54                 :  * theoretical "minimum residual distance" from that cell to the last column
      55                 :  * of the final row.  This minimum residual distance is zero when the
      56                 :  * untransformed portions of the strings are of equal length (because we might
      57                 :  * get lucky and find all the remaining characters matching) and is otherwise
      58                 :  * based on the minimum number of insertions or deletions needed to make them
      59                 :  * equal length.  The residual distance grows as we move toward the upper
      60                 :  * right or lower left corners of the matrix.  When the max_d bound is
      61                 :  * usefully tight, we can use this property to avoid computing the entirety
      62                 :  * of each row; instead, we maintain a start_column and stop_column that
      63                 :  * identify the portion of the matrix close to the diagonal which can still
      64                 :  * affect the final answer.
      65                 :  */
      66                 : int
      67                 : #ifdef LEVENSHTEIN_LESS_EQUAL
      68 CBC        1223 : varstr_levenshtein_less_equal(const char *source, int slen,
      69                 :                               const char *target, int tlen,
      70                 :                               int ins_c, int del_c, int sub_c,
      71                 :                               int max_d, bool trusted)
      72                 : #else
      73               2 : varstr_levenshtein(const char *source, int slen,
      74                 :                    const char *target, int tlen,
      75                 :                    int ins_c, int del_c, int sub_c,
      76                 :                    bool trusted)
      77                 : #endif
      78                 : {
      79                 :     int         m,
      80                 :                 n;
      81                 :     int        *prev;
      82                 :     int        *curr;
      83            1225 :     int        *s_char_len = NULL;
      84                 :     int         j;
      85                 :     const char *y;
      86                 : 
      87                 :     /*
      88                 :      * For varstr_levenshtein_less_equal, we have real variables called
      89                 :      * start_column and stop_column; otherwise it's just short-hand for 0 and
      90                 :      * m.
      91                 :      */
      92                 : #ifdef LEVENSHTEIN_LESS_EQUAL
      93                 :     int         start_column,
      94                 :                 stop_column;
      95                 : 
      96                 : #undef START_COLUMN
      97                 : #undef STOP_COLUMN
      98                 : #define START_COLUMN start_column
      99                 : #define STOP_COLUMN stop_column
     100                 : #else
     101                 : #undef START_COLUMN
     102                 : #undef STOP_COLUMN
     103                 : #define START_COLUMN 0
     104                 : #define STOP_COLUMN m
     105                 : #endif
     106                 : 
     107 ECB             :     /* Convert string lengths (in bytes) to lengths in characters */
     108 CBC        1225 :     m = pg_mbstrlen_with_len(source, slen);
     109 GIC        1225 :     n = pg_mbstrlen_with_len(target, tlen);
     110                 : 
     111                 :     /*
     112                 :      * We can transform an empty s into t with n insertions, or a non-empty t
     113                 :      * into an empty s with m deletions.
     114 ECB             :      */
     115 GBC        1225 :     if (!m)
     116 LBC           0 :         return n * ins_c;
     117 GBC        1225 :     if (!n)
     118 UIC           0 :         return m * del_c;
     119                 : 
     120                 :     /*
     121                 :      * For security concerns, restrict excessive CPU+RAM usage. (This
     122                 :      * implementation uses O(m) memory and has O(mn) complexity.)  If
     123                 :      * "trusted" is true, caller is responsible for not making excessive
     124                 :      * requests, typically by using a small max_d along with strings that are
     125                 :      * bounded, though not necessarily to MAX_LEVENSHTEIN_STRLEN exactly.
     126 ECB             :      */
     127 CBC        1225 :     if (!trusted &&
     128 GIC           4 :         (m > MAX_LEVENSHTEIN_STRLEN ||
     129 EUB             :          n > MAX_LEVENSHTEIN_STRLEN))
     130 UIC           0 :         ereport(ERROR,
     131                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     132                 :                  errmsg("levenshtein argument exceeds maximum length of %d characters",
     133                 :                         MAX_LEVENSHTEIN_STRLEN)));
     134                 : 
     135                 : #ifdef LEVENSHTEIN_LESS_EQUAL
     136 ECB             :     /* Initialize start and stop columns. */
     137 CBC        1223 :     start_column = 0;
     138 GIC        1223 :     stop_column = m + 1;
     139                 : 
     140                 :     /*
     141                 :      * If max_d >= 0, determine whether the bound is impossibly tight.  If so,
     142                 :      * return max_d + 1 immediately.  Otherwise, determine whether it's tight
     143                 :      * enough to limit the computation we must perform.  If so, figure out
     144                 :      * initial stop column.
     145 ECB             :      */
     146 GIC        1223 :     if (max_d >= 0)
     147                 :     {
     148                 :         int         min_theo_d; /* Theoretical minimum distance. */
     149 ECB             :         int         max_theo_d; /* Theoretical maximum distance. */
     150 GIC        1223 :         int         net_inserts = n - m;
     151 ECB             : 
     152 CBC        1223 :         min_theo_d = net_inserts < 0 ?
     153            1223 :             -net_inserts * del_c : net_inserts * ins_c;
     154            1223 :         if (min_theo_d > max_d)
     155             415 :             return max_d + 1;
     156 GBC         808 :         if (ins_c + del_c < sub_c)
     157 LBC           0 :             sub_c = ins_c + del_c;
     158 CBC         808 :         max_theo_d = min_theo_d + sub_c * Min(m, n);
     159             808 :         if (max_d >= max_theo_d)
     160             252 :             max_d = -1;
     161 GIC         556 :         else if (ins_c + del_c > 0)
     162                 :         {
     163                 :             /*
     164                 :              * Figure out how much of the first row of the notional matrix we
     165                 :              * need to fill in.  If the string is growing, the theoretical
     166                 :              * minimum distance already incorporates the cost of deleting the
     167                 :              * number of characters necessary to make the two strings equal in
     168                 :              * length.  Each additional deletion forces another insertion, so
     169                 :              * the best-case total cost increases by ins_c + del_c. If the
     170                 :              * string is shrinking, the minimum theoretical cost assumes no
     171                 :              * excess deletions; that is, we're starting no further right than
     172                 :              * column n - m.  If we do start further right, the best-case
     173                 :              * total cost increases by ins_c + del_c for each move right.
     174 ECB             :              */
     175 CBC         556 :             int         slack_d = max_d - min_theo_d;
     176 GIC         556 :             int         best_column = net_inserts < 0 ? -net_inserts : 0;
     177 ECB             : 
     178 CBC         556 :             stop_column = best_column + (slack_d / (ins_c + del_c)) + 1;
     179 GBC         556 :             if (stop_column > m)
     180 UIC           0 :                 stop_column = m + 1;
     181                 :         }
     182                 :     }
     183                 : #endif
     184                 : 
     185                 :     /*
     186                 :      * In order to avoid calling pg_mblen() repeatedly on each character in s,
     187                 :      * we cache all the lengths before starting the main loop -- but if all
     188                 :      * the characters in both strings are single byte, then we skip this and
     189                 :      * use a fast-path in the main loop.  If only one string contains
     190                 :      * multi-byte characters, we still build the array, so that the fast-path
     191                 :      * needn't deal with the case where the array hasn't been initialized.
     192 ECB             :      */
     193 GIC         810 :     if (m != slen || n != tlen)
     194                 :     {
     195 EUB             :         int         i;
     196 UIC           0 :         const char *cp = source;
     197 EUB             : 
     198 UBC           0 :         s_char_len = (int *) palloc((m + 1) * sizeof(int));
     199 UIC           0 :         for (i = 0; i < m; ++i)
     200 EUB             :         {
     201 UBC           0 :             s_char_len[i] = pg_mblen(cp);
     202 UIC           0 :             cp += s_char_len[i];
     203 EUB             :         }
     204 UIC           0 :         s_char_len[i] = 0;
     205                 :     }
     206                 : 
     207 ECB             :     /* One more cell for initialization column and row. */
     208 CBC         810 :     ++m;
     209 GIC         810 :     ++n;
     210                 : 
     211 ECB             :     /* Previous and current rows of notional array. */
     212 CBC         810 :     prev = (int *) palloc(2 * m * sizeof(int));
     213 GIC         810 :     curr = prev + m;
     214                 : 
     215                 :     /*
     216                 :      * To transform the first i characters of s into the first 0 characters of
     217                 :      * t, we must perform i deletions.
     218 ECB             :      */
     219 GNC        3176 :     for (int i = START_COLUMN; i < STOP_COLUMN; i++)
     220 GIC        2366 :         prev[i] = i * del_c;
     221                 : 
     222 ECB             :     /* Loop through rows of the notional array */
     223 GIC        3153 :     for (y = target, j = 1; j < n; j++)
     224                 :     {
     225 ECB             :         int        *temp;
     226 CBC        2804 :         const char *x = source;
     227 GIC        2804 :         int         y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
     228                 :         int         i;
     229                 : 
     230                 : #ifdef LEVENSHTEIN_LESS_EQUAL
     231                 : 
     232                 :         /*
     233                 :          * In the best case, values percolate down the diagonal unchanged, so
     234                 :          * we must increment stop_column unless it's already on the right end
     235                 :          * of the array.  The inner loop will read prev[stop_column], so we
     236                 :          * have to initialize it even though it shouldn't affect the result.
     237                 :          */
     238 CBC        2792 :         if (stop_column < m)
     239                 :         {
     240            2206 :             prev[stop_column] = max_d + 1;
     241            2206 :             ++stop_column;
     242                 :         }
     243                 : 
     244                 :         /*
     245                 :          * The main loop fills in curr, but curr[0] needs a special case: to
     246                 :          * transform the first 0 characters of s into the first j characters
     247                 :          * of t, we must perform j insertions.  However, if start_column > 0,
     248                 :          * this special case does not apply.
     249                 :          */
     250            2792 :         if (start_column == 0)
     251                 :         {
     252            1763 :             curr[0] = j * ins_c;
     253            1763 :             i = 1;
     254                 :         }
     255                 :         else
     256            1029 :             i = start_column;
     257                 : #else
     258              12 :         curr[0] = j * ins_c;
     259              12 :         i = 1;
     260                 : #endif
     261                 : 
     262                 :         /*
     263                 :          * This inner loop is critical to performance, so we include a
     264                 :          * fast-path to handle the (fairly common) case where no multibyte
     265                 :          * characters are in the mix.  The fast-path is entitled to assume
     266                 :          * that if s_char_len is not initialized then BOTH strings contain
     267                 :          * only single-byte characters.
     268                 :          */
     269            2804 :         if (s_char_len != NULL)
     270                 :         {
     271 UBC           0 :             for (; i < STOP_COLUMN; i++)
     272                 :             {
     273                 :                 int         ins;
     274                 :                 int         del;
     275                 :                 int         sub;
     276               0 :                 int         x_char_len = s_char_len[i - 1];
     277                 : 
     278                 :                 /*
     279                 :                  * Calculate costs for insertion, deletion, and substitution.
     280                 :                  *
     281                 :                  * When calculating cost for substitution, we compare the last
     282                 :                  * character of each possibly-multibyte character first,
     283                 :                  * because that's enough to rule out most mis-matches.  If we
     284                 :                  * get past that test, then we compare the lengths and the
     285                 :                  * remaining bytes.
     286                 :                  */
     287               0 :                 ins = prev[i] + ins_c;
     288               0 :                 del = curr[i - 1] + del_c;
     289               0 :                 if (x[x_char_len - 1] == y[y_char_len - 1]
     290               0 :                     && x_char_len == y_char_len &&
     291               0 :                     (x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
     292               0 :                     sub = prev[i - 1];
     293                 :                 else
     294               0 :                     sub = prev[i - 1] + sub_c;
     295                 : 
     296                 :                 /* Take the one with minimum cost. */
     297               0 :                 curr[i] = Min(ins, del);
     298               0 :                 curr[i] = Min(curr[i], sub);
     299                 : 
     300                 :                 /* Point to next character. */
     301               0 :                 x += x_char_len;
     302                 :             }
     303                 :         }
     304                 :         else
     305                 :         {
     306 CBC       11494 :             for (; i < STOP_COLUMN; i++)
     307                 :             {
     308                 :                 int         ins;
     309                 :                 int         del;
     310                 :                 int         sub;
     311                 : 
     312                 :                 /* Calculate costs for insertion, deletion, and substitution. */
     313            8690 :                 ins = prev[i] + ins_c;
     314            8690 :                 del = curr[i - 1] + del_c;
     315            8690 :                 sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
     316                 : 
     317                 :                 /* Take the one with minimum cost. */
     318            8690 :                 curr[i] = Min(ins, del);
     319            8690 :                 curr[i] = Min(curr[i], sub);
     320                 : 
     321                 :                 /* Point to next character. */
     322            8690 :                 x++;
     323                 :             }
     324                 :         }
     325                 : 
     326                 :         /* Swap current row with previous row. */
     327            2804 :         temp = curr;
     328            2804 :         curr = prev;
     329            2804 :         prev = temp;
     330                 : 
     331                 :         /* Point to next character. */
     332              12 :         y += y_char_len;
     333                 : 
     334                 : #ifdef LEVENSHTEIN_LESS_EQUAL
     335                 : 
     336                 :         /*
     337                 :          * This chunk of code represents a significant performance hit if used
     338                 :          * in the case where there is no max_d bound.  This is probably not
     339                 :          * because the max_d >= 0 test itself is expensive, but rather because
     340                 :          * the possibility of needing to execute this code prevents tight
     341                 :          * optimization of the loop as a whole.
     342                 :          */
     343            2792 :         if (max_d >= 0)
     344                 :         {
     345                 :             /*
     346                 :              * The "zero point" is the column of the current row where the
     347                 :              * remaining portions of the strings are of equal length.  There
     348                 :              * are (n - 1) characters in the target string, of which j have
     349                 :              * been transformed.  There are (m - 1) characters in the source
     350                 :              * string, so we want to find the value for zp where (n - 1) - j =
     351                 :              * (m - 1) - zp.
     352                 :              */
     353            2276 :             int         zp = j - (n - m);
     354                 : 
     355                 :             /* Check whether the stop column can slide left. */
     356            5408 :             while (stop_column > 0)
     357                 :             {
     358            4947 :                 int         ii = stop_column - 1;
     359            4947 :                 int         net_inserts = ii - zp;
     360                 : 
     361            8424 :                 if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c :
     362            3477 :                                 -net_inserts * del_c) <= max_d)
     363            1815 :                     break;
     364            3132 :                 stop_column--;
     365                 :             }
     366                 : 
     367                 :             /* Check whether the start column can slide right. */
     368            3790 :             while (start_column < stop_column)
     369                 :             {
     370            3329 :                 int         net_inserts = start_column - zp;
     371                 : 
     372            3329 :                 if (prev[start_column] +
     373            3329 :                     (net_inserts > 0 ? net_inserts * ins_c :
     374            3149 :                      -net_inserts * del_c) <= max_d)
     375            1815 :                     break;
     376                 : 
     377                 :                 /*
     378                 :                  * We'll never again update these values, so we must make sure
     379                 :                  * there's nothing here that could confuse any future
     380                 :                  * iteration of the outer loop.
     381                 :                  */
     382            1514 :                 prev[start_column] = max_d + 1;
     383            1514 :                 curr[start_column] = max_d + 1;
     384            1514 :                 if (start_column != 0)
     385            1036 :                     source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
     386            1514 :                 start_column++;
     387                 :             }
     388                 : 
     389                 :             /* If they cross, we're going to exceed the bound. */
     390            2276 :             if (start_column >= stop_column)
     391             461 :                 return max_d + 1;
     392                 :         }
     393                 : #endif
     394                 :     }
     395                 : 
     396                 :     /*
     397                 :      * Because the final value was swapped from the previous row to the
     398                 :      * current row, that's where we'll find it.
     399                 :      */
     400             349 :     return prev[m - 1];
     401                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a