LCOV - Differential Code Coverage HEAD vs 15 - src/backend/utils/adt/levenshtein.c

LCOV - differential code coverage report

Current view:	top level - src/backend/utils/adt - levenshtein.c (source / functions)		Coverage	Total	Hit	LBC	UIC	UBC	GBC	GIC	GNC	CBC	EUB	ECB	DCB
Current:	Differential Code Coverage HEAD vs 15	Lines:	77.7 %	103	80	2	7	14	4	13	1	62	5	15	1
Current Date:	2023-04-08 17:13:01	Functions:	100.0 %	2	2						2
Baseline:	15	Line coverage date bins:
Baseline Date:	2023-04-08 15:09:40	(180,240] days:	100.0 %	1	1						1
Legend:	Lines: hit not hit	(240..) days:	77.5 %	102	79	2	7	14	4	13		62	5	15
		Function coverage date bins:
		(240..) days:	100.0 %	2	2						2

 Age         Owner                  TLA  Line data    Source code

                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * levenshtein.c
                                  4                 :  *    Levenshtein distance implementation.
                                  5                 :  *
                                  6                 :  * Original author:  Joe Conway <mail@joeconway.com>
                                  7                 :  *
                                  8                 :  * This file is included by varlena.c twice, to provide matching code for (1)
                                  9                 :  * Levenshtein distance with custom costings, and (2) Levenshtein distance with
                                 10                 :  * custom costings and a "max" value above which exact distances are not
                                 11                 :  * interesting.  Before the inclusion, we rely on the presence of the inline
                                 12                 :  * function rest_of_char_same().
                                 13                 :  *
                                 14                 :  * Written based on a description of the algorithm by Michael Gilleland found
                                 15                 :  * at http://www.merriampark.com/ld.htm.  Also looked at levenshtein.c in the
                                 16                 :  * PHP 4.0.6 distribution for inspiration.  Configurable penalty costs
                                 17                 :  * extension is introduced by Volkan YAZICI <volkan.yazici@gmail.com.
                                 18                 :  *
                                 19                 :  * Copyright (c) 2001-2023, PostgreSQL Global Development Group
                                 20                 :  *
                                 21                 :  * IDENTIFICATION
                                 22                 :  *  src/backend/utils/adt/levenshtein.c
                                 23                 :  *
                                 24                 :  *-------------------------------------------------------------------------
                                 25                 :  */
                                 26                 : #define MAX_LEVENSHTEIN_STRLEN      255
                                 27                 : 
                                 28                 : /*
                                 29                 :  * Calculates Levenshtein distance metric between supplied strings, which are
                                 30                 :  * not necessarily null-terminated.
                                 31                 :  *
                                 32                 :  * source: source string, of length slen bytes.
                                 33                 :  * target: target string, of length tlen bytes.
                                 34                 :  * ins_c, del_c, sub_c: costs to charge for character insertion, deletion,
                                 35                 :  *      and substitution respectively; (1, 1, 1) costs suffice for common
                                 36                 :  *      cases, but your mileage may vary.
                                 37                 :  * max_d: if provided and >= 0, maximum distance we care about; see below.
                                 38                 :  * trusted: caller is trusted and need not obey MAX_LEVENSHTEIN_STRLEN.
                                 39                 :  *
                                 40                 :  * One way to compute Levenshtein distance is to incrementally construct
                                 41                 :  * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
                                 42                 :  * of operations required to transform the first i characters of s into
                                 43                 :  * the first j characters of t.  The last column of the final row is the
                                 44                 :  * answer.
                                 45                 :  *
                                 46                 :  * We use that algorithm here with some modification.  In lieu of holding
                                 47                 :  * the entire array in memory at once, we'll just use two arrays of size
                                 48                 :  * m+1 for storing accumulated values. At each step one array represents
                                 49                 :  * the "previous" row and one is the "current" row of the notional large
                                 50                 :  * array.
                                 51                 :  *
                                 52                 :  * If max_d >= 0, we only need to provide an accurate answer when that answer
                                 53                 :  * is less than or equal to max_d.  From any cell in the matrix, there is
                                 54                 :  * theoretical "minimum residual distance" from that cell to the last column
                                 55                 :  * of the final row.  This minimum residual distance is zero when the
                                 56                 :  * untransformed portions of the strings are of equal length (because we might
                                 57                 :  * get lucky and find all the remaining characters matching) and is otherwise
                                 58                 :  * based on the minimum number of insertions or deletions needed to make them
                                 59                 :  * equal length.  The residual distance grows as we move toward the upper
                                 60                 :  * right or lower left corners of the matrix.  When the max_d bound is
                                 61                 :  * usefully tight, we can use this property to avoid computing the entirety
                                 62                 :  * of each row; instead, we maintain a start_column and stop_column that
                                 63                 :  * identify the portion of the matrix close to the diagonal which can still
                                 64                 :  * affect the final answer.
                                 65                 :  */
                                 66                 : int
                                 67                 : #ifdef LEVENSHTEIN_LESS_EQUAL
 2634 tgl                        68 CBC        1223 : varstr_levenshtein_less_equal(const char *source, int slen,
                                 69                 :                               const char *target, int tlen,
                                 70                 :                               int ins_c, int del_c, int sub_c,
                                 71                 :                               int max_d, bool trusted)
                                 72                 : #else
                                 73               2 : varstr_levenshtein(const char *source, int slen,
                                 74                 :                    const char *target, int tlen,
                                 75                 :                    int ins_c, int del_c, int sub_c,
                                 76                 :                    bool trusted)
                                 77                 : #endif
                                 78                 : {
                                 79                 :     int         m,
                                 80                 :                 n;
                                 81                 :     int        *prev;
                                 82                 :     int        *curr;
 4555 rhaas                      83            1225 :     int        *s_char_len = NULL;
                                 84                 :     int         j;
                                 85                 :     const char *y;
                                 86                 : 
                                 87                 :     /*
                                 88                 :      * For varstr_levenshtein_less_equal, we have real variables called
                                 89                 :      * start_column and stop_column; otherwise it's just short-hand for 0 and
                                 90                 :      * m.
                                 91                 :      */
                                 92                 : #ifdef LEVENSHTEIN_LESS_EQUAL
                                 93                 :     int         start_column,
                                 94                 :                 stop_column;
                                 95                 : 
                                 96                 : #undef START_COLUMN
                                 97                 : #undef STOP_COLUMN
                                 98                 : #define START_COLUMN start_column
                                 99                 : #define STOP_COLUMN stop_column
                                100                 : #else
                                101                 : #undef START_COLUMN
                                102                 : #undef STOP_COLUMN
                                103                 : #define START_COLUMN 0
                                104                 : #define STOP_COLUMN m
                                105                 : #endif
                                106                 : 
 2634 tgl                       107 ECB             :     /* Convert string lengths (in bytes) to lengths in characters */
 3069 rhaas                     108 CBC        1225 :     m = pg_mbstrlen_with_len(source, slen);
 3069 rhaas                     109 GIC        1225 :     n = pg_mbstrlen_with_len(target, tlen);
                                110                 : 
                                111                 :     /*
                                112                 :      * We can transform an empty s into t with n insertions, or a non-empty t
                                113                 :      * into an empty s with m deletions.
 4555 rhaas                     114 ECB             :      */
 4555 rhaas                     115 GBC        1225 :     if (!m)
 4555 rhaas                     116 LBC           0 :         return n * ins_c;
 4555 rhaas                     117 GBC        1225 :     if (!n)
 4555 rhaas                     118 UIC           0 :         return m * del_c;
                                119                 : 
                                120                 :     /*
                                121                 :      * For security concerns, restrict excessive CPU+RAM usage. (This
                                122                 :      * implementation uses O(m) memory and has O(mn) complexity.)  If
                                123                 :      * "trusted" is true, caller is responsible for not making excessive
                                124                 :      * requests, typically by using a small max_d along with strings that are
                                125                 :      * bounded, though not necessarily to MAX_LEVENSHTEIN_STRLEN exactly.
 4555 rhaas                     126 ECB             :      */
 2634 tgl                       127 CBC        1225 :     if (!trusted &&
 2634 tgl                       128 GIC           4 :         (m > MAX_LEVENSHTEIN_STRLEN ||
 2634 tgl                       129 EUB             :          n > MAX_LEVENSHTEIN_STRLEN))
 4555 rhaas                     130 UIC           0 :         ereport(ERROR,
                                131                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                132                 :                  errmsg("levenshtein argument exceeds maximum length of %d characters",
                                133                 :                         MAX_LEVENSHTEIN_STRLEN)));
                                134                 : 
                                135                 : #ifdef LEVENSHTEIN_LESS_EQUAL
 4555 rhaas                     136 ECB             :     /* Initialize start and stop columns. */
 4555 rhaas                     137 CBC        1223 :     start_column = 0;
 4555 rhaas                     138 GIC        1223 :     stop_column = m + 1;
                                139                 : 
                                140                 :     /*
                                141                 :      * If max_d >= 0, determine whether the bound is impossibly tight.  If so,
                                142                 :      * return max_d + 1 immediately.  Otherwise, determine whether it's tight
                                143                 :      * enough to limit the computation we must perform.  If so, figure out
                                144                 :      * initial stop column.
 4555 rhaas                     145 ECB             :      */
 4555 rhaas                     146 GIC        1223 :     if (max_d >= 0)
                                147                 :     {
                                148                 :         int         min_theo_d; /* Theoretical minimum distance. */
 4382 bruce                     149 ECB             :         int         max_theo_d; /* Theoretical maximum distance. */
 4382 bruce                     150 GIC        1223 :         int         net_inserts = n - m;
 4555 rhaas                     151 ECB             : 
 4555 rhaas                     152 CBC        1223 :         min_theo_d = net_inserts < 0 ?
                                153            1223 :             -net_inserts * del_c : net_inserts * ins_c;
                                154            1223 :         if (min_theo_d > max_d)
                                155             415 :             return max_d + 1;
 4555 rhaas                     156 GBC         808 :         if (ins_c + del_c < sub_c)
 4555 rhaas                     157 LBC           0 :             sub_c = ins_c + del_c;
 4555 rhaas                     158 CBC         808 :         max_theo_d = min_theo_d + sub_c * Min(m, n);
                                159             808 :         if (max_d >= max_theo_d)
                                160             252 :             max_d = -1;
 4555 rhaas                     161 GIC         556 :         else if (ins_c + del_c > 0)
                                162                 :         {
                                163                 :             /*
                                164                 :              * Figure out how much of the first row of the notional matrix we
                                165                 :              * need to fill in.  If the string is growing, the theoretical
                                166                 :              * minimum distance already incorporates the cost of deleting the
                                167                 :              * number of characters necessary to make the two strings equal in
                                168                 :              * length.  Each additional deletion forces another insertion, so
                                169                 :              * the best-case total cost increases by ins_c + del_c. If the
                                170                 :              * string is shrinking, the minimum theoretical cost assumes no
                                171                 :              * excess deletions; that is, we're starting no further right than
                                172                 :              * column n - m.  If we do start further right, the best-case
                                173                 :              * total cost increases by ins_c + del_c for each move right.
 4555 rhaas                     174 ECB             :              */
 4382 bruce                     175 CBC         556 :             int         slack_d = max_d - min_theo_d;
 4382 bruce                     176 GIC         556 :             int         best_column = net_inserts < 0 ? -net_inserts : 0;
 4382 bruce                     177 ECB             : 
 4555 rhaas                     178 CBC         556 :             stop_column = best_column + (slack_d / (ins_c + del_c)) + 1;
 4555 rhaas                     179 GBC         556 :             if (stop_column > m)
 4555 rhaas                     180 UIC           0 :                 stop_column = m + 1;
                                181                 :         }
                                182                 :     }
                                183                 : #endif
                                184                 : 
                                185                 :     /*
                                186                 :      * In order to avoid calling pg_mblen() repeatedly on each character in s,
                                187                 :      * we cache all the lengths before starting the main loop -- but if all
                                188                 :      * the characters in both strings are single byte, then we skip this and
                                189                 :      * use a fast-path in the main loop.  If only one string contains
                                190                 :      * multi-byte characters, we still build the array, so that the fast-path
                                191                 :      * needn't deal with the case where the array hasn't been initialized.
 4555 rhaas                     192 ECB             :      */
 3069 rhaas                     193 GIC         810 :     if (m != slen || n != tlen)
                                194                 :     {
 4382 bruce                     195 EUB             :         int         i;
 3069 rhaas                     196 UIC           0 :         const char *cp = source;
 4555 rhaas                     197 EUB             : 
 4555 rhaas                     198 UBC           0 :         s_char_len = (int *) palloc((m + 1) * sizeof(int));
 4555 rhaas                     199 UIC           0 :         for (i = 0; i < m; ++i)
 4555 rhaas                     200 EUB             :         {
 4555 rhaas                     201 UBC           0 :             s_char_len[i] = pg_mblen(cp);
 4555 rhaas                     202 UIC           0 :             cp += s_char_len[i];
 4555 rhaas                     203 EUB             :         }
 4555 rhaas                     204 UIC           0 :         s_char_len[i] = 0;
                                205                 :     }
                                206                 : 
 4555 rhaas                     207 ECB             :     /* One more cell for initialization column and row. */
 4555 rhaas                     208 CBC         810 :     ++m;
 4555 rhaas                     209 GIC         810 :     ++n;
                                210                 : 
 4555 rhaas                     211 ECB             :     /* Previous and current rows of notional array. */
 4555 rhaas                     212 CBC         810 :     prev = (int *) palloc(2 * m * sizeof(int));
 4555 rhaas                     213 GIC         810 :     curr = prev + m;
                                214                 : 
                                215                 :     /*
                                216                 :      * To transform the first i characters of s into the first 0 characters of
                                217                 :      * t, we must perform i deletions.
 4555 rhaas                     218 ECB             :      */
  228 drowley                   219 GNC        3176 :     for (int i = START_COLUMN; i < STOP_COLUMN; i++)
 4555 rhaas                     220 GIC        2366 :         prev[i] = i * del_c;
                                221                 : 
 4555 rhaas                     222 ECB             :     /* Loop through rows of the notional array */
 3069 rhaas                     223 GIC        3153 :     for (y = target, j = 1; j < n; j++)
                                224                 :     {
 4555 rhaas                     225 ECB             :         int        *temp;
 3069 rhaas                     226 CBC        2804 :         const char *x = source;
 3069 rhaas                     227 GIC        2804 :         int         y_char_len = n != tlen + 1 ? pg_mblen(y) : 1;
                                228                 :         int         i;
                                229                 : 
                                230                 : #ifdef LEVENSHTEIN_LESS_EQUAL
                                231                 : 
                                232                 :         /*
                                233                 :          * In the best case, values percolate down the diagonal unchanged, so
                                234                 :          * we must increment stop_column unless it's already on the right end
                                235                 :          * of the array.  The inner loop will read prev[stop_column], so we
                                236                 :          * have to initialize it even though it shouldn't affect the result.
                                237                 :          */
 4555 rhaas                     238 CBC        2792 :         if (stop_column < m)
                                239                 :         {
                                240            2206 :             prev[stop_column] = max_d + 1;
                                241            2206 :             ++stop_column;
                                242                 :         }
                                243                 : 
                                244                 :         /*
                                245                 :          * The main loop fills in curr, but curr[0] needs a special case: to
                                246                 :          * transform the first 0 characters of s into the first j characters
                                247                 :          * of t, we must perform j insertions.  However, if start_column > 0,
                                248                 :          * this special case does not apply.
                                249                 :          */
                                250            2792 :         if (start_column == 0)
                                251                 :         {
                                252            1763 :             curr[0] = j * ins_c;
                                253            1763 :             i = 1;
                                254                 :         }
                                255                 :         else
                                256            1029 :             i = start_column;
                                257                 : #else
                                258              12 :         curr[0] = j * ins_c;
                                259              12 :         i = 1;
                                260                 : #endif
                                261                 : 
                                262                 :         /*
                                263                 :          * This inner loop is critical to performance, so we include a
                                264                 :          * fast-path to handle the (fairly common) case where no multibyte
                                265                 :          * characters are in the mix.  The fast-path is entitled to assume
                                266                 :          * that if s_char_len is not initialized then BOTH strings contain
                                267                 :          * only single-byte characters.
                                268                 :          */
                                269            2804 :         if (s_char_len != NULL)
                                270                 :         {
 4555 rhaas                     271 UBC           0 :             for (; i < STOP_COLUMN; i++)
                                272                 :             {
                                273                 :                 int         ins;
                                274                 :                 int         del;
                                275                 :                 int         sub;
                                276               0 :                 int         x_char_len = s_char_len[i - 1];
                                277                 : 
                                278                 :                 /*
                                279                 :                  * Calculate costs for insertion, deletion, and substitution.
                                280                 :                  *
                                281                 :                  * When calculating cost for substitution, we compare the last
                                282                 :                  * character of each possibly-multibyte character first,
                                283                 :                  * because that's enough to rule out most mis-matches.  If we
                                284                 :                  * get past that test, then we compare the lengths and the
                                285                 :                  * remaining bytes.
                                286                 :                  */
                                287               0 :                 ins = prev[i] + ins_c;
                                288               0 :                 del = curr[i - 1] + del_c;
 4382 bruce                     289               0 :                 if (x[x_char_len - 1] == y[y_char_len - 1]
 4555 rhaas                     290               0 :                     && x_char_len == y_char_len &&
                                291               0 :                     (x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
                                292               0 :                     sub = prev[i - 1];
                                293                 :                 else
                                294               0 :                     sub = prev[i - 1] + sub_c;
                                295                 : 
                                296                 :                 /* Take the one with minimum cost. */
                                297               0 :                 curr[i] = Min(ins, del);
                                298               0 :                 curr[i] = Min(curr[i], sub);
                                299                 : 
                                300                 :                 /* Point to next character. */
                                301               0 :                 x += x_char_len;
                                302                 :             }
                                303                 :         }
                                304                 :         else
                                305                 :         {
 4555 rhaas                     306 CBC       11494 :             for (; i < STOP_COLUMN; i++)
                                307                 :             {
                                308                 :                 int         ins;
                                309                 :                 int         del;
                                310                 :                 int         sub;
                                311                 : 
                                312                 :                 /* Calculate costs for insertion, deletion, and substitution. */
                                313            8690 :                 ins = prev[i] + ins_c;
                                314            8690 :                 del = curr[i - 1] + del_c;
                                315            8690 :                 sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);
                                316                 : 
                                317                 :                 /* Take the one with minimum cost. */
                                318            8690 :                 curr[i] = Min(ins, del);
                                319            8690 :                 curr[i] = Min(curr[i], sub);
                                320                 : 
                                321                 :                 /* Point to next character. */
                                322            8690 :                 x++;
                                323                 :             }
                                324                 :         }
                                325                 : 
                                326                 :         /* Swap current row with previous row. */
                                327            2804 :         temp = curr;
                                328            2804 :         curr = prev;
                                329            2804 :         prev = temp;
                                330                 : 
                                331                 :         /* Point to next character. */
                                332              12 :         y += y_char_len;
                                333                 : 
                                334                 : #ifdef LEVENSHTEIN_LESS_EQUAL
                                335                 : 
                                336                 :         /*
                                337                 :          * This chunk of code represents a significant performance hit if used
                                338                 :          * in the case where there is no max_d bound.  This is probably not
                                339                 :          * because the max_d >= 0 test itself is expensive, but rather because
                                340                 :          * the possibility of needing to execute this code prevents tight
                                341                 :          * optimization of the loop as a whole.
                                342                 :          */
                                343            2792 :         if (max_d >= 0)
                                344                 :         {
                                345                 :             /*
                                346                 :              * The "zero point" is the column of the current row where the
                                347                 :              * remaining portions of the strings are of equal length.  There
                                348                 :              * are (n - 1) characters in the target string, of which j have
                                349                 :              * been transformed.  There are (m - 1) characters in the source
                                350                 :              * string, so we want to find the value for zp where (n - 1) - j =
                                351                 :              * (m - 1) - zp.
                                352                 :              */
 4382 bruce                     353            2276 :             int         zp = j - (n - m);
                                354                 : 
                                355                 :             /* Check whether the stop column can slide left. */
 4555 rhaas                     356            5408 :             while (stop_column > 0)
                                357                 :             {
 4382 bruce                     358            4947 :                 int         ii = stop_column - 1;
                                359            4947 :                 int         net_inserts = ii - zp;
                                360                 : 
 4555 rhaas                     361            8424 :                 if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c :
 4382 bruce                     362            3477 :                                 -net_inserts * del_c) <= max_d)
 4555 rhaas                     363            1815 :                     break;
                                364            3132 :                 stop_column--;
                                365                 :             }
                                366                 : 
                                367                 :             /* Check whether the start column can slide right. */
                                368            3790 :             while (start_column < stop_column)
                                369                 :             {
 4382 bruce                     370            3329 :                 int         net_inserts = start_column - zp;
                                371                 : 
 4555 rhaas                     372            3329 :                 if (prev[start_column] +
                                373            3329 :                     (net_inserts > 0 ? net_inserts * ins_c :
 4382 bruce                     374            3149 :                      -net_inserts * del_c) <= max_d)
 4555 rhaas                     375            1815 :                     break;
                                376                 : 
                                377                 :                 /*
                                378                 :                  * We'll never again update these values, so we must make sure
                                379                 :                  * there's nothing here that could confuse any future
                                380                 :                  * iteration of the outer loop.
                                381                 :                  */
                                382            1514 :                 prev[start_column] = max_d + 1;
                                383            1514 :                 curr[start_column] = max_d + 1;
                                384            1514 :                 if (start_column != 0)
 3069                           385            1036 :                     source += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
 4555                           386            1514 :                 start_column++;
                                387                 :             }
                                388                 : 
                                389                 :             /* If they cross, we're going to exceed the bound. */
                                390            2276 :             if (start_column >= stop_column)
                                391             461 :                 return max_d + 1;
                                392                 :         }
                                393                 : #endif
                                394                 :     }
                                395                 : 
                                396                 :     /*
                                397                 :      * Because the final value was swapped from the previous row to the
                                398                 :      * current row, that's where we'll find it.
                                399                 :      */
                                400             349 :     return prev[m - 1];
                                401                 : }

Generated by: LCOV version v1.16-55-g56c0a2a