LCOV - differential code coverage report
Current view: top level - src/common - unicode_norm.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 77.6 % 201 156 45 156
Current Date: 2023-04-08 17:13:01 Functions: 100.0 % 11 11 11
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (240..) days: 77.6 % 201 156 45 156
Legend: Lines: hit not hit Function coverage date bins:
(240..) days: 100.0 % 11 11 11

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  * unicode_norm.c
                                  3                 :  *      Normalize a Unicode string
                                  4                 :  *
                                  5                 :  * This implements Unicode normalization, per the documentation at
                                  6                 :  * https://www.unicode.org/reports/tr15/.
                                  7                 :  *
                                  8                 :  * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
                                  9                 :  *
                                 10                 :  * IDENTIFICATION
                                 11                 :  *    src/common/unicode_norm.c
                                 12                 :  *
                                 13                 :  *-------------------------------------------------------------------------
                                 14                 :  */
                                 15                 : #ifndef FRONTEND
                                 16                 : #include "postgres.h"
                                 17                 : #else
                                 18                 : #include "postgres_fe.h"
                                 19                 : #endif
                                 20                 : 
                                 21                 : #include "common/unicode_norm.h"
                                 22                 : #ifndef FRONTEND
                                 23                 : #include "common/unicode_norm_hashfunc.h"
                                 24                 : #include "common/unicode_normprops_table.h"
                                 25                 : #include "port/pg_bswap.h"
                                 26                 : #else
                                 27                 : #include "common/unicode_norm_table.h"
                                 28                 : #endif
                                 29                 : 
                                 30                 : #ifndef FRONTEND
                                 31                 : #define ALLOC(size) palloc(size)
                                 32                 : #define FREE(size) pfree(size)
                                 33                 : #else
                                 34                 : #define ALLOC(size) malloc(size)
                                 35                 : #define FREE(size) free(size)
                                 36                 : #endif
                                 37                 : 
                                 38                 : /* Constants for calculations with Hangul characters */
                                 39                 : #define SBASE       0xAC00      /* U+AC00 */
                                 40                 : #define LBASE       0x1100      /* U+1100 */
                                 41                 : #define VBASE       0x1161      /* U+1161 */
                                 42                 : #define TBASE       0x11A7      /* U+11A7 */
                                 43                 : #define LCOUNT      19
                                 44                 : #define VCOUNT      21
                                 45                 : #define TCOUNT      28
                                 46                 : #define NCOUNT      VCOUNT * TCOUNT
                                 47                 : #define SCOUNT      LCOUNT * NCOUNT
                                 48                 : 
                                 49                 : #ifdef FRONTEND
                                 50                 : /* comparison routine for bsearch() of decomposition lookup table. */
                                 51                 : static int
  897 michael                    52 CBC        1260 : conv_compare(const void *p1, const void *p2)
                                 53                 : {
                                 54                 :     uint32      v1,
                                 55                 :                 v2;
                                 56                 : 
                                 57            1260 :     v1 = *(const uint32 *) p1;
                                 58            1260 :     v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
                                 59            1260 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
                                 60                 : }
                                 61                 : 
                                 62                 : #endif
                                 63                 : 
                                 64                 : /*
                                 65                 :  * get_code_entry
                                 66                 :  *
                                 67                 :  * Get the entry corresponding to code in the decomposition lookup table.
                                 68                 :  * The backend version of this code uses a perfect hash function for the
                                 69                 :  * lookup, while the frontend version uses a binary search.
                                 70                 :  */
                                 71                 : static const pg_unicode_decomposition *
  898                            72            1078 : get_code_entry(pg_wchar code)
                                 73                 : {
                                 74                 : #ifndef FRONTEND
                                 75                 :     int         h;
                                 76                 :     uint32      hashkey;
                                 77             980 :     pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
                                 78                 : 
                                 79                 :     /*
                                 80                 :      * Compute the hash function. The hash key is the codepoint with the bytes
                                 81                 :      * in network order.
                                 82                 :      */
                                 83             980 :     hashkey = pg_hton32(code);
                                 84             980 :     h = decompinfo.hash(&hashkey);
                                 85                 : 
                                 86                 :     /* An out-of-range result implies no match */
                                 87             980 :     if (h < 0 || h >= decompinfo.num_decomps)
                                 88             558 :         return NULL;
                                 89                 : 
                                 90                 :     /*
                                 91                 :      * Since it's a perfect hash, we need only match to the specific codepoint
                                 92                 :      * it identifies.
                                 93                 :      */
                                 94             422 :     if (code != decompinfo.decomps[h].codepoint)
  898 michael                    95 UBC           0 :         return NULL;
                                 96                 : 
                                 97                 :     /* Success! */
  898 michael                    98 CBC         422 :     return &decompinfo.decomps[h];
                                 99                 : #else
 2193 heikki.linnakangas        100              98 :     return bsearch(&(code),
                                101                 :                    UnicodeDecompMain,
                                102                 :                    lengthof(UnicodeDecompMain),
                                103                 :                    sizeof(pg_unicode_decomposition),
                                104                 :                    conv_compare);
                                105                 : #endif
                                106                 : }
                                107                 : 
                                108                 : /*
                                109                 :  * Get the combining class of the given codepoint.
                                110                 :  */
                                111                 : static uint8
  851 michael                   112             534 : get_canonical_class(pg_wchar code)
                                113                 : {
                                114             534 :     const pg_unicode_decomposition *entry = get_code_entry(code);
                                115                 : 
                                116                 :     /*
                                117                 :      * If no entries are found, the character used is either an Hangul
                                118                 :      * character or a character with a class of 0 and no decompositions.
                                119                 :      */
                                120             534 :     if (!entry)
                                121             318 :         return 0;
                                122                 :     else
                                123             216 :         return entry->comb_class;
                                124                 : }
                                125                 : 
                                126                 : /*
                                127                 :  * Given a decomposition entry looked up earlier, get the decomposed
                                128                 :  * characters.
                                129                 :  *
                                130                 :  * Note: the returned pointer can point to statically allocated buffer, and
                                131                 :  * is only valid until next call to this function!
                                132                 :  */
                                133                 : static const pg_wchar *
  898                           134              74 : get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
                                135                 : {
                                136                 :     static pg_wchar x;
                                137                 : 
 2193 heikki.linnakangas        138              74 :     if (DECOMPOSITION_IS_INLINE(entry))
                                139                 :     {
                                140              30 :         Assert(DECOMPOSITION_SIZE(entry) == 1);
                                141              30 :         x = (pg_wchar) entry->dec_index;
                                142              30 :         *dec_size = 1;
                                143              30 :         return &x;
                                144                 :     }
                                145                 :     else
                                146                 :     {
                                147              44 :         *dec_size = DECOMPOSITION_SIZE(entry);
                                148              44 :         return &UnicodeDecomp_codepoints[entry->dec_index];
                                149                 :     }
                                150                 : }
                                151                 : 
                                152                 : /*
                                153                 :  * Calculate how many characters a given character will decompose to.
                                154                 :  *
                                155                 :  * This needs to recurse, if the character decomposes into characters that
                                156                 :  * are, in turn, decomposable.
                                157                 :  */
                                158                 : static int
 1111 peter                     159             272 : get_decomposed_size(pg_wchar code, bool compat)
                                160                 : {
                                161                 :     const pg_unicode_decomposition *entry;
 2193 heikki.linnakangas        162             272 :     int         size = 0;
                                163                 :     int         i;
                                164                 :     const uint32 *decomp;
                                165                 :     int         dec_size;
                                166                 : 
                                167                 :     /*
                                168                 :      * Fast path for Hangul characters not stored in tables to save memory as
                                169                 :      * decomposition is algorithmic. See
                                170                 :      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
                                171                 :      * on the matter.
                                172                 :      */
                                173             272 :     if (code >= SBASE && code < SBASE + SCOUNT)
                                174                 :     {
                                175                 :         uint32      tindex,
                                176                 :                     sindex;
                                177                 : 
 2193 heikki.linnakangas        178 UBC           0 :         sindex = code - SBASE;
                                179               0 :         tindex = sindex % TCOUNT;
                                180                 : 
                                181               0 :         if (tindex != 0)
                                182               0 :             return 3;
                                183               0 :         return 2;
                                184                 :     }
                                185                 : 
 2193 heikki.linnakangas        186 CBC         272 :     entry = get_code_entry(code);
                                187                 : 
                                188                 :     /*
                                189                 :      * Just count current code if no other decompositions.  A NULL entry is
                                190                 :      * equivalent to a character with class 0 and no decompositions.
                                191                 :      */
 1111 peter                     192             272 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
                                193              55 :         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
 2193 heikki.linnakangas        194             235 :         return 1;
                                195                 : 
                                196                 :     /*
                                197                 :      * If this entry has other decomposition codes look at them as well. First
                                198                 :      * get its decomposition in the list of tables available.
                                199                 :      */
                                200              37 :     decomp = get_code_decomposition(entry, &dec_size);
                                201              96 :     for (i = 0; i < dec_size; i++)
                                202                 :     {
                                203              59 :         uint32      lcode = decomp[i];
                                204                 : 
 1111 peter                     205              59 :         size += get_decomposed_size(lcode, compat);
                                206                 :     }
                                207                 : 
 2193 heikki.linnakangas        208              37 :     return size;
                                209                 : }
                                210                 : 
                                211                 : /*
                                212                 :  * Recompose a set of characters. For hangul characters, the calculation
                                213                 :  * is algorithmic. For others, an inverse lookup at the decomposition
                                214                 :  * table is necessary. Returns true if a recomposition can be done, and
                                215                 :  * false otherwise.
                                216                 :  */
                                217                 : static bool
                                218              86 : recompose_code(uint32 start, uint32 code, uint32 *result)
                                219                 : {
                                220                 :     /*
                                221                 :      * Handle Hangul characters algorithmically, per the Unicode spec.
                                222                 :      *
                                223                 :      * Check if two current characters are L and V.
                                224                 :      */
                                225              86 :     if (start >= LBASE && start < LBASE + LCOUNT &&
 2193 heikki.linnakangas        226 UBC           0 :         code >= VBASE && code < VBASE + VCOUNT)
                                227                 :     {
                                228                 :         /* make syllable of form LV */
                                229               0 :         uint32      lindex = start - LBASE;
                                230               0 :         uint32      vindex = code - VBASE;
                                231                 : 
                                232               0 :         *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
                                233               0 :         return true;
                                234                 :     }
                                235                 :     /* Check if two current characters are LV and T */
 2193 heikki.linnakangas        236 CBC          86 :     else if (start >= SBASE && start < (SBASE + SCOUNT) &&
 2193 heikki.linnakangas        237 UBC           0 :              ((start - SBASE) % TCOUNT) == 0 &&
                                238               0 :              code >= TBASE && code < (TBASE + TCOUNT))
                                239                 :     {
                                240                 :         /* make syllable of form LVT */
                                241               0 :         uint32      tindex = code - TBASE;
                                242                 : 
                                243               0 :         *result = start + tindex;
                                244               0 :         return true;
                                245                 :     }
                                246                 :     else
                                247                 :     {
                                248                 :         const pg_unicode_decomposition *entry;
                                249                 : 
                                250                 :         /*
                                251                 :          * Do an inverse lookup of the decomposition tables to see if anything
                                252                 :          * matches. The comparison just needs to be a perfect match on the
                                253                 :          * sub-table of size two, because the start character has already been
                                254                 :          * recomposed partially.  This lookup uses a perfect hash function for
                                255                 :          * the backend code.
                                256                 :          */
                                257                 : #ifndef FRONTEND
                                258                 : 
                                259                 :         int         h,
                                260                 :                     inv_lookup_index;
                                261                 :         uint64      hashkey;
  898 michael                   262 CBC          70 :         pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
                                263                 : 
                                264                 :         /*
                                265                 :          * Compute the hash function. The hash key is formed by concatenating
                                266                 :          * bytes of the two codepoints in network order. See also
                                267                 :          * src/common/unicode/generate-unicode_norm_table.pl.
                                268                 :          */
                                269              70 :         hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
                                270              70 :         h = recompinfo.hash(&hashkey);
                                271                 : 
                                272                 :         /* An out-of-range result implies no match */
                                273              70 :         if (h < 0 || h >= recompinfo.num_recomps)
                                274              68 :             return false;
                                275                 : 
                                276              23 :         inv_lookup_index = recompinfo.inverse_lookup[h];
                                277              23 :         entry = &UnicodeDecompMain[inv_lookup_index];
                                278                 : 
                                279              23 :         if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
                                280              21 :             code == UnicodeDecomp_codepoints[entry->dec_index + 1])
                                281                 :         {
                                282              21 :             *result = entry->codepoint;
                                283              21 :             return true;
                                284                 :         }
                                285                 : 
                                286                 : #else
                                287                 : 
                                288                 :         int         i;
                                289                 : 
 2193 heikki.linnakangas        290          108416 :         for (i = 0; i < lengthof(UnicodeDecompMain); i++)
                                291                 :         {
  898 michael                   292          108400 :             entry = &UnicodeDecompMain[i];
                                293                 : 
 2193 heikki.linnakangas        294          108400 :             if (DECOMPOSITION_SIZE(entry) != 2)
                                295           81616 :                 continue;
                                296                 : 
                                297           26784 :             if (DECOMPOSITION_NO_COMPOSE(entry))
                                298           11728 :                 continue;
                                299                 : 
                                300           15056 :             if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
                                301             140 :                 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
                                302                 :             {
 2193 heikki.linnakangas        303 UBC           0 :                 *result = entry->codepoint;
                                304               0 :                 return true;
                                305                 :             }
                                306                 :         }
                                307                 : #endif                          /* !FRONTEND */
                                308                 :     }
                                309                 : 
 2193 heikki.linnakangas        310 CBC          18 :     return false;
                                311                 : }
                                312                 : 
                                313                 : /*
                                314                 :  * Decompose the given code into the array given by caller. The
                                315                 :  * decomposition begins at the position given by caller, saving one
                                316                 :  * lookup on the decomposition table. The current position needs to be
                                317                 :  * updated here to let the caller know from where to continue filling
                                318                 :  * in the array result.
                                319                 :  */
                                320                 : static void
 1111 peter                     321             272 : decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
                                322                 : {
                                323                 :     const pg_unicode_decomposition *entry;
                                324                 :     int         i;
                                325                 :     const uint32 *decomp;
                                326                 :     int         dec_size;
                                327                 : 
                                328                 :     /*
                                329                 :      * Fast path for Hangul characters not stored in tables to save memory as
                                330                 :      * decomposition is algorithmic. See
                                331                 :      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
                                332                 :      * on the matter.
                                333                 :      */
 2193 heikki.linnakangas        334             272 :     if (code >= SBASE && code < SBASE + SCOUNT)
                                335                 :     {
                                336                 :         uint32      l,
                                337                 :                     v,
                                338                 :                     tindex,
                                339                 :                     sindex;
 2193 heikki.linnakangas        340 UBC           0 :         pg_wchar   *res = *result;
                                341                 : 
                                342               0 :         sindex = code - SBASE;
                                343               0 :         l = LBASE + sindex / (VCOUNT * TCOUNT);
                                344               0 :         v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
                                345               0 :         tindex = sindex % TCOUNT;
                                346                 : 
                                347               0 :         res[*current] = l;
                                348               0 :         (*current)++;
                                349               0 :         res[*current] = v;
                                350               0 :         (*current)++;
                                351                 : 
                                352               0 :         if (tindex != 0)
                                353                 :         {
                                354               0 :             res[*current] = TBASE + tindex;
                                355               0 :             (*current)++;
                                356                 :         }
                                357                 : 
 2193 heikki.linnakangas        358 CBC         235 :         return;
                                359                 :     }
                                360                 : 
                                361             272 :     entry = get_code_entry(code);
                                362                 : 
                                363                 :     /*
                                364                 :      * Just fill in with the current decomposition if there are no
                                365                 :      * decomposition codes to recurse to.  A NULL entry is equivalent to a
                                366                 :      * character with class 0 and no decompositions, so just leave also in
                                367                 :      * this case.
                                368                 :      */
 1111 peter                     369             272 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
                                370              55 :         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
                                371                 :     {
 2193 heikki.linnakangas        372             235 :         pg_wchar   *res = *result;
                                373                 : 
                                374             235 :         res[*current] = code;
                                375             235 :         (*current)++;
                                376             235 :         return;
                                377                 :     }
                                378                 : 
                                379                 :     /*
                                380                 :      * If this entry has other decomposition codes look at them as well.
                                381                 :      */
                                382              37 :     decomp = get_code_decomposition(entry, &dec_size);
                                383              96 :     for (i = 0; i < dec_size; i++)
                                384                 :     {
                                385              59 :         pg_wchar    lcode = (pg_wchar) decomp[i];
                                386                 : 
                                387                 :         /* Leave if no more decompositions */
 1111 peter                     388              59 :         decompose_code(lcode, compat, result, current);
                                389                 :     }
                                390                 : }
                                391                 : 
                                392                 : /*
                                393                 :  * unicode_normalize - Normalize a Unicode string to the specified form.
                                394                 :  *
                                395                 :  * The input is a 0-terminated array of codepoints.
                                396                 :  *
                                397                 :  * In frontend, returns a 0-terminated array of codepoints, allocated with
                                398                 :  * malloc. Or NULL if we run out of memory. In backend, the returned
                                399                 :  * string is palloc'd instead, and OOM is reported with ereport().
                                400                 :  */
                                401                 : pg_wchar *
                                402              68 : unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
                                403                 : {
                                404              68 :     bool        compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
                                405              68 :     bool        recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
                                406                 :     pg_wchar   *decomp_chars;
                                407                 :     pg_wchar   *recomp_chars;
                                408                 :     int         decomp_size,
                                409                 :                 current_size;
                                410                 :     int         count;
                                411                 :     const pg_wchar *p;
                                412                 : 
                                413                 :     /* variables for recomposition */
                                414                 :     int         last_class;
                                415                 :     int         starter_pos;
                                416                 :     int         target_pos;
                                417                 :     uint32      starter_ch;
                                418                 : 
                                419                 :     /* First, do character decomposition */
                                420                 : 
                                421                 :     /*
                                422                 :      * Calculate how many characters long the decomposed version will be.
                                423                 :      */
 2193 heikki.linnakangas        424              68 :     decomp_size = 0;
                                425             281 :     for (p = input; *p; p++)
 1111 peter                     426             213 :         decomp_size += get_decomposed_size(*p, compat);
                                427                 : 
 2193 heikki.linnakangas        428              68 :     decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
                                429              68 :     if (decomp_chars == NULL)
 2193 heikki.linnakangas        430 UBC           0 :         return NULL;
                                431                 : 
                                432                 :     /*
                                433                 :      * Now fill in each entry recursively. This needs a second pass on the
                                434                 :      * decomposition table.
                                435                 :      */
 2193 heikki.linnakangas        436 CBC          68 :     current_size = 0;
                                437             281 :     for (p = input; *p; p++)
 1111 peter                     438             213 :         decompose_code(*p, compat, &decomp_chars, &current_size);
 2193 heikki.linnakangas        439              68 :     decomp_chars[decomp_size] = '\0';
                                440              68 :     Assert(decomp_size == current_size);
                                441                 : 
                                442                 :     /* Leave if there is nothing to decompose */
  514 michael                   443              68 :     if (decomp_size == 0)
                                444               9 :         return decomp_chars;
                                445                 : 
                                446                 :     /*
                                447                 :      * Now apply canonical ordering.
                                448                 :      */
 2193 heikki.linnakangas        449             235 :     for (count = 1; count < decomp_size; count++)
                                450                 :     {
                                451             176 :         pg_wchar    prev = decomp_chars[count - 1];
                                452             176 :         pg_wchar    next = decomp_chars[count];
                                453                 :         pg_wchar    tmp;
  851 michael                   454             176 :         const uint8 prevClass = get_canonical_class(prev);
                                455             176 :         const uint8 nextClass = get_canonical_class(next);
                                456                 : 
                                457                 :         /*
                                458                 :          * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
                                459                 :          * annex 4, a sequence of two adjacent characters in a string is an
                                460                 :          * exchangeable pair if the combining class (from the Unicode
                                461                 :          * Character Database) for the first character is greater than the
                                462                 :          * combining class for the second, and the second is not a starter.  A
                                463                 :          * character is a starter if its combining class is 0.
                                464                 :          */
                                465             176 :         if (prevClass == 0 || nextClass == 0)
 2193 heikki.linnakangas        466             176 :             continue;
                                467                 : 
  851 michael                   468 UBC           0 :         if (prevClass <= nextClass)
 2193 heikki.linnakangas        469               0 :             continue;
                                470                 : 
                                471                 :         /* exchange can happen */
                                472               0 :         tmp = decomp_chars[count - 1];
                                473               0 :         decomp_chars[count - 1] = decomp_chars[count];
                                474               0 :         decomp_chars[count] = tmp;
                                475                 : 
                                476                 :         /* backtrack to check again */
                                477               0 :         if (count > 1)
                                478               0 :             count -= 2;
                                479                 :     }
                                480                 : 
 1111 peter                     481 CBC          59 :     if (!recompose)
                                482              30 :         return decomp_chars;
                                483                 : 
                                484                 :     /*
                                485                 :      * The last phase of NFC and NFKC is the recomposition of the reordered
                                486                 :      * Unicode string using combining classes. The recomposed string cannot be
                                487                 :      * longer than the decomposed one, so make the allocation of the output
                                488                 :      * string based on that assumption.
                                489                 :      */
 2193 heikki.linnakangas        490              29 :     recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
                                491              29 :     if (!recomp_chars)
                                492                 :     {
 2193 heikki.linnakangas        493 UBC           0 :         FREE(decomp_chars);
                                494               0 :         return NULL;
                                495                 :     }
                                496                 : 
 2193 heikki.linnakangas        497 CBC          29 :     last_class = -1;            /* this eliminates a special check */
                                498              29 :     starter_pos = 0;
                                499              29 :     target_pos = 1;
                                500              29 :     starter_ch = recomp_chars[0] = decomp_chars[0];
                                501                 : 
                                502             115 :     for (count = 1; count < decomp_size; count++)
                                503                 :     {
                                504              86 :         pg_wchar    ch = decomp_chars[count];
  851 michael                   505              86 :         int         ch_class = get_canonical_class(ch);
                                506                 :         pg_wchar    composite;
                                507                 : 
 2193 heikki.linnakangas        508             172 :         if (last_class < ch_class &&
                                509              86 :             recompose_code(starter_ch, ch, &composite))
                                510                 :         {
                                511              21 :             recomp_chars[starter_pos] = composite;
                                512              21 :             starter_ch = composite;
                                513                 :         }
                                514              65 :         else if (ch_class == 0)
                                515                 :         {
                                516              65 :             starter_pos = target_pos;
                                517              65 :             starter_ch = ch;
                                518              65 :             last_class = -1;
                                519              65 :             recomp_chars[target_pos++] = ch;
                                520                 :         }
                                521                 :         else
                                522                 :         {
 2193 heikki.linnakangas        523 UBC           0 :             last_class = ch_class;
                                524               0 :             recomp_chars[target_pos++] = ch;
                                525                 :         }
                                526                 :     }
 2193 heikki.linnakangas        527 CBC          29 :     recomp_chars[target_pos] = (pg_wchar) '\0';
                                528                 : 
                                529              29 :     FREE(decomp_chars);
                                530                 : 
                                531              29 :     return recomp_chars;
                                532                 : }
                                533                 : 
                                534                 : /*
                                535                 :  * Normalization "quick check" algorithm; see
                                536                 :  * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
                                537                 :  */
                                538                 : 
                                539                 : /* We only need this in the backend. */
                                540                 : #ifndef FRONTEND
                                541                 : 
                                542                 : static const pg_unicode_normprops *
  910 michael                   543              96 : qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
                                544                 : {
                                545                 :     int         h;
                                546                 :     uint32      hashkey;
                                547                 : 
                                548                 :     /*
                                549                 :      * Compute the hash function. The hash key is the codepoint with the bytes
                                550                 :      * in network order.
                                551                 :      */
  909                           552              96 :     hashkey = pg_hton32(ch);
  910                           553              96 :     h = norminfo->hash(&hashkey);
                                554                 : 
                                555                 :     /* An out-of-range result implies no match */
                                556              96 :     if (h < 0 || h >= norminfo->num_normprops)
                                557              42 :         return NULL;
                                558                 : 
                                559                 :     /*
                                560                 :      * Since it's a perfect hash, we need only match to the specific codepoint
                                561                 :      * it identifies.
                                562                 :      */
                                563              54 :     if (ch != norminfo->normprops[h].codepoint)
                                564              36 :         return NULL;
                                565                 : 
                                566                 :     /* Success! */
                                567              18 :     return &norminfo->normprops[h];
                                568                 : }
                                569                 : 
                                570                 : /*
                                571                 :  * Look up the normalization quick check character property
                                572                 :  */
                                573                 : static UnicodeNormalizationQC
 1109 peter                     574              96 : qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
                                575                 : {
  910 michael                   576              96 :     const pg_unicode_normprops *found = NULL;
                                577                 : 
 1109 peter                     578              96 :     switch (form)
                                579                 :     {
                                580              60 :         case UNICODE_NFC:
  910 michael                   581              60 :             found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
 1109 peter                     582              60 :             break;
                                583              36 :         case UNICODE_NFKC:
  910 michael                   584              36 :             found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
 1109 peter                     585              36 :             break;
 1109 peter                     586 UBC           0 :         default:
                                587               0 :             Assert(false);
                                588                 :             break;
                                589                 :     }
                                590                 : 
 1109 peter                     591 CBC          96 :     if (found)
                                592              18 :         return found->quickcheck;
                                593                 :     else
                                594              78 :         return UNICODE_NORM_QC_YES;
                                595                 : }
                                596                 : 
                                597                 : UnicodeNormalizationQC
                                598              66 : unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
                                599                 : {
                                600              66 :     uint8       lastCanonicalClass = 0;
                                601              66 :     UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
                                602                 : 
                                603                 :     /*
                                604                 :      * For the "D" forms, we don't run the quickcheck.  We don't include the
                                605                 :      * lookup tables for those because they are huge, checking for these
                                606                 :      * particular forms is less common, and running the slow path is faster
                                607                 :      * for the "D" forms than the "C" forms because you don't need to
                                608                 :      * recompose, which is slow.
                                609                 :      */
                                610              66 :     if (form == UNICODE_NFD || form == UNICODE_NFKD)
                                611              30 :         return UNICODE_NORM_QC_MAYBE;
                                612                 : 
                                613             126 :     for (const pg_wchar *p = input; *p; p++)
                                614                 :     {
                                615              96 :         pg_wchar    ch = *p;
                                616                 :         uint8       canonicalClass;
                                617                 :         UnicodeNormalizationQC check;
                                618                 : 
                                619              96 :         canonicalClass = get_canonical_class(ch);
                                620              96 :         if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
 1109 peter                     621 UBC           0 :             return UNICODE_NORM_QC_NO;
                                622                 : 
 1109 peter                     623 CBC          96 :         check = qc_is_allowed(form, ch);
                                624              96 :         if (check == UNICODE_NORM_QC_NO)
                                625               6 :             return UNICODE_NORM_QC_NO;
                                626              90 :         else if (check == UNICODE_NORM_QC_MAYBE)
                                627              12 :             result = UNICODE_NORM_QC_MAYBE;
                                628                 : 
                                629              90 :         lastCanonicalClass = canonicalClass;
                                630                 :     }
                                631              30 :     return result;
                                632                 : }
                                633                 : 
                                634                 : #endif                          /* !FRONTEND */
        

Generated by: LCOV version v1.16-55-g56c0a2a