LCOV - differential code coverage report
Current view: top level - src/common - unicode_norm.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 77.6 % 201 156 45 156
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 11 11 11
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  * unicode_norm.c
       3                 :  *      Normalize a Unicode string
       4                 :  *
       5                 :  * This implements Unicode normalization, per the documentation at
       6                 :  * https://www.unicode.org/reports/tr15/.
       7                 :  *
       8                 :  * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/common/unicode_norm.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : #ifndef FRONTEND
      16                 : #include "postgres.h"
      17                 : #else
      18                 : #include "postgres_fe.h"
      19                 : #endif
      20                 : 
      21                 : #include "common/unicode_norm.h"
      22                 : #ifndef FRONTEND
      23                 : #include "common/unicode_norm_hashfunc.h"
      24                 : #include "common/unicode_normprops_table.h"
      25                 : #include "port/pg_bswap.h"
      26                 : #else
      27                 : #include "common/unicode_norm_table.h"
      28                 : #endif
      29                 : 
      30                 : #ifndef FRONTEND
      31                 : #define ALLOC(size) palloc(size)
      32                 : #define FREE(size) pfree(size)
      33                 : #else
      34                 : #define ALLOC(size) malloc(size)
      35                 : #define FREE(size) free(size)
      36                 : #endif
      37                 : 
      38                 : /* Constants for calculations with Hangul characters */
      39                 : #define SBASE       0xAC00      /* U+AC00 */
      40                 : #define LBASE       0x1100      /* U+1100 */
      41                 : #define VBASE       0x1161      /* U+1161 */
      42                 : #define TBASE       0x11A7      /* U+11A7 */
      43                 : #define LCOUNT      19
      44                 : #define VCOUNT      21
      45                 : #define TCOUNT      28
      46                 : #define NCOUNT      VCOUNT * TCOUNT
      47                 : #define SCOUNT      LCOUNT * NCOUNT
      48                 : 
      49                 : #ifdef FRONTEND
      50                 : /* comparison routine for bsearch() of decomposition lookup table. */
      51                 : static int
      52 CBC        1260 : conv_compare(const void *p1, const void *p2)
      53                 : {
      54                 :     uint32      v1,
      55                 :                 v2;
      56                 : 
      57            1260 :     v1 = *(const uint32 *) p1;
      58            1260 :     v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
      59            1260 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
      60                 : }
      61                 : 
      62                 : #endif
      63                 : 
      64                 : /*
      65                 :  * get_code_entry
      66                 :  *
      67                 :  * Get the entry corresponding to code in the decomposition lookup table.
      68                 :  * The backend version of this code uses a perfect hash function for the
      69                 :  * lookup, while the frontend version uses a binary search.
      70                 :  */
      71                 : static const pg_unicode_decomposition *
      72            1078 : get_code_entry(pg_wchar code)
      73                 : {
      74                 : #ifndef FRONTEND
      75                 :     int         h;
      76                 :     uint32      hashkey;
      77             980 :     pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
      78                 : 
      79                 :     /*
      80                 :      * Compute the hash function. The hash key is the codepoint with the bytes
      81                 :      * in network order.
      82                 :      */
      83             980 :     hashkey = pg_hton32(code);
      84             980 :     h = decompinfo.hash(&hashkey);
      85                 : 
      86                 :     /* An out-of-range result implies no match */
      87             980 :     if (h < 0 || h >= decompinfo.num_decomps)
      88             558 :         return NULL;
      89                 : 
      90                 :     /*
      91                 :      * Since it's a perfect hash, we need only match to the specific codepoint
      92                 :      * it identifies.
      93                 :      */
      94             422 :     if (code != decompinfo.decomps[h].codepoint)
      95 UBC           0 :         return NULL;
      96                 : 
      97                 :     /* Success! */
      98 CBC         422 :     return &decompinfo.decomps[h];
      99                 : #else
     100              98 :     return bsearch(&(code),
     101                 :                    UnicodeDecompMain,
     102                 :                    lengthof(UnicodeDecompMain),
     103                 :                    sizeof(pg_unicode_decomposition),
     104                 :                    conv_compare);
     105                 : #endif
     106                 : }
     107                 : 
     108                 : /*
     109                 :  * Get the combining class of the given codepoint.
     110                 :  */
     111                 : static uint8
     112             534 : get_canonical_class(pg_wchar code)
     113                 : {
     114             534 :     const pg_unicode_decomposition *entry = get_code_entry(code);
     115                 : 
     116                 :     /*
     117                 :      * If no entries are found, the character used is either an Hangul
     118                 :      * character or a character with a class of 0 and no decompositions.
     119                 :      */
     120             534 :     if (!entry)
     121             318 :         return 0;
     122                 :     else
     123             216 :         return entry->comb_class;
     124                 : }
     125                 : 
     126                 : /*
     127                 :  * Given a decomposition entry looked up earlier, get the decomposed
     128                 :  * characters.
     129                 :  *
     130                 :  * Note: the returned pointer can point to statically allocated buffer, and
     131                 :  * is only valid until next call to this function!
     132                 :  */
     133                 : static const pg_wchar *
     134              74 : get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
     135                 : {
     136                 :     static pg_wchar x;
     137                 : 
     138              74 :     if (DECOMPOSITION_IS_INLINE(entry))
     139                 :     {
     140              30 :         Assert(DECOMPOSITION_SIZE(entry) == 1);
     141              30 :         x = (pg_wchar) entry->dec_index;
     142              30 :         *dec_size = 1;
     143              30 :         return &x;
     144                 :     }
     145                 :     else
     146                 :     {
     147              44 :         *dec_size = DECOMPOSITION_SIZE(entry);
     148              44 :         return &UnicodeDecomp_codepoints[entry->dec_index];
     149                 :     }
     150                 : }
     151                 : 
     152                 : /*
     153                 :  * Calculate how many characters a given character will decompose to.
     154                 :  *
     155                 :  * This needs to recurse, if the character decomposes into characters that
     156                 :  * are, in turn, decomposable.
     157                 :  */
     158                 : static int
     159             272 : get_decomposed_size(pg_wchar code, bool compat)
     160                 : {
     161                 :     const pg_unicode_decomposition *entry;
     162             272 :     int         size = 0;
     163                 :     int         i;
     164                 :     const uint32 *decomp;
     165                 :     int         dec_size;
     166                 : 
     167                 :     /*
     168                 :      * Fast path for Hangul characters not stored in tables to save memory as
     169                 :      * decomposition is algorithmic. See
     170                 :      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
     171                 :      * on the matter.
     172                 :      */
     173             272 :     if (code >= SBASE && code < SBASE + SCOUNT)
     174                 :     {
     175                 :         uint32      tindex,
     176                 :                     sindex;
     177                 : 
     178 UBC           0 :         sindex = code - SBASE;
     179               0 :         tindex = sindex % TCOUNT;
     180                 : 
     181               0 :         if (tindex != 0)
     182               0 :             return 3;
     183               0 :         return 2;
     184                 :     }
     185                 : 
     186 CBC         272 :     entry = get_code_entry(code);
     187                 : 
     188                 :     /*
     189                 :      * Just count current code if no other decompositions.  A NULL entry is
     190                 :      * equivalent to a character with class 0 and no decompositions.
     191                 :      */
     192             272 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
     193              55 :         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
     194             235 :         return 1;
     195                 : 
     196                 :     /*
     197                 :      * If this entry has other decomposition codes look at them as well. First
     198                 :      * get its decomposition in the list of tables available.
     199                 :      */
     200              37 :     decomp = get_code_decomposition(entry, &dec_size);
     201              96 :     for (i = 0; i < dec_size; i++)
     202                 :     {
     203              59 :         uint32      lcode = decomp[i];
     204                 : 
     205              59 :         size += get_decomposed_size(lcode, compat);
     206                 :     }
     207                 : 
     208              37 :     return size;
     209                 : }
     210                 : 
     211                 : /*
     212                 :  * Recompose a set of characters. For hangul characters, the calculation
     213                 :  * is algorithmic. For others, an inverse lookup at the decomposition
     214                 :  * table is necessary. Returns true if a recomposition can be done, and
     215                 :  * false otherwise.
     216                 :  */
     217                 : static bool
     218              86 : recompose_code(uint32 start, uint32 code, uint32 *result)
     219                 : {
     220                 :     /*
     221                 :      * Handle Hangul characters algorithmically, per the Unicode spec.
     222                 :      *
     223                 :      * Check if two current characters are L and V.
     224                 :      */
     225              86 :     if (start >= LBASE && start < LBASE + LCOUNT &&
     226 UBC           0 :         code >= VBASE && code < VBASE + VCOUNT)
     227                 :     {
     228                 :         /* make syllable of form LV */
     229               0 :         uint32      lindex = start - LBASE;
     230               0 :         uint32      vindex = code - VBASE;
     231                 : 
     232               0 :         *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
     233               0 :         return true;
     234                 :     }
     235                 :     /* Check if two current characters are LV and T */
     236 CBC          86 :     else if (start >= SBASE && start < (SBASE + SCOUNT) &&
     237 UBC           0 :              ((start - SBASE) % TCOUNT) == 0 &&
     238               0 :              code >= TBASE && code < (TBASE + TCOUNT))
     239                 :     {
     240                 :         /* make syllable of form LVT */
     241               0 :         uint32      tindex = code - TBASE;
     242                 : 
     243               0 :         *result = start + tindex;
     244               0 :         return true;
     245                 :     }
     246                 :     else
     247                 :     {
     248                 :         const pg_unicode_decomposition *entry;
     249                 : 
     250                 :         /*
     251                 :          * Do an inverse lookup of the decomposition tables to see if anything
     252                 :          * matches. The comparison just needs to be a perfect match on the
     253                 :          * sub-table of size two, because the start character has already been
     254                 :          * recomposed partially.  This lookup uses a perfect hash function for
     255                 :          * the backend code.
     256                 :          */
     257                 : #ifndef FRONTEND
     258                 : 
     259                 :         int         h,
     260                 :                     inv_lookup_index;
     261                 :         uint64      hashkey;
     262 CBC          70 :         pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
     263                 : 
     264                 :         /*
     265                 :          * Compute the hash function. The hash key is formed by concatenating
     266                 :          * bytes of the two codepoints in network order. See also
     267                 :          * src/common/unicode/generate-unicode_norm_table.pl.
     268                 :          */
     269              70 :         hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
     270              70 :         h = recompinfo.hash(&hashkey);
     271                 : 
     272                 :         /* An out-of-range result implies no match */
     273              70 :         if (h < 0 || h >= recompinfo.num_recomps)
     274              68 :             return false;
     275                 : 
     276              23 :         inv_lookup_index = recompinfo.inverse_lookup[h];
     277              23 :         entry = &UnicodeDecompMain[inv_lookup_index];
     278                 : 
     279              23 :         if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
     280              21 :             code == UnicodeDecomp_codepoints[entry->dec_index + 1])
     281                 :         {
     282              21 :             *result = entry->codepoint;
     283              21 :             return true;
     284                 :         }
     285                 : 
     286                 : #else
     287                 : 
     288                 :         int         i;
     289                 : 
     290          108416 :         for (i = 0; i < lengthof(UnicodeDecompMain); i++)
     291                 :         {
     292          108400 :             entry = &UnicodeDecompMain[i];
     293                 : 
     294          108400 :             if (DECOMPOSITION_SIZE(entry) != 2)
     295           81616 :                 continue;
     296                 : 
     297           26784 :             if (DECOMPOSITION_NO_COMPOSE(entry))
     298           11728 :                 continue;
     299                 : 
     300           15056 :             if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
     301             140 :                 code == UnicodeDecomp_codepoints[entry->dec_index + 1])
     302                 :             {
     303 UBC           0 :                 *result = entry->codepoint;
     304               0 :                 return true;
     305                 :             }
     306                 :         }
     307                 : #endif                          /* !FRONTEND */
     308                 :     }
     309                 : 
     310 CBC          18 :     return false;
     311                 : }
     312                 : 
     313                 : /*
     314                 :  * Decompose the given code into the array given by caller. The
     315                 :  * decomposition begins at the position given by caller, saving one
     316                 :  * lookup on the decomposition table. The current position needs to be
     317                 :  * updated here to let the caller know from where to continue filling
     318                 :  * in the array result.
     319                 :  */
     320                 : static void
     321             272 : decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
     322                 : {
     323                 :     const pg_unicode_decomposition *entry;
     324                 :     int         i;
     325                 :     const uint32 *decomp;
     326                 :     int         dec_size;
     327                 : 
     328                 :     /*
     329                 :      * Fast path for Hangul characters not stored in tables to save memory as
     330                 :      * decomposition is algorithmic. See
     331                 :      * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
     332                 :      * on the matter.
     333                 :      */
     334             272 :     if (code >= SBASE && code < SBASE + SCOUNT)
     335                 :     {
     336                 :         uint32      l,
     337                 :                     v,
     338                 :                     tindex,
     339                 :                     sindex;
     340 UBC           0 :         pg_wchar   *res = *result;
     341                 : 
     342               0 :         sindex = code - SBASE;
     343               0 :         l = LBASE + sindex / (VCOUNT * TCOUNT);
     344               0 :         v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
     345               0 :         tindex = sindex % TCOUNT;
     346                 : 
     347               0 :         res[*current] = l;
     348               0 :         (*current)++;
     349               0 :         res[*current] = v;
     350               0 :         (*current)++;
     351                 : 
     352               0 :         if (tindex != 0)
     353                 :         {
     354               0 :             res[*current] = TBASE + tindex;
     355               0 :             (*current)++;
     356                 :         }
     357                 : 
     358 CBC         235 :         return;
     359                 :     }
     360                 : 
     361             272 :     entry = get_code_entry(code);
     362                 : 
     363                 :     /*
     364                 :      * Just fill in with the current decomposition if there are no
     365                 :      * decomposition codes to recurse to.  A NULL entry is equivalent to a
     366                 :      * character with class 0 and no decompositions, so just leave also in
     367                 :      * this case.
     368                 :      */
     369             272 :     if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
     370              55 :         (!compat && DECOMPOSITION_IS_COMPAT(entry)))
     371                 :     {
     372             235 :         pg_wchar   *res = *result;
     373                 : 
     374             235 :         res[*current] = code;
     375             235 :         (*current)++;
     376             235 :         return;
     377                 :     }
     378                 : 
     379                 :     /*
     380                 :      * If this entry has other decomposition codes look at them as well.
     381                 :      */
     382              37 :     decomp = get_code_decomposition(entry, &dec_size);
     383              96 :     for (i = 0; i < dec_size; i++)
     384                 :     {
     385              59 :         pg_wchar    lcode = (pg_wchar) decomp[i];
     386                 : 
     387                 :         /* Leave if no more decompositions */
     388              59 :         decompose_code(lcode, compat, result, current);
     389                 :     }
     390                 : }
     391                 : 
     392                 : /*
     393                 :  * unicode_normalize - Normalize a Unicode string to the specified form.
     394                 :  *
     395                 :  * The input is a 0-terminated array of codepoints.
     396                 :  *
     397                 :  * In frontend, returns a 0-terminated array of codepoints, allocated with
     398                 :  * malloc. Or NULL if we run out of memory. In backend, the returned
     399                 :  * string is palloc'd instead, and OOM is reported with ereport().
     400                 :  */
     401                 : pg_wchar *
     402              68 : unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
     403                 : {
     404              68 :     bool        compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
     405              68 :     bool        recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
     406                 :     pg_wchar   *decomp_chars;
     407                 :     pg_wchar   *recomp_chars;
     408                 :     int         decomp_size,
     409                 :                 current_size;
     410                 :     int         count;
     411                 :     const pg_wchar *p;
     412                 : 
     413                 :     /* variables for recomposition */
     414                 :     int         last_class;
     415                 :     int         starter_pos;
     416                 :     int         target_pos;
     417                 :     uint32      starter_ch;
     418                 : 
     419                 :     /* First, do character decomposition */
     420                 : 
     421                 :     /*
     422                 :      * Calculate how many characters long the decomposed version will be.
     423                 :      */
     424              68 :     decomp_size = 0;
     425             281 :     for (p = input; *p; p++)
     426             213 :         decomp_size += get_decomposed_size(*p, compat);
     427                 : 
     428              68 :     decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
     429              68 :     if (decomp_chars == NULL)
     430 UBC           0 :         return NULL;
     431                 : 
     432                 :     /*
     433                 :      * Now fill in each entry recursively. This needs a second pass on the
     434                 :      * decomposition table.
     435                 :      */
     436 CBC          68 :     current_size = 0;
     437             281 :     for (p = input; *p; p++)
     438             213 :         decompose_code(*p, compat, &decomp_chars, &current_size);
     439              68 :     decomp_chars[decomp_size] = '\0';
     440              68 :     Assert(decomp_size == current_size);
     441                 : 
     442                 :     /* Leave if there is nothing to decompose */
     443              68 :     if (decomp_size == 0)
     444               9 :         return decomp_chars;
     445                 : 
     446                 :     /*
     447                 :      * Now apply canonical ordering.
     448                 :      */
     449             235 :     for (count = 1; count < decomp_size; count++)
     450                 :     {
     451             176 :         pg_wchar    prev = decomp_chars[count - 1];
     452             176 :         pg_wchar    next = decomp_chars[count];
     453                 :         pg_wchar    tmp;
     454             176 :         const uint8 prevClass = get_canonical_class(prev);
     455             176 :         const uint8 nextClass = get_canonical_class(next);
     456                 : 
     457                 :         /*
     458                 :          * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
     459                 :          * annex 4, a sequence of two adjacent characters in a string is an
     460                 :          * exchangeable pair if the combining class (from the Unicode
     461                 :          * Character Database) for the first character is greater than the
     462                 :          * combining class for the second, and the second is not a starter.  A
     463                 :          * character is a starter if its combining class is 0.
     464                 :          */
     465             176 :         if (prevClass == 0 || nextClass == 0)
     466             176 :             continue;
     467                 : 
     468 UBC           0 :         if (prevClass <= nextClass)
     469               0 :             continue;
     470                 : 
     471                 :         /* exchange can happen */
     472               0 :         tmp = decomp_chars[count - 1];
     473               0 :         decomp_chars[count - 1] = decomp_chars[count];
     474               0 :         decomp_chars[count] = tmp;
     475                 : 
     476                 :         /* backtrack to check again */
     477               0 :         if (count > 1)
     478               0 :             count -= 2;
     479                 :     }
     480                 : 
     481 CBC          59 :     if (!recompose)
     482              30 :         return decomp_chars;
     483                 : 
     484                 :     /*
     485                 :      * The last phase of NFC and NFKC is the recomposition of the reordered
     486                 :      * Unicode string using combining classes. The recomposed string cannot be
     487                 :      * longer than the decomposed one, so make the allocation of the output
     488                 :      * string based on that assumption.
     489                 :      */
     490              29 :     recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
     491              29 :     if (!recomp_chars)
     492                 :     {
     493 UBC           0 :         FREE(decomp_chars);
     494               0 :         return NULL;
     495                 :     }
     496                 : 
     497 CBC          29 :     last_class = -1;            /* this eliminates a special check */
     498              29 :     starter_pos = 0;
     499              29 :     target_pos = 1;
     500              29 :     starter_ch = recomp_chars[0] = decomp_chars[0];
     501                 : 
     502             115 :     for (count = 1; count < decomp_size; count++)
     503                 :     {
     504              86 :         pg_wchar    ch = decomp_chars[count];
     505              86 :         int         ch_class = get_canonical_class(ch);
     506                 :         pg_wchar    composite;
     507                 : 
     508             172 :         if (last_class < ch_class &&
     509              86 :             recompose_code(starter_ch, ch, &composite))
     510                 :         {
     511              21 :             recomp_chars[starter_pos] = composite;
     512              21 :             starter_ch = composite;
     513                 :         }
     514              65 :         else if (ch_class == 0)
     515                 :         {
     516              65 :             starter_pos = target_pos;
     517              65 :             starter_ch = ch;
     518              65 :             last_class = -1;
     519              65 :             recomp_chars[target_pos++] = ch;
     520                 :         }
     521                 :         else
     522                 :         {
     523 UBC           0 :             last_class = ch_class;
     524               0 :             recomp_chars[target_pos++] = ch;
     525                 :         }
     526                 :     }
     527 CBC          29 :     recomp_chars[target_pos] = (pg_wchar) '\0';
     528                 : 
     529              29 :     FREE(decomp_chars);
     530                 : 
     531              29 :     return recomp_chars;
     532                 : }
     533                 : 
     534                 : /*
     535                 :  * Normalization "quick check" algorithm; see
     536                 :  * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
     537                 :  */
     538                 : 
     539                 : /* We only need this in the backend. */
     540                 : #ifndef FRONTEND
     541                 : 
     542                 : static const pg_unicode_normprops *
     543              96 : qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
     544                 : {
     545                 :     int         h;
     546                 :     uint32      hashkey;
     547                 : 
     548                 :     /*
     549                 :      * Compute the hash function. The hash key is the codepoint with the bytes
     550                 :      * in network order.
     551                 :      */
     552              96 :     hashkey = pg_hton32(ch);
     553              96 :     h = norminfo->hash(&hashkey);
     554                 : 
     555                 :     /* An out-of-range result implies no match */
     556              96 :     if (h < 0 || h >= norminfo->num_normprops)
     557              42 :         return NULL;
     558                 : 
     559                 :     /*
     560                 :      * Since it's a perfect hash, we need only match to the specific codepoint
     561                 :      * it identifies.
     562                 :      */
     563              54 :     if (ch != norminfo->normprops[h].codepoint)
     564              36 :         return NULL;
     565                 : 
     566                 :     /* Success! */
     567              18 :     return &norminfo->normprops[h];
     568                 : }
     569                 : 
     570                 : /*
     571                 :  * Look up the normalization quick check character property
     572                 :  */
     573                 : static UnicodeNormalizationQC
     574              96 : qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
     575                 : {
     576              96 :     const pg_unicode_normprops *found = NULL;
     577                 : 
     578              96 :     switch (form)
     579                 :     {
     580              60 :         case UNICODE_NFC:
     581              60 :             found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
     582              60 :             break;
     583              36 :         case UNICODE_NFKC:
     584              36 :             found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
     585              36 :             break;
     586 UBC           0 :         default:
     587               0 :             Assert(false);
     588                 :             break;
     589                 :     }
     590                 : 
     591 CBC          96 :     if (found)
     592              18 :         return found->quickcheck;
     593                 :     else
     594              78 :         return UNICODE_NORM_QC_YES;
     595                 : }
     596                 : 
     597                 : UnicodeNormalizationQC
     598              66 : unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
     599                 : {
     600              66 :     uint8       lastCanonicalClass = 0;
     601              66 :     UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
     602                 : 
     603                 :     /*
     604                 :      * For the "D" forms, we don't run the quickcheck.  We don't include the
     605                 :      * lookup tables for those because they are huge, checking for these
     606                 :      * particular forms is less common, and running the slow path is faster
     607                 :      * for the "D" forms than the "C" forms because you don't need to
     608                 :      * recompose, which is slow.
     609                 :      */
     610              66 :     if (form == UNICODE_NFD || form == UNICODE_NFKD)
     611              30 :         return UNICODE_NORM_QC_MAYBE;
     612                 : 
     613             126 :     for (const pg_wchar *p = input; *p; p++)
     614                 :     {
     615              96 :         pg_wchar    ch = *p;
     616                 :         uint8       canonicalClass;
     617                 :         UnicodeNormalizationQC check;
     618                 : 
     619              96 :         canonicalClass = get_canonical_class(ch);
     620              96 :         if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
     621 UBC           0 :             return UNICODE_NORM_QC_NO;
     622                 : 
     623 CBC          96 :         check = qc_is_allowed(form, ch);
     624              96 :         if (check == UNICODE_NORM_QC_NO)
     625               6 :             return UNICODE_NORM_QC_NO;
     626              90 :         else if (check == UNICODE_NORM_QC_MAYBE)
     627              12 :             result = UNICODE_NORM_QC_MAYBE;
     628                 : 
     629              90 :         lastCanonicalClass = canonicalClass;
     630                 :     }
     631              30 :     return result;
     632                 : }
     633                 : 
     634                 : #endif                          /* !FRONTEND */
        

Generated by: LCOV version v1.16-55-g56c0a2a