Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * unicode_norm.c
3 : * Normalize a Unicode string
4 : *
5 : * This implements Unicode normalization, per the documentation at
6 : * https://www.unicode.org/reports/tr15/.
7 : *
8 : * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
9 : *
10 : * IDENTIFICATION
11 : * src/common/unicode_norm.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #ifndef FRONTEND
16 : #include "postgres.h"
17 : #else
18 : #include "postgres_fe.h"
19 : #endif
20 :
21 : #include "common/unicode_norm.h"
22 : #ifndef FRONTEND
23 : #include "common/unicode_norm_hashfunc.h"
24 : #include "common/unicode_normprops_table.h"
25 : #include "port/pg_bswap.h"
26 : #else
27 : #include "common/unicode_norm_table.h"
28 : #endif
29 :
30 : #ifndef FRONTEND
31 : #define ALLOC(size) palloc(size)
32 : #define FREE(size) pfree(size)
33 : #else
34 : #define ALLOC(size) malloc(size)
35 : #define FREE(size) free(size)
36 : #endif
37 :
38 : /* Constants for calculations with Hangul characters */
39 : #define SBASE 0xAC00 /* U+AC00 */
40 : #define LBASE 0x1100 /* U+1100 */
41 : #define VBASE 0x1161 /* U+1161 */
42 : #define TBASE 0x11A7 /* U+11A7 */
43 : #define LCOUNT 19
44 : #define VCOUNT 21
45 : #define TCOUNT 28
46 : #define NCOUNT VCOUNT * TCOUNT
47 : #define SCOUNT LCOUNT * NCOUNT
48 :
49 : #ifdef FRONTEND
50 : /* comparison routine for bsearch() of decomposition lookup table. */
51 : static int
897 michael 52 CBC 1260 : conv_compare(const void *p1, const void *p2)
53 : {
54 : uint32 v1,
55 : v2;
56 :
57 1260 : v1 = *(const uint32 *) p1;
58 1260 : v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
59 1260 : return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
60 : }
61 :
62 : #endif
63 :
64 : /*
65 : * get_code_entry
66 : *
67 : * Get the entry corresponding to code in the decomposition lookup table.
68 : * The backend version of this code uses a perfect hash function for the
69 : * lookup, while the frontend version uses a binary search.
70 : */
71 : static const pg_unicode_decomposition *
898 72 1078 : get_code_entry(pg_wchar code)
73 : {
74 : #ifndef FRONTEND
75 : int h;
76 : uint32 hashkey;
77 980 : pg_unicode_decompinfo decompinfo = UnicodeDecompInfo;
78 :
79 : /*
80 : * Compute the hash function. The hash key is the codepoint with the bytes
81 : * in network order.
82 : */
83 980 : hashkey = pg_hton32(code);
84 980 : h = decompinfo.hash(&hashkey);
85 :
86 : /* An out-of-range result implies no match */
87 980 : if (h < 0 || h >= decompinfo.num_decomps)
88 558 : return NULL;
89 :
90 : /*
91 : * Since it's a perfect hash, we need only match to the specific codepoint
92 : * it identifies.
93 : */
94 422 : if (code != decompinfo.decomps[h].codepoint)
898 michael 95 UBC 0 : return NULL;
96 :
97 : /* Success! */
898 michael 98 CBC 422 : return &decompinfo.decomps[h];
99 : #else
2193 heikki.linnakangas 100 98 : return bsearch(&(code),
101 : UnicodeDecompMain,
102 : lengthof(UnicodeDecompMain),
103 : sizeof(pg_unicode_decomposition),
104 : conv_compare);
105 : #endif
106 : }
107 :
108 : /*
109 : * Get the combining class of the given codepoint.
110 : */
111 : static uint8
851 michael 112 534 : get_canonical_class(pg_wchar code)
113 : {
114 534 : const pg_unicode_decomposition *entry = get_code_entry(code);
115 :
116 : /*
117 : * If no entries are found, the character used is either an Hangul
118 : * character or a character with a class of 0 and no decompositions.
119 : */
120 534 : if (!entry)
121 318 : return 0;
122 : else
123 216 : return entry->comb_class;
124 : }
125 :
126 : /*
127 : * Given a decomposition entry looked up earlier, get the decomposed
128 : * characters.
129 : *
130 : * Note: the returned pointer can point to statically allocated buffer, and
131 : * is only valid until next call to this function!
132 : */
133 : static const pg_wchar *
898 134 74 : get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
135 : {
136 : static pg_wchar x;
137 :
2193 heikki.linnakangas 138 74 : if (DECOMPOSITION_IS_INLINE(entry))
139 : {
140 30 : Assert(DECOMPOSITION_SIZE(entry) == 1);
141 30 : x = (pg_wchar) entry->dec_index;
142 30 : *dec_size = 1;
143 30 : return &x;
144 : }
145 : else
146 : {
147 44 : *dec_size = DECOMPOSITION_SIZE(entry);
148 44 : return &UnicodeDecomp_codepoints[entry->dec_index];
149 : }
150 : }
151 :
152 : /*
153 : * Calculate how many characters a given character will decompose to.
154 : *
155 : * This needs to recurse, if the character decomposes into characters that
156 : * are, in turn, decomposable.
157 : */
158 : static int
1111 peter 159 272 : get_decomposed_size(pg_wchar code, bool compat)
160 : {
161 : const pg_unicode_decomposition *entry;
2193 heikki.linnakangas 162 272 : int size = 0;
163 : int i;
164 : const uint32 *decomp;
165 : int dec_size;
166 :
167 : /*
168 : * Fast path for Hangul characters not stored in tables to save memory as
169 : * decomposition is algorithmic. See
170 : * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
171 : * on the matter.
172 : */
173 272 : if (code >= SBASE && code < SBASE + SCOUNT)
174 : {
175 : uint32 tindex,
176 : sindex;
177 :
2193 heikki.linnakangas 178 UBC 0 : sindex = code - SBASE;
179 0 : tindex = sindex % TCOUNT;
180 :
181 0 : if (tindex != 0)
182 0 : return 3;
183 0 : return 2;
184 : }
185 :
2193 heikki.linnakangas 186 CBC 272 : entry = get_code_entry(code);
187 :
188 : /*
189 : * Just count current code if no other decompositions. A NULL entry is
190 : * equivalent to a character with class 0 and no decompositions.
191 : */
1111 peter 192 272 : if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
193 55 : (!compat && DECOMPOSITION_IS_COMPAT(entry)))
2193 heikki.linnakangas 194 235 : return 1;
195 :
196 : /*
197 : * If this entry has other decomposition codes look at them as well. First
198 : * get its decomposition in the list of tables available.
199 : */
200 37 : decomp = get_code_decomposition(entry, &dec_size);
201 96 : for (i = 0; i < dec_size; i++)
202 : {
203 59 : uint32 lcode = decomp[i];
204 :
1111 peter 205 59 : size += get_decomposed_size(lcode, compat);
206 : }
207 :
2193 heikki.linnakangas 208 37 : return size;
209 : }
210 :
211 : /*
212 : * Recompose a set of characters. For hangul characters, the calculation
213 : * is algorithmic. For others, an inverse lookup at the decomposition
214 : * table is necessary. Returns true if a recomposition can be done, and
215 : * false otherwise.
216 : */
217 : static bool
218 86 : recompose_code(uint32 start, uint32 code, uint32 *result)
219 : {
220 : /*
221 : * Handle Hangul characters algorithmically, per the Unicode spec.
222 : *
223 : * Check if two current characters are L and V.
224 : */
225 86 : if (start >= LBASE && start < LBASE + LCOUNT &&
2193 heikki.linnakangas 226 UBC 0 : code >= VBASE && code < VBASE + VCOUNT)
227 : {
228 : /* make syllable of form LV */
229 0 : uint32 lindex = start - LBASE;
230 0 : uint32 vindex = code - VBASE;
231 :
232 0 : *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
233 0 : return true;
234 : }
235 : /* Check if two current characters are LV and T */
2193 heikki.linnakangas 236 CBC 86 : else if (start >= SBASE && start < (SBASE + SCOUNT) &&
2193 heikki.linnakangas 237 UBC 0 : ((start - SBASE) % TCOUNT) == 0 &&
238 0 : code >= TBASE && code < (TBASE + TCOUNT))
239 : {
240 : /* make syllable of form LVT */
241 0 : uint32 tindex = code - TBASE;
242 :
243 0 : *result = start + tindex;
244 0 : return true;
245 : }
246 : else
247 : {
248 : const pg_unicode_decomposition *entry;
249 :
250 : /*
251 : * Do an inverse lookup of the decomposition tables to see if anything
252 : * matches. The comparison just needs to be a perfect match on the
253 : * sub-table of size two, because the start character has already been
254 : * recomposed partially. This lookup uses a perfect hash function for
255 : * the backend code.
256 : */
257 : #ifndef FRONTEND
258 :
259 : int h,
260 : inv_lookup_index;
261 : uint64 hashkey;
898 michael 262 CBC 70 : pg_unicode_recompinfo recompinfo = UnicodeRecompInfo;
263 :
264 : /*
265 : * Compute the hash function. The hash key is formed by concatenating
266 : * bytes of the two codepoints in network order. See also
267 : * src/common/unicode/generate-unicode_norm_table.pl.
268 : */
269 70 : hashkey = pg_hton64(((uint64) start << 32) | (uint64) code);
270 70 : h = recompinfo.hash(&hashkey);
271 :
272 : /* An out-of-range result implies no match */
273 70 : if (h < 0 || h >= recompinfo.num_recomps)
274 68 : return false;
275 :
276 23 : inv_lookup_index = recompinfo.inverse_lookup[h];
277 23 : entry = &UnicodeDecompMain[inv_lookup_index];
278 :
279 23 : if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
280 21 : code == UnicodeDecomp_codepoints[entry->dec_index + 1])
281 : {
282 21 : *result = entry->codepoint;
283 21 : return true;
284 : }
285 :
286 : #else
287 :
288 : int i;
289 :
2193 heikki.linnakangas 290 108416 : for (i = 0; i < lengthof(UnicodeDecompMain); i++)
291 : {
898 michael 292 108400 : entry = &UnicodeDecompMain[i];
293 :
2193 heikki.linnakangas 294 108400 : if (DECOMPOSITION_SIZE(entry) != 2)
295 81616 : continue;
296 :
297 26784 : if (DECOMPOSITION_NO_COMPOSE(entry))
298 11728 : continue;
299 :
300 15056 : if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
301 140 : code == UnicodeDecomp_codepoints[entry->dec_index + 1])
302 : {
2193 heikki.linnakangas 303 UBC 0 : *result = entry->codepoint;
304 0 : return true;
305 : }
306 : }
307 : #endif /* !FRONTEND */
308 : }
309 :
2193 heikki.linnakangas 310 CBC 18 : return false;
311 : }
312 :
313 : /*
314 : * Decompose the given code into the array given by caller. The
315 : * decomposition begins at the position given by caller, saving one
316 : * lookup on the decomposition table. The current position needs to be
317 : * updated here to let the caller know from where to continue filling
318 : * in the array result.
319 : */
320 : static void
1111 peter 321 272 : decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
322 : {
323 : const pg_unicode_decomposition *entry;
324 : int i;
325 : const uint32 *decomp;
326 : int dec_size;
327 :
328 : /*
329 : * Fast path for Hangul characters not stored in tables to save memory as
330 : * decomposition is algorithmic. See
331 : * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
332 : * on the matter.
333 : */
2193 heikki.linnakangas 334 272 : if (code >= SBASE && code < SBASE + SCOUNT)
335 : {
336 : uint32 l,
337 : v,
338 : tindex,
339 : sindex;
2193 heikki.linnakangas 340 UBC 0 : pg_wchar *res = *result;
341 :
342 0 : sindex = code - SBASE;
343 0 : l = LBASE + sindex / (VCOUNT * TCOUNT);
344 0 : v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
345 0 : tindex = sindex % TCOUNT;
346 :
347 0 : res[*current] = l;
348 0 : (*current)++;
349 0 : res[*current] = v;
350 0 : (*current)++;
351 :
352 0 : if (tindex != 0)
353 : {
354 0 : res[*current] = TBASE + tindex;
355 0 : (*current)++;
356 : }
357 :
2193 heikki.linnakangas 358 CBC 235 : return;
359 : }
360 :
361 272 : entry = get_code_entry(code);
362 :
363 : /*
364 : * Just fill in with the current decomposition if there are no
365 : * decomposition codes to recurse to. A NULL entry is equivalent to a
366 : * character with class 0 and no decompositions, so just leave also in
367 : * this case.
368 : */
1111 peter 369 272 : if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
370 55 : (!compat && DECOMPOSITION_IS_COMPAT(entry)))
371 : {
2193 heikki.linnakangas 372 235 : pg_wchar *res = *result;
373 :
374 235 : res[*current] = code;
375 235 : (*current)++;
376 235 : return;
377 : }
378 :
379 : /*
380 : * If this entry has other decomposition codes look at them as well.
381 : */
382 37 : decomp = get_code_decomposition(entry, &dec_size);
383 96 : for (i = 0; i < dec_size; i++)
384 : {
385 59 : pg_wchar lcode = (pg_wchar) decomp[i];
386 :
387 : /* Leave if no more decompositions */
1111 peter 388 59 : decompose_code(lcode, compat, result, current);
389 : }
390 : }
391 :
392 : /*
393 : * unicode_normalize - Normalize a Unicode string to the specified form.
394 : *
395 : * The input is a 0-terminated array of codepoints.
396 : *
397 : * In frontend, returns a 0-terminated array of codepoints, allocated with
398 : * malloc. Or NULL if we run out of memory. In backend, the returned
399 : * string is palloc'd instead, and OOM is reported with ereport().
400 : */
401 : pg_wchar *
402 68 : unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
403 : {
404 68 : bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
405 68 : bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
406 : pg_wchar *decomp_chars;
407 : pg_wchar *recomp_chars;
408 : int decomp_size,
409 : current_size;
410 : int count;
411 : const pg_wchar *p;
412 :
413 : /* variables for recomposition */
414 : int last_class;
415 : int starter_pos;
416 : int target_pos;
417 : uint32 starter_ch;
418 :
419 : /* First, do character decomposition */
420 :
421 : /*
422 : * Calculate how many characters long the decomposed version will be.
423 : */
2193 heikki.linnakangas 424 68 : decomp_size = 0;
425 281 : for (p = input; *p; p++)
1111 peter 426 213 : decomp_size += get_decomposed_size(*p, compat);
427 :
2193 heikki.linnakangas 428 68 : decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
429 68 : if (decomp_chars == NULL)
2193 heikki.linnakangas 430 UBC 0 : return NULL;
431 :
432 : /*
433 : * Now fill in each entry recursively. This needs a second pass on the
434 : * decomposition table.
435 : */
2193 heikki.linnakangas 436 CBC 68 : current_size = 0;
437 281 : for (p = input; *p; p++)
1111 peter 438 213 : decompose_code(*p, compat, &decomp_chars, ¤t_size);
2193 heikki.linnakangas 439 68 : decomp_chars[decomp_size] = '\0';
440 68 : Assert(decomp_size == current_size);
441 :
442 : /* Leave if there is nothing to decompose */
514 michael 443 68 : if (decomp_size == 0)
444 9 : return decomp_chars;
445 :
446 : /*
447 : * Now apply canonical ordering.
448 : */
2193 heikki.linnakangas 449 235 : for (count = 1; count < decomp_size; count++)
450 : {
451 176 : pg_wchar prev = decomp_chars[count - 1];
452 176 : pg_wchar next = decomp_chars[count];
453 : pg_wchar tmp;
851 michael 454 176 : const uint8 prevClass = get_canonical_class(prev);
455 176 : const uint8 nextClass = get_canonical_class(next);
456 :
457 : /*
458 : * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
459 : * annex 4, a sequence of two adjacent characters in a string is an
460 : * exchangeable pair if the combining class (from the Unicode
461 : * Character Database) for the first character is greater than the
462 : * combining class for the second, and the second is not a starter. A
463 : * character is a starter if its combining class is 0.
464 : */
465 176 : if (prevClass == 0 || nextClass == 0)
2193 heikki.linnakangas 466 176 : continue;
467 :
851 michael 468 UBC 0 : if (prevClass <= nextClass)
2193 heikki.linnakangas 469 0 : continue;
470 :
471 : /* exchange can happen */
472 0 : tmp = decomp_chars[count - 1];
473 0 : decomp_chars[count - 1] = decomp_chars[count];
474 0 : decomp_chars[count] = tmp;
475 :
476 : /* backtrack to check again */
477 0 : if (count > 1)
478 0 : count -= 2;
479 : }
480 :
1111 peter 481 CBC 59 : if (!recompose)
482 30 : return decomp_chars;
483 :
484 : /*
485 : * The last phase of NFC and NFKC is the recomposition of the reordered
486 : * Unicode string using combining classes. The recomposed string cannot be
487 : * longer than the decomposed one, so make the allocation of the output
488 : * string based on that assumption.
489 : */
2193 heikki.linnakangas 490 29 : recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
491 29 : if (!recomp_chars)
492 : {
2193 heikki.linnakangas 493 UBC 0 : FREE(decomp_chars);
494 0 : return NULL;
495 : }
496 :
2193 heikki.linnakangas 497 CBC 29 : last_class = -1; /* this eliminates a special check */
498 29 : starter_pos = 0;
499 29 : target_pos = 1;
500 29 : starter_ch = recomp_chars[0] = decomp_chars[0];
501 :
502 115 : for (count = 1; count < decomp_size; count++)
503 : {
504 86 : pg_wchar ch = decomp_chars[count];
851 michael 505 86 : int ch_class = get_canonical_class(ch);
506 : pg_wchar composite;
507 :
2193 heikki.linnakangas 508 172 : if (last_class < ch_class &&
509 86 : recompose_code(starter_ch, ch, &composite))
510 : {
511 21 : recomp_chars[starter_pos] = composite;
512 21 : starter_ch = composite;
513 : }
514 65 : else if (ch_class == 0)
515 : {
516 65 : starter_pos = target_pos;
517 65 : starter_ch = ch;
518 65 : last_class = -1;
519 65 : recomp_chars[target_pos++] = ch;
520 : }
521 : else
522 : {
2193 heikki.linnakangas 523 UBC 0 : last_class = ch_class;
524 0 : recomp_chars[target_pos++] = ch;
525 : }
526 : }
2193 heikki.linnakangas 527 CBC 29 : recomp_chars[target_pos] = (pg_wchar) '\0';
528 :
529 29 : FREE(decomp_chars);
530 :
531 29 : return recomp_chars;
532 : }
533 :
534 : /*
535 : * Normalization "quick check" algorithm; see
536 : * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
537 : */
538 :
539 : /* We only need this in the backend. */
540 : #ifndef FRONTEND
541 :
542 : static const pg_unicode_normprops *
910 michael 543 96 : qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
544 : {
545 : int h;
546 : uint32 hashkey;
547 :
548 : /*
549 : * Compute the hash function. The hash key is the codepoint with the bytes
550 : * in network order.
551 : */
909 552 96 : hashkey = pg_hton32(ch);
910 553 96 : h = norminfo->hash(&hashkey);
554 :
555 : /* An out-of-range result implies no match */
556 96 : if (h < 0 || h >= norminfo->num_normprops)
557 42 : return NULL;
558 :
559 : /*
560 : * Since it's a perfect hash, we need only match to the specific codepoint
561 : * it identifies.
562 : */
563 54 : if (ch != norminfo->normprops[h].codepoint)
564 36 : return NULL;
565 :
566 : /* Success! */
567 18 : return &norminfo->normprops[h];
568 : }
569 :
570 : /*
571 : * Look up the normalization quick check character property
572 : */
573 : static UnicodeNormalizationQC
1109 peter 574 96 : qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
575 : {
910 michael 576 96 : const pg_unicode_normprops *found = NULL;
577 :
1109 peter 578 96 : switch (form)
579 : {
580 60 : case UNICODE_NFC:
910 michael 581 60 : found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
1109 peter 582 60 : break;
583 36 : case UNICODE_NFKC:
910 michael 584 36 : found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
1109 peter 585 36 : break;
1109 peter 586 UBC 0 : default:
587 0 : Assert(false);
588 : break;
589 : }
590 :
1109 peter 591 CBC 96 : if (found)
592 18 : return found->quickcheck;
593 : else
594 78 : return UNICODE_NORM_QC_YES;
595 : }
596 :
597 : UnicodeNormalizationQC
598 66 : unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
599 : {
600 66 : uint8 lastCanonicalClass = 0;
601 66 : UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
602 :
603 : /*
604 : * For the "D" forms, we don't run the quickcheck. We don't include the
605 : * lookup tables for those because they are huge, checking for these
606 : * particular forms is less common, and running the slow path is faster
607 : * for the "D" forms than the "C" forms because you don't need to
608 : * recompose, which is slow.
609 : */
610 66 : if (form == UNICODE_NFD || form == UNICODE_NFKD)
611 30 : return UNICODE_NORM_QC_MAYBE;
612 :
613 126 : for (const pg_wchar *p = input; *p; p++)
614 : {
615 96 : pg_wchar ch = *p;
616 : uint8 canonicalClass;
617 : UnicodeNormalizationQC check;
618 :
619 96 : canonicalClass = get_canonical_class(ch);
620 96 : if (lastCanonicalClass > canonicalClass && canonicalClass != 0)
1109 peter 621 UBC 0 : return UNICODE_NORM_QC_NO;
622 :
1109 peter 623 CBC 96 : check = qc_is_allowed(form, ch);
624 96 : if (check == UNICODE_NORM_QC_NO)
625 6 : return UNICODE_NORM_QC_NO;
626 90 : else if (check == UNICODE_NORM_QC_MAYBE)
627 12 : result = UNICODE_NORM_QC_MAYBE;
628 :
629 90 : lastCanonicalClass = canonicalClass;
630 : }
631 30 : return result;
632 : }
633 :
634 : #endif /* !FRONTEND */
|