Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * Utility functions for conversion procs.
4 : *
5 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
6 : * Portions Copyright (c) 1994, Regents of the University of California
7 : *
8 : * IDENTIFICATION
9 : * src/backend/utils/mb/conv.c
10 : *
11 : *-------------------------------------------------------------------------
12 : */
13 : #include "postgres.h"
14 : #include "mb/pg_wchar.h"
15 :
16 :
17 : /*
18 : * local2local: a generic single byte charset encoding
19 : * conversion between two ASCII-superset encodings.
20 : *
21 : * l points to the source string of length len
22 : * p is the output area (must be large enough!)
23 : * src_encoding is the PG identifier for the source encoding
24 : * dest_encoding is the PG identifier for the target encoding
25 : * tab holds conversion entries for the source charset
26 : * starting from 128 (0x80). each entry in the table holds the corresponding
27 : * code point for the target charset, or 0 if there is no equivalent code.
28 : *
29 : * Returns the number of input bytes consumed. If noError is true, this can
30 : * be less than 'len'.
31 : */
32 : int
2689 tgl 33 CBC 114 : local2local(const unsigned char *l,
34 : unsigned char *p,
35 : int len,
36 : int src_encoding,
37 : int dest_encoding,
38 : const unsigned char *tab,
39 : bool noError)
40 : {
738 heikki.linnakangas 41 114 : const unsigned char *start = l;
42 : unsigned char c1,
43 : c2;
44 :
2689 tgl 45 366 : while (len > 0)
46 : {
47 306 : c1 = *l;
48 306 : if (c1 == 0)
49 : {
738 heikki.linnakangas 50 54 : if (noError)
51 27 : break;
2689 tgl 52 27 : report_invalid_encoding(src_encoding, (const char *) l, len);
53 : }
54 252 : if (!IS_HIGHBIT_SET(c1))
55 153 : *p++ = c1;
56 : else
57 : {
58 99 : c2 = tab[c1 - HIGHBIT];
59 99 : if (c2)
60 99 : *p++ = c2;
61 : else
62 : {
738 heikki.linnakangas 63 UBC 0 : if (noError)
64 0 : break;
2689 tgl 65 0 : report_untranslatable_char(src_encoding, dest_encoding,
66 : (const char *) l, len);
67 : }
68 : }
2689 tgl 69 CBC 252 : l++;
70 252 : len--;
71 : }
72 87 : *p = '\0';
73 :
738 heikki.linnakangas 74 87 : return l - start;
75 : }
76 :
/*
 * LATINn ---> MIC, for charsets whose local codes map directly to MIC.
 *
 * l		source string, 'len' bytes long
 * p		output area (caller must make it large enough); the result is
 *			NUL-terminated
 * lc		mule character set id for the local encoding; emitted as a
 *			prefix byte before every high-bit source byte
 * encoding	PG identifier for the local encoding (used in error reports)
 * noError	if true, stop at the first problem byte instead of reporting
 *
 * A zero byte in the input is treated as invalid.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding, bool noError)
{
	const unsigned char *begin = l;

	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
		{
			if (noError)
				break;
			report_invalid_encoding(encoding, (const char *) l, len);
		}

		/* non-ASCII bytes get the charset id in front of them */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}
	*p = '\0';

	return l - begin;
}
114 :
115 : /*
116 : * MIC ---> LATINn when the charset's local codes map directly to MIC
117 : *
118 : * mic points to the source string of length len
119 : * p is the output area (must be large enough!)
120 : * lc is the mule character set id for the local encoding
121 : * encoding is the PG identifier for the local encoding
122 : *
123 : * Returns the number of input bytes consumed. If noError is true, this can
124 : * be less than 'len'.
125 : */
126 : int
6167 tgl 127 177 : mic2latin(const unsigned char *mic, unsigned char *p, int len,
128 : int lc, int encoding, bool noError)
129 : {
738 heikki.linnakangas 130 177 : const unsigned char *start = mic;
131 : int c1;
132 :
6167 tgl 133 420 : while (len > 0)
134 : {
135 387 : c1 = *mic;
136 387 : if (c1 == 0)
137 : {
738 heikki.linnakangas 138 UBC 0 : if (noError)
139 0 : break;
6167 tgl 140 0 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
141 : }
6167 tgl 142 CBC 387 : if (!IS_HIGHBIT_SET(c1))
143 : {
144 : /* easy for ASCII */
145 180 : *p++ = c1;
146 180 : mic++;
147 180 : len--;
148 : }
149 : else
150 : {
1179 151 207 : int l = pg_mule_mblen(mic);
152 :
6167 153 207 : if (len < l)
154 : {
738 heikki.linnakangas 155 54 : if (noError)
156 27 : break;
6167 tgl 157 27 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
158 : len);
159 : }
160 153 : if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
161 : {
738 heikki.linnakangas 162 90 : if (noError)
163 45 : break;
6167 tgl 164 45 : report_untranslatable_char(PG_MULE_INTERNAL, encoding,
165 : (const char *) mic, len);
166 : }
167 63 : *p++ = mic[1];
168 63 : mic += 2;
169 63 : len -= 2;
170 : }
171 : }
8986 bruce 172 105 : *p = '\0';
173 :
738 heikki.linnakangas 174 105 : return mic - start;
175 : }
176 :
177 :
178 : /*
179 : * latin2mic_with_table: a generic single byte charset encoding
180 : * conversion from a local charset to the mule internal code.
181 : *
182 : * l points to the source string of length len
183 : * p is the output area (must be large enough!)
184 : * lc is the mule character set id for the local encoding
185 : * encoding is the PG identifier for the local encoding
186 : * tab holds conversion entries for the local charset
187 : * starting from 128 (0x80). each entry in the table holds the corresponding
188 : * code point for the mule encoding, or 0 if there is no equivalent code.
189 : *
190 : * Returns the number of input bytes consumed. If noError is true, this can
191 : * be less than 'len'.
192 : */
193 : int
6167 tgl 194 84 : latin2mic_with_table(const unsigned char *l,
195 : unsigned char *p,
196 : int len,
197 : int lc,
198 : int encoding,
199 : const unsigned char *tab,
200 : bool noError)
201 : {
738 heikki.linnakangas 202 84 : const unsigned char *start = l;
203 : unsigned char c1,
204 : c2;
205 :
6167 tgl 206 246 : while (len > 0)
207 : {
208 216 : c1 = *l;
209 216 : if (c1 == 0)
210 : {
738 heikki.linnakangas 211 54 : if (noError)
212 27 : break;
6167 tgl 213 27 : report_invalid_encoding(encoding, (const char *) l, len);
214 : }
215 162 : if (!IS_HIGHBIT_SET(c1))
8720 bruce 216 63 : *p++ = c1;
217 : else
218 : {
6167 tgl 219 99 : c2 = tab[c1 - HIGHBIT];
8720 bruce 220 99 : if (c2)
221 : {
222 99 : *p++ = lc;
223 99 : *p++ = c2;
224 : }
225 : else
226 : {
738 heikki.linnakangas 227 UBC 0 : if (noError)
228 0 : break;
6167 tgl 229 0 : report_untranslatable_char(encoding, PG_MULE_INTERNAL,
230 : (const char *) l, len);
231 : }
232 : }
6167 tgl 233 CBC 162 : l++;
234 162 : len--;
235 : }
8720 bruce 236 57 : *p = '\0';
237 :
738 heikki.linnakangas 238 57 : return l - start;
239 : }
240 :
241 : /*
242 : * mic2latin_with_table: a generic single byte charset encoding
243 : * conversion from the mule internal code to a local charset.
244 : *
245 : * mic points to the source string of length len
246 : * p is the output area (must be large enough!)
247 : * lc is the mule character set id for the local encoding
248 : * encoding is the PG identifier for the local encoding
249 : * tab holds conversion entries for the mule internal code's second byte,
250 : * starting from 128 (0x80). each entry in the table holds the corresponding
251 : * code point for the local charset, or 0 if there is no equivalent code.
252 : *
253 : * Returns the number of input bytes consumed. If noError is true, this can
254 : * be less than 'len'.
255 : */
256 : int
6167 tgl 257 174 : mic2latin_with_table(const unsigned char *mic,
258 : unsigned char *p,
259 : int len,
260 : int lc,
261 : int encoding,
262 : const unsigned char *tab,
263 : bool noError)
264 : {
738 heikki.linnakangas 265 174 : const unsigned char *start = mic;
266 : unsigned char c1,
267 : c2;
268 :
6167 tgl 269 408 : while (len > 0)
270 : {
271 378 : c1 = *mic;
272 378 : if (c1 == 0)
273 : {
738 heikki.linnakangas 274 UBC 0 : if (noError)
275 0 : break;
6167 tgl 276 0 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
277 : }
6167 tgl 278 CBC 378 : if (!IS_HIGHBIT_SET(c1))
279 : {
280 : /* easy for ASCII */
281 171 : *p++ = c1;
282 171 : mic++;
8720 bruce 283 171 : len--;
284 : }
285 : else
286 : {
1179 tgl 287 207 : int l = pg_mule_mblen(mic);
288 :
6167 289 207 : if (len < l)
290 : {
738 heikki.linnakangas 291 54 : if (noError)
292 27 : break;
6167 tgl 293 27 : report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
294 : len);
295 : }
296 153 : if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
297 63 : (c2 = tab[mic[1] - HIGHBIT]) == 0)
298 : {
738 heikki.linnakangas 299 90 : if (noError)
300 45 : break;
6167 tgl 301 45 : report_untranslatable_char(PG_MULE_INTERNAL, encoding,
302 : (const char *) mic, len);
303 : break; /* keep compiler quiet */
304 : }
305 63 : *p++ = c2;
306 63 : mic += 2;
307 63 : len -= 2;
308 : }
309 : }
8720 bruce 310 102 : *p = '\0';
311 :
738 heikki.linnakangas 312 102 : return mic - start;
313 : }
314 :
315 : /*
316 : * comparison routine for bsearch()
317 : * this routine is intended for combined UTF8 -> local code
318 : */
319 : static int
5859 ishii 320 234 : compare3(const void *p1, const void *p2)
321 : {
322 : uint32 s1,
323 : s2,
324 : d1,
325 : d2;
326 :
4228 peter_e 327 234 : s1 = *(const uint32 *) p1;
328 234 : s2 = *((const uint32 *) p1 + 1);
329 234 : d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
330 234 : d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
5859 ishii 331 234 : return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
332 : }
333 :
334 : /*
335 : * comparison routine for bsearch()
336 : * this routine is intended for local code -> combined UTF8
337 : */
338 : static int
339 81 : compare4(const void *p1, const void *p2)
340 : {
341 : uint32 v1,
342 : v2;
343 :
4228 peter_e 344 81 : v1 = *(const uint32 *) p1;
345 81 : v2 = ((const pg_local_to_utf_combined *) p2)->code;
5859 ishii 346 81 : return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
347 : }
348 :
349 : /*
350 : * store 32bit character representation into multibyte stream
351 : */
352 : static inline unsigned char *
2887 tgl 353 612 : store_coded_char(unsigned char *dest, uint32 code)
354 : {
5859 ishii 355 612 : if (code & 0xff000000)
2887 tgl 356 63 : *dest++ = code >> 24;
5859 ishii 357 612 : if (code & 0x00ff0000)
2887 tgl 358 261 : *dest++ = code >> 16;
5859 ishii 359 612 : if (code & 0x0000ff00)
2887 tgl 360 549 : *dest++ = code >> 8;
5859 ishii 361 612 : if (code & 0x000000ff)
2887 tgl 362 612 : *dest++ = code;
363 612 : return dest;
364 : }
365 :
366 : /*
367 : * Convert a character using a conversion radix tree.
368 : *
369 : * 'l' is the length of the input character in bytes, and b1-b4 are
370 : * the input character's bytes.
371 : */
372 : static inline uint32
2218 heikki.linnakangas 373 1035 : pg_mb_radix_conv(const pg_mb_radix_tree *rt,
374 : int l,
375 : unsigned char b1,
376 : unsigned char b2,
377 : unsigned char b3,
378 : unsigned char b4)
379 : {
380 1035 : if (l == 4)
381 : {
382 : /* 4-byte code */
383 :
384 : /* check code validity */
385 45 : if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
386 45 : b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
387 45 : b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
388 45 : b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
2218 heikki.linnakangas 389 UBC 0 : return 0;
390 :
391 : /* perform lookup */
2218 heikki.linnakangas 392 CBC 45 : if (rt->chars32)
393 : {
394 45 : uint32 idx = rt->b4root;
395 :
396 45 : idx = rt->chars32[b1 + idx - rt->b4_1_lower];
397 45 : idx = rt->chars32[b2 + idx - rt->b4_2_lower];
398 45 : idx = rt->chars32[b3 + idx - rt->b4_3_lower];
399 45 : return rt->chars32[b4 + idx - rt->b4_4_lower];
400 : }
401 : else
402 : {
2218 heikki.linnakangas 403 UBC 0 : uint16 idx = rt->b4root;
404 :
405 0 : idx = rt->chars16[b1 + idx - rt->b4_1_lower];
406 0 : idx = rt->chars16[b2 + idx - rt->b4_2_lower];
407 0 : idx = rt->chars16[b3 + idx - rt->b4_3_lower];
408 0 : return rt->chars16[b4 + idx - rt->b4_4_lower];
409 : }
410 : }
2218 heikki.linnakangas 411 CBC 990 : else if (l == 3)
412 : {
413 : /* 3-byte code */
414 :
415 : /* check code validity */
416 468 : if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
417 144 : b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
418 144 : b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
419 324 : return 0;
420 :
421 : /* perform lookup */
422 144 : if (rt->chars32)
423 : {
424 144 : uint32 idx = rt->b3root;
425 :
426 144 : idx = rt->chars32[b2 + idx - rt->b3_1_lower];
427 144 : idx = rt->chars32[b3 + idx - rt->b3_2_lower];
428 144 : return rt->chars32[b4 + idx - rt->b3_3_lower];
429 : }
430 : else
431 : {
2218 heikki.linnakangas 432 UBC 0 : uint16 idx = rt->b3root;
433 :
434 0 : idx = rt->chars16[b2 + idx - rt->b3_1_lower];
435 0 : idx = rt->chars16[b3 + idx - rt->b3_2_lower];
436 0 : return rt->chars16[b4 + idx - rt->b3_3_lower];
437 : }
438 : }
2218 heikki.linnakangas 439 CBC 522 : else if (l == 2)
440 : {
441 : /* 2-byte code */
442 :
443 : /* check code validity - first byte */
444 378 : if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
445 342 : b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
446 36 : return 0;
447 :
448 : /* perform lookup */
449 342 : if (rt->chars32)
450 : {
451 261 : uint32 idx = rt->b2root;
452 :
453 261 : idx = rt->chars32[b3 + idx - rt->b2_1_lower];
454 261 : return rt->chars32[b4 + idx - rt->b2_2_lower];
455 : }
456 : else
457 : {
458 81 : uint16 idx = rt->b2root;
459 :
460 81 : idx = rt->chars16[b3 + idx - rt->b2_1_lower];
461 81 : return rt->chars16[b4 + idx - rt->b2_2_lower];
462 : }
463 : }
464 144 : else if (l == 1)
465 : {
466 : /* 1-byte code */
467 :
468 : /* check code validity - first byte */
469 144 : if (b4 < rt->b1_lower || b4 > rt->b1_upper)
2218 heikki.linnakangas 470 UBC 0 : return 0;
471 :
472 : /* perform lookup */
2218 heikki.linnakangas 473 CBC 144 : if (rt->chars32)
474 99 : return rt->chars32[b4 + rt->b1root - rt->b1_lower];
475 : else
476 45 : return rt->chars16[b4 + rt->b1root - rt->b1_lower];
477 : }
2153 bruce 478 UBC 0 : return 0; /* shouldn't happen */
479 : }
480 :
481 : /*
482 : * UTF8 ---> local code
483 : *
484 : * utf: input string in UTF8 encoding (need not be null-terminated)
485 : * len: length of input string (in bytes)
486 : * iso: pointer to the output area (must be large enough!)
487 : (output string will be null-terminated)
488 : * map: conversion map for single characters
489 : * cmap: conversion map for combined characters
490 : * (optional, pass NULL if none)
491 : * cmapsize: number of entries in the conversion map for combined characters
492 : * (optional, pass 0 if none)
493 : * conv_func: algorithmic encoding conversion function
494 : * (optional, pass NULL if none)
495 : * encoding: PG identifier for the local encoding
496 : *
497 : * For each character, the cmap (if provided) is consulted first; if no match,
498 : * the map is consulted next; if still no match, the conv_func (if provided)
499 : * is applied. An error is raised if no match is found.
500 : *
501 : * See pg_wchar.h for more details about the data structures used here.
502 : *
503 : * Returns the number of input bytes consumed. If noError is true, this can
504 : * be less than 'len'.
505 : */
506 : int
2887 tgl 507 CBC 1104 : UtfToLocal(const unsigned char *utf, int len,
508 : unsigned char *iso,
509 : const pg_mb_radix_tree *map,
510 : const pg_utf_to_local_combined *cmap, int cmapsize,
511 : utf_local_conversion_func conv_func,
512 : int encoding, bool noError)
513 : {
514 : uint32 iutf;
515 : int l;
516 : const pg_utf_to_local_combined *cp;
738 heikki.linnakangas 517 1104 : const unsigned char *start = utf;
518 :
2887 tgl 519 1104 : if (!PG_VALID_ENCODING(encoding))
2887 tgl 520 UBC 0 : ereport(ERROR,
521 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
522 : errmsg("invalid encoding number: %d", encoding)));
523 :
6167 tgl 524 CBC 3030 : for (; len > 0; len -= l)
525 : {
2218 heikki.linnakangas 526 2736 : unsigned char b1 = 0;
527 2736 : unsigned char b2 = 0;
528 2736 : unsigned char b3 = 0;
529 2736 : unsigned char b4 = 0;
530 :
531 : /* "break" cases all represent errors */
6167 tgl 532 2736 : if (*utf == '\0')
533 90 : break;
534 :
8214 ishii 535 2646 : l = pg_utf_mblen(utf);
6167 tgl 536 2646 : if (len < l)
537 108 : break;
538 :
539 2538 : if (!pg_utf8_islegal(utf, l))
540 180 : break;
541 :
8214 ishii 542 2358 : if (l == 1)
543 : {
544 : /* ASCII case is easy, assume it's one-to-one conversion */
545 1656 : *iso++ = *utf++;
546 1656 : continue;
547 : }
548 :
549 : /* collect coded char of length l */
2887 tgl 550 702 : if (l == 2)
551 : {
2218 heikki.linnakangas 552 207 : b3 = *utf++;
553 207 : b4 = *utf++;
554 : }
6507 bruce 555 495 : else if (l == 3)
556 : {
2218 heikki.linnakangas 557 495 : b2 = *utf++;
558 495 : b3 = *utf++;
559 495 : b4 = *utf++;
560 : }
6507 bruce 561 UBC 0 : else if (l == 4)
562 : {
2218 heikki.linnakangas 563 0 : b1 = *utf++;
564 0 : b2 = *utf++;
565 0 : b3 = *utf++;
566 0 : b4 = *utf++;
567 : }
568 : else
569 : {
3552 tgl 570 0 : elog(ERROR, "unsupported character length %d", l);
571 : iutf = 0; /* keep compiler quiet */
572 : }
2218 heikki.linnakangas 573 CBC 702 : iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
574 :
575 : /* First, try with combined map if possible */
5859 ishii 576 702 : if (cmap && len > l)
577 : {
5624 bruce 578 72 : const unsigned char *utf_save = utf;
579 72 : int len_save = len;
580 72 : int l_save = l;
581 :
582 : /* collect next character, same as above */
5859 ishii 583 72 : len -= l;
584 :
585 72 : l = pg_utf_mblen(utf);
586 72 : if (len < l)
587 : {
588 : /* need more data to decide if this is a combined char */
738 heikki.linnakangas 589 18 : utf -= l_save;
5859 ishii 590 18 : break;
591 : }
592 :
593 54 : if (!pg_utf8_islegal(utf, l))
594 : {
738 heikki.linnakangas 595 UBC 0 : if (!noError)
596 0 : report_invalid_encoding(PG_UTF8, (const char *) utf, len);
597 0 : utf -= l_save;
5859 ishii 598 0 : break;
599 : }
600 :
601 : /* We assume ASCII character cannot be in combined map */
2887 tgl 602 CBC 54 : if (l > 1)
603 : {
604 : uint32 iutf2;
605 : uint32 cutf[2];
606 :
607 54 : if (l == 2)
608 : {
609 27 : iutf2 = *utf++ << 8;
610 27 : iutf2 |= *utf++;
611 : }
612 27 : else if (l == 3)
613 : {
614 27 : iutf2 = *utf++ << 16;
615 27 : iutf2 |= *utf++ << 8;
616 27 : iutf2 |= *utf++;
617 : }
2887 tgl 618 UBC 0 : else if (l == 4)
619 : {
620 0 : iutf2 = *utf++ << 24;
621 0 : iutf2 |= *utf++ << 16;
622 0 : iutf2 |= *utf++ << 8;
623 0 : iutf2 |= *utf++;
624 : }
625 : else
626 : {
627 0 : elog(ERROR, "unsupported character length %d", l);
628 : iutf2 = 0; /* keep compiler quiet */
629 : }
630 :
2887 tgl 631 CBC 54 : cutf[0] = iutf;
632 54 : cutf[1] = iutf2;
633 :
634 54 : cp = bsearch(cutf, cmap, cmapsize,
635 : sizeof(pg_utf_to_local_combined), compare3);
636 :
637 54 : if (cp)
638 : {
639 9 : iso = store_coded_char(iso, cp->code);
640 9 : continue;
641 : }
642 : }
643 :
644 : /* fail, so back up to reprocess second character next time */
645 45 : utf = utf_save;
646 45 : len = len_save;
647 45 : l = l_save;
648 : }
649 :
650 : /* Now check ordinary map */
2218 heikki.linnakangas 651 675 : if (map)
652 : {
2153 bruce 653 675 : uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
654 :
2218 heikki.linnakangas 655 675 : if (converted)
656 : {
657 225 : iso = store_coded_char(iso, converted);
658 225 : continue;
659 : }
660 : }
661 :
662 : /* if there's a conversion function, try that */
2887 tgl 663 450 : if (conv_func)
664 : {
665 36 : uint32 converted = (*conv_func) (iutf);
666 :
667 36 : if (converted)
668 : {
669 36 : iso = store_coded_char(iso, converted);
670 36 : continue;
671 : }
672 : }
673 :
674 : /* failed to translate this character */
738 heikki.linnakangas 675 414 : utf -= l;
676 414 : if (noError)
677 207 : break;
2887 tgl 678 207 : report_untranslatable_char(PG_UTF8, encoding,
679 : (const char *) utf, len);
680 : }
681 :
682 : /* if we broke out of loop early, must be invalid input */
738 heikki.linnakangas 683 897 : if (len > 0 && !noError)
6167 tgl 684 198 : report_invalid_encoding(PG_UTF8, (const char *) utf, len);
685 :
8214 ishii 686 699 : *iso = '\0';
687 :
738 heikki.linnakangas 688 699 : return utf - start;
689 : }
690 :
691 : /*
692 : * local code ---> UTF8
693 : *
694 : * iso: input string in local encoding (need not be null-terminated)
695 : * len: length of input string (in bytes)
696 : * utf: pointer to the output area (must be large enough!)
697 : (output string will be null-terminated)
698 : * map: conversion map for single characters
699 : * cmap: conversion map for combined characters
700 : * (optional, pass NULL if none)
701 : * cmapsize: number of entries in the conversion map for combined characters
702 : * (optional, pass 0 if none)
703 : * conv_func: algorithmic encoding conversion function
704 : * (optional, pass NULL if none)
705 : * encoding: PG identifier for the local encoding
706 : *
707 : * For each character, the map is consulted first; if no match, the cmap
708 : * (if provided) is consulted next; if still no match, the conv_func
709 : * (if provided) is applied. An error is raised if no match is found.
710 : *
711 : * See pg_wchar.h for more details about the data structures used here.
712 : *
713 : * Returns the number of input bytes consumed. If noError is true, this can
714 : * be less than 'len'.
715 : */
716 : int
2887 tgl 717 732 : LocalToUtf(const unsigned char *iso, int len,
718 : unsigned char *utf,
719 : const pg_mb_radix_tree *map,
720 : const pg_local_to_utf_combined *cmap, int cmapsize,
721 : utf_local_conversion_func conv_func,
722 : int encoding,
723 : bool noError)
724 : {
725 : uint32 iiso;
726 : int l;
727 : const pg_local_to_utf_combined *cp;
738 heikki.linnakangas 728 732 : const unsigned char *start = iso;
729 :
7885 ishii 730 732 : if (!PG_VALID_ENCODING(encoding))
7198 tgl 731 UBC 0 : ereport(ERROR,
732 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
733 : errmsg("invalid encoding number: %d", encoding)));
734 :
6167 tgl 735 CBC 3042 : for (; len > 0; len -= l)
736 : {
2218 heikki.linnakangas 737 2688 : unsigned char b1 = 0;
738 2688 : unsigned char b2 = 0;
739 2688 : unsigned char b3 = 0;
740 2688 : unsigned char b4 = 0;
741 :
742 : /* "break" cases all represent errors */
6167 tgl 743 2688 : if (*iso == '\0')
744 162 : break;
745 :
6314 bruce 746 2526 : if (!IS_HIGHBIT_SET(*iso))
747 : {
748 : /* ASCII case is easy, assume it's one-to-one conversion */
8214 ishii 749 1986 : *utf++ = *iso++;
750 1986 : l = 1;
751 1986 : continue;
752 : }
753 :
801 heikki.linnakangas 754 540 : l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
6167 tgl 755 540 : if (l < 0)
756 180 : break;
757 :
758 : /* collect coded char of length l */
8214 ishii 759 360 : if (l == 1)
2218 heikki.linnakangas 760 144 : b4 = *iso++;
8214 ishii 761 216 : else if (l == 2)
762 : {
2218 heikki.linnakangas 763 171 : b3 = *iso++;
764 171 : b4 = *iso++;
765 : }
8214 ishii 766 45 : else if (l == 3)
767 : {
2218 heikki.linnakangas 768 UBC 0 : b2 = *iso++;
769 0 : b3 = *iso++;
770 0 : b4 = *iso++;
771 : }
8214 ishii 772 CBC 45 : else if (l == 4)
773 : {
2218 heikki.linnakangas 774 45 : b1 = *iso++;
775 45 : b2 = *iso++;
776 45 : b3 = *iso++;
777 45 : b4 = *iso++;
778 : }
779 : else
780 : {
3552 tgl 781 UBC 0 : elog(ERROR, "unsupported character length %d", l);
782 : iiso = 0; /* keep compiler quiet */
783 : }
2218 heikki.linnakangas 784 CBC 360 : iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
785 :
786 360 : if (map)
787 : {
2153 bruce 788 360 : uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
789 :
2218 heikki.linnakangas 790 360 : if (converted)
791 : {
792 279 : utf = store_coded_char(utf, converted);
2887 tgl 793 279 : continue;
794 : }
795 :
796 : /* If there's a combined character map, try that */
2218 heikki.linnakangas 797 81 : if (cmap)
798 : {
799 18 : cp = bsearch(&iiso, cmap, cmapsize,
800 : sizeof(pg_local_to_utf_combined), compare4);
801 :
802 18 : if (cp)
803 : {
804 18 : utf = store_coded_char(utf, cp->utf1);
805 18 : utf = store_coded_char(utf, cp->utf2);
806 18 : continue;
807 : }
808 : }
809 : }
810 :
811 : /* if there's a conversion function, try that */
2887 tgl 812 63 : if (conv_func)
813 : {
814 45 : uint32 converted = (*conv_func) (iiso);
815 :
816 45 : if (converted)
817 : {
818 27 : utf = store_coded_char(utf, converted);
819 27 : continue;
820 : }
821 : }
822 :
823 : /* failed to translate this character */
738 heikki.linnakangas 824 36 : iso -= l;
825 36 : if (noError)
826 18 : break;
2887 tgl 827 18 : report_untranslatable_char(encoding, PG_UTF8,
828 : (const char *) iso, len);
829 : }
830 :
831 : /* if we broke out of loop early, must be invalid input */
738 heikki.linnakangas 832 714 : if (len > 0 && !noError)
6167 tgl 833 171 : report_invalid_encoding(encoding, (const char *) iso, len);
834 :
8214 ishii 835 543 : *utf = '\0';
836 :
738 heikki.linnakangas 837 543 : return iso - start;
838 : }
|