Age Owner TLA Line data Source code
1 : /*-----------------------------------------------------------------------
2 : *
3 : * PostgreSQL locale utilities
4 : *
5 : * Portions Copyright (c) 2002-2023, PostgreSQL Global Development Group
6 : *
7 : * src/backend/utils/adt/pg_locale.c
8 : *
9 : *-----------------------------------------------------------------------
10 : */
11 :
12 : /*----------
13 : * Here is how the locale stuff is handled: LC_COLLATE and LC_CTYPE
14 : * are fixed at CREATE DATABASE time, stored in pg_database, and cannot
15 : * be changed. Thus, the effects of strcoll(), strxfrm(), isupper(),
16 : * toupper(), etc. are always in the same fixed locale.
17 : *
18 : * LC_MESSAGES is settable at run time and will take effect
19 : * immediately.
20 : *
21 : * The other categories, LC_MONETARY, LC_NUMERIC, and LC_TIME are also
22 : * settable at run-time. However, we don't actually set those locale
23 : * categories permanently. This would have bizarre effects like no
24 : * longer accepting standard floating-point literals in some locales.
25 : * Instead, we only set these locale categories briefly when needed,
26 : * cache the required information obtained from localeconv() or
27 : * strftime(), and then set the locale categories back to "C".
28 : * The cached information is only used by the formatting functions
29 : * (to_char, etc.) and the money type. For the user, this should all be
30 : * transparent.
31 : *
32 : * !!! NOW HEAR THIS !!!
33 : *
34 : * We've been bitten repeatedly by this bug, so let's try to keep it in
35 : * mind in future: on some platforms, the locale functions return pointers
36 : * to static data that will be overwritten by any later locale function.
37 : * Thus, for example, the obvious-looking sequence
38 : * save = setlocale(category, NULL);
39 : * if (!setlocale(category, value))
40 : * fail = true;
41 : * setlocale(category, save);
42 : * DOES NOT WORK RELIABLY: on some platforms the second setlocale() call
43 : * will change the memory save is pointing at. To do this sort of thing
44 : * safely, you *must* pstrdup what setlocale returns the first time.
45 : *
46 : * The POSIX locale standard is available here:
47 : *
48 : * http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html
49 : *----------
50 : */
51 :
52 :
53 : #include "postgres.h"
54 :
55 : #include <time.h>
56 :
57 : #include "access/htup_details.h"
58 : #include "catalog/pg_collation.h"
59 : #include "catalog/pg_control.h"
60 : #include "mb/pg_wchar.h"
61 : #include "miscadmin.h"
62 : #include "utils/builtins.h"
63 : #include "utils/formatting.h"
64 : #include "utils/guc_hooks.h"
65 : #include "utils/hsearch.h"
66 : #include "utils/lsyscache.h"
67 : #include "utils/memutils.h"
68 : #include "utils/pg_locale.h"
69 : #include "utils/syscache.h"
70 :
71 : #ifdef USE_ICU
72 : #include <unicode/ucnv.h>
73 : #include <unicode/ustring.h>
74 : #endif
75 :
76 : #ifdef __GLIBC__
77 : #include <gnu/libc-version.h>
78 : #endif
79 :
80 : #ifdef WIN32
81 : #include <shlwapi.h>
82 : #endif
83 :
84 : /*
85 : * This should be large enough that most strings will fit, but small enough
86 : * that we feel comfortable putting it on the stack
87 : */
88 : #define TEXTBUFLEN 1024
89 :
90 : #define MAX_L10N_DATA 80
91 :
92 :
93 : /*
 * GUC settings.
 *
 * Locale names requested for each run-time-settable category.
 * locale_numeric and locale_monetary are applied only temporarily, by
 * PGLC_localeconv(); locale_time is applied only temporarily, by
 * cache_locale_time().
 */
94 : char *locale_messages;
95 : char *locale_monetary;
96 : char *locale_numeric;
97 : char *locale_time;
98 :
/* NOTE(review): presumably the message level used when validating ICU locale names -- confirm against its users */
99 : int icu_validation_level = ERROR;
100 :
101 : /*
102 : * lc_time localization cache.
103 : *
104 : * We use only the first 7 or 12 entries of these arrays. The last array
105 : * element is left as NULL for the convenience of outside code that wants
106 : * to sequentially scan these arrays.
107 : */
108 : char *localized_abbrev_days[7 + 1];
109 : char *localized_full_days[7 + 1];
110 : char *localized_abbrev_months[12 + 1];
111 : char *localized_full_months[12 + 1];
112 :
113 : /* is the database's LC_CTYPE the C locale? */
114 : bool database_ctype_is_c = false;
115 :
116 : /*
 * Indicate whether the cached locale information is valid.
 *
 * CurrentLocaleConvValid guards the struct lconv cached by
 * PGLC_localeconv(); it is cleared by assign_locale_monetary() and
 * assign_locale_numeric().  CurrentLCTimeValid guards the work done by
 * cache_locale_time(); it is cleared by assign_locale_time().
 */
117 : static bool CurrentLocaleConvValid = false;
118 : static bool CurrentLCTimeValid = false;
119 :
120 : /*
 * Cache for collation-related knowledge.
 *
 * One entry per collation, keyed by its pg_collation OID.  The two
 * "is C" flags may only be trusted while flags_valid is true; the locale
 * field is 0 when no locale object is available for the entry.
 */
121 :
122 : typedef struct
123 : {
124 : Oid collid; /* hash key: pg_collation OID */
125 : bool collate_is_c; /* is collation's LC_COLLATE C? */
126 : bool ctype_is_c; /* is collation's LC_CTYPE C? */
127 : bool flags_valid; /* true if above flags are valid */
128 : pg_locale_t locale; /* locale_t struct, or 0 if not valid */
129 : } collation_cache_entry;
130 :
/* NOTE(review): starts out NULL; the code that creates and fills this table is outside the visible part of the file */
131 : static HTAB *collation_cache = NULL;
132 :
133 :
134 : #if defined(WIN32) && defined(LC_MESSAGES)
135 : static char *IsoLocaleName(const char *);
136 : #endif
137 :
138 : #ifdef USE_ICU
139 : /*
140 : * Converter object for converting between ICU's UChar strings and C strings
141 : * in database encoding. Since the database encoding doesn't change, we only
142 : * need one of these per session.
143 : */
144 : static UConverter *icu_converter = NULL;
145 :
146 : static UCollator *pg_ucol_open(const char *loc_str);
147 : static void init_icu_converter(void);
148 : static size_t uchar_length(UConverter *converter,
149 : const char *str, int32_t len);
150 : static int32_t uchar_convert(UConverter *converter,
151 : UChar *dest, int32_t destlen,
152 : const char *str, int32_t srclen);
153 : static void icu_set_collation_attributes(UCollator *collator, const char *loc,
154 : UErrorCode *status);
155 : #endif
156 :
157 : /*
158 : * pg_perm_setlocale
159 : *
160 : * This wraps the libc function setlocale(), with two additions. First, when
161 : * changing LC_CTYPE, update gettext's encoding for the current message
162 : * domain. GNU gettext automatically tracks LC_CTYPE on most platforms, but
163 : * not on Windows. Second, if the operation is successful, the corresponding
164 : * LC_XXX environment variable is set to match. By setting the environment
165 : * variable, we ensure that any subsequent use of setlocale(..., "") will
166 : * preserve the settings made through this routine. Of course, LC_ALL must
167 : * also be unset to fully ensure that, but that has to be done elsewhere after
168 : * all the individual LC_XXX variables have been set correctly. (Thank you
169 : * Perl for making this kluge necessary.)
170 : */
171 : char *
6311 tgl 172 GIC 43528 : pg_perm_setlocale(int category, const char *locale)
173 : {
174 : char *result;
175 : const char *envvar;
176 :
177 : #ifndef WIN32
178 43528 : result = setlocale(category, locale);
179 : #else
180 :
181 : /*
182 : * On Windows, setlocale(LC_MESSAGES) does not work, so just assume that
183 : * the given value is good and set it in the environment variables. We
184 : * must ignore attempts to set to "", which means "keep using the old
185 : * environment value".
186 : */
187 : #ifdef LC_MESSAGES
188 : if (category == LC_MESSAGES)
189 : {
190 : result = (char *) locale;
191 : if (locale == NULL || locale[0] == '\0')
192 : return result;
193 : }
194 : else
195 : #endif
196 : result = setlocale(category, locale);
197 : #endif /* WIN32 */
6311 tgl 198 ECB :
6311 tgl 199 GIC 43528 : if (result == NULL)
6311 tgl 200 UIC 0 : return result; /* fall out immediately on failure */
201 :
202 : /*
203 : * Use the right encoding in translated messages. Under ENABLE_NLS, let
3574 noah 204 ECB : * pg_bind_textdomain_codeset() figure it out. Under !ENABLE_NLS, message
205 : * format strings are ASCII, but database-encoding strings may enter the
206 : * message via %s. This makes the overall message encoding equal to the
207 : * database encoding.
208 : */
3574 noah 209 GIC 43528 : if (category == LC_CTYPE)
210 : {
211 : static char save_lc_ctype[LOCALE_NAME_BUFLEN];
212 :
213 : /* copy setlocale() return value before callee invokes it again */
2850 214 12785 : strlcpy(save_lc_ctype, result, sizeof(save_lc_ctype));
215 12785 : result = save_lc_ctype;
216 :
217 : #ifdef ENABLE_NLS
3574 218 12785 : SetMessageEncoding(pg_bind_textdomain_codeset(textdomain(NULL)));
219 : #else
220 : SetMessageEncoding(GetDatabaseEncoding());
221 : #endif
222 : }
223 :
6311 tgl 224 43528 : switch (category)
6311 tgl 225 ECB : {
6311 tgl 226 GBC 12785 : case LC_COLLATE:
6311 tgl 227 GIC 12785 : envvar = "LC_COLLATE";
228 12785 : break;
229 12785 : case LC_CTYPE:
230 12785 : envvar = "LC_CTYPE";
231 12785 : break;
232 : #ifdef LC_MESSAGES
233 10101 : case LC_MESSAGES:
234 10101 : envvar = "LC_MESSAGES";
5191 magnus 235 ECB : #ifdef WIN32
236 : result = IsoLocaleName(locale);
237 : if (result == NULL)
238 : result = (char *) locale;
239 : elog(DEBUG3, "IsoLocaleName() executed; locale: \"%s\"", result);
2118 tgl 240 : #endif /* WIN32 */
6311 tgl 241 CBC 10101 : break;
242 : #endif /* LC_MESSAGES */
6311 tgl 243 GIC 2619 : case LC_MONETARY:
6311 tgl 244 CBC 2619 : envvar = "LC_MONETARY";
6311 tgl 245 GIC 2619 : break;
246 2619 : case LC_NUMERIC:
247 2619 : envvar = "LC_NUMERIC";
248 2619 : break;
249 2619 : case LC_TIME:
6311 tgl 250 CBC 2619 : envvar = "LC_TIME";
6311 tgl 251 GIC 2619 : break;
6311 tgl 252 LBC 0 : default:
253 0 : elog(FATAL, "unrecognized LC category: %d", category);
830 tgl 254 ECB : return NULL; /* keep compiler quiet */
6311 255 : }
256 :
830 tgl 257 CBC 43528 : if (setenv(envvar, result, 1) != 0)
6311 tgl 258 UIC 0 : return NULL;
6311 tgl 259 ECB :
6311 tgl 260 CBC 43528 : return result;
261 : }
262 :
263 :
264 : /*
265 : * Is the locale name valid for the locale category?
266 : *
4032 tgl 267 ECB : * If successful, and canonname isn't NULL, a palloc'd copy of the locale's
268 : * canonical name is stored there. This is especially useful for figuring out
269 : * what locale name "" means (ie, the server environment value). (Actually,
270 : * it seems that on most implementations that's the only thing it's good for;
271 : * we could wish that setlocale gave back a canonically spelled version of
272 : * the locale name, but typically it doesn't.)
5311 heikki.linnakangas 273 : */
274 : bool
4032 tgl 275 CBC 33149 : check_locale(int category, const char *locale, char **canonname)
5311 heikki.linnakangas 276 ECB : {
277 : char *save;
4032 tgl 278 EUB : char *res;
279 :
4032 tgl 280 GIC 33149 : if (canonname)
281 1607 : *canonname = NULL; /* in case of failure */
282 :
5311 heikki.linnakangas 283 CBC 33149 : save = setlocale(category, NULL);
5311 heikki.linnakangas 284 GBC 33149 : if (!save)
5311 heikki.linnakangas 285 UIC 0 : return false; /* won't happen, we hope */
5311 heikki.linnakangas 286 ECB :
287 : /* save may be pointing at a modifiable scratch variable, see above. */
5311 heikki.linnakangas 288 GIC 33149 : save = pstrdup(save);
289 :
290 : /* set the locale with setlocale, to see if it accepts it. */
4032 tgl 291 33149 : res = setlocale(category, locale);
292 :
293 : /* save canonical name if requested. */
294 33149 : if (res && canonname)
295 1605 : *canonname = pstrdup(res);
296 :
297 : /* restore old value. */
4238 heikki.linnakangas 298 33149 : if (!setlocale(category, save))
4032 tgl 299 UIC 0 : elog(WARNING, "failed to restore old locale \"%s\"", save);
5311 heikki.linnakangas 300 GIC 33149 : pfree(save);
5311 heikki.linnakangas 301 ECB :
4032 tgl 302 GIC 33149 : return (res != NULL);
303 : }
304 :
305 :
7548 peter_e 306 ECB : /*
4385 tgl 307 : * GUC check/assign hooks
308 : *
309 : * For most locale categories, the assign hook doesn't actually set the locale
310 : * permanently, just reset flags so that the next use will cache the
3260 bruce 311 EUB : * appropriate values. (See explanation at the top of this file.)
312 : *
313 : * Note: we accept value = "" as selecting the postmaster's environment
6154 tgl 314 ECB : * value, whatever it was (so long as the environment setting is legal).
315 : * This will have been locked down by an earlier call to pg_perm_setlocale.
316 : */
4385 317 : bool
4385 tgl 318 GIC 9039 : check_locale_monetary(char **newval, void **extra, GucSource source)
319 : {
4032 tgl 320 CBC 9039 : return check_locale(LC_MONETARY, *newval, NULL);
8493 bruce 321 ECB : }
322 :
323 : void
4385 tgl 324 CBC 8949 : assign_locale_monetary(const char *newval, void *extra)
4385 tgl 325 EUB : {
4385 tgl 326 CBC 8949 : CurrentLocaleConvValid = false;
4385 tgl 327 GIC 8949 : }
7548 peter_e 328 ECB :
329 : bool
4385 tgl 330 GIC 9042 : check_locale_numeric(char **newval, void **extra, GucSource source)
331 : {
4032 332 9042 : return check_locale(LC_NUMERIC, *newval, NULL);
333 : }
334 :
335 : void
4385 336 8955 : assign_locale_numeric(const char *newval, void *extra)
337 : {
338 8955 : CurrentLocaleConvValid = false;
7676 peter_e 339 8955 : }
340 :
341 : bool
4385 tgl 342 9042 : check_locale_time(char **newval, void **extra, GucSource source)
343 : {
4032 tgl 344 CBC 9042 : return check_locale(LC_TIME, *newval, NULL);
345 : }
8493 bruce 346 ECB :
347 : void
4385 tgl 348 GIC 8952 : assign_locale_time(const char *newval, void *extra)
349 : {
4385 tgl 350 CBC 8952 : CurrentLCTimeValid = false;
4385 tgl 351 GIC 8952 : }
8170 tgl 352 ECB :
7548 peter_e 353 : /*
354 : * We allow LC_MESSAGES to actually be set globally.
355 : *
6154 tgl 356 : * Note: we normally disallow value = "" because it wouldn't have consistent
357 : * semantics (it'd effectively just use the previous value). However, this
358 : * is the value passed for PGC_S_DEFAULT, so don't complain in that case,
359 : * not even if the attempted setting fails due to invalid environment value.
360 : * The idea there is just to accept the environment setting *if possible*
361 : * during startup, until we can read the proper value from postgresql.conf.
7548 peter_e 362 : */
363 : bool
4385 tgl 364 CBC 7569 : check_locale_messages(char **newval, void **extra, GucSource source)
7548 peter_e 365 ECB : {
4385 tgl 366 GIC 7569 : if (**newval == '\0')
367 : {
4385 tgl 368 CBC 3150 : if (source == PGC_S_DEFAULT)
4385 tgl 369 GIC 3150 : return true;
4385 tgl 370 ECB : else
4385 tgl 371 UIC 0 : return false;
372 : }
373 :
7522 bruce 374 ECB : /*
375 : * LC_MESSAGES category does not exist everywhere, but accept it anyway
6311 tgl 376 : *
4385 377 : * On Windows, we can't even check the value, so accept blindly
378 : */
379 : #if defined(LC_MESSAGES) && !defined(WIN32)
4032 tgl 380 GIC 4419 : return check_locale(LC_MESSAGES, *newval, NULL);
381 : #else
382 : return true;
383 : #endif
384 : }
385 :
/*
 * GUC assign hook for the messages locale: apply the new value for real via
 * pg_perm_setlocale().
 */
void
assign_locale_messages(const char *newval, void *extra)
{
#ifdef LC_MESSAGES

	/*
	 * The result is deliberately discarded: a value that cannot be applied
	 * is tolerated here.  On platforms without LC_MESSAGES this hook does
	 * nothing at all.
	 */
	(void) pg_perm_setlocale(LC_MESSAGES, newval);
#endif
}
7548 peter_e 397 EUB :
398 :
/*
 * Release every malloc'd string hanging off a struct lconv.  The struct
 * itself is not freed and its pointer fields are not reset.
 *
 * This must never throw elog(ERROR); it only calls free().
 */
static void
free_struct_lconv(struct lconv *s)
{
	/* Same fields, in the same order, as struct_lconv_is_valid() checks. */
	char	   *const owned[] = {
		s->decimal_point,
		s->thousands_sep,
		s->grouping,
		s->int_curr_symbol,
		s->currency_symbol,
		s->mon_decimal_point,
		s->mon_thousands_sep,
		s->mon_grouping,
		s->positive_sign,
		s->negative_sign
	};
	size_t		i;

	for (i = 0; i < sizeof(owned) / sizeof(owned[0]); i++)
		free(owned[i]);
}
417 :
/*
 * Report whether every string field of a struct lconv that we care about
 * is non-NULL.  The set of fields tested here must stay in step with the
 * set released by free_struct_lconv().
 */
static bool
struct_lconv_is_valid(struct lconv *s)
{
	return s->decimal_point != NULL &&
		s->thousands_sep != NULL &&
		s->grouping != NULL &&
		s->int_curr_symbol != NULL &&
		s->currency_symbol != NULL &&
		s->mon_decimal_point != NULL &&
		s->mon_thousands_sep != NULL &&
		s->mon_grouping != NULL &&
		s->positive_sign != NULL &&
		s->negative_sign != NULL;
}
7548 peter_e 447 ECB :
7548 peter_e 448 EUB :
4735 itagaki.takahiro 449 ECB : /*
2330 tgl 450 EUB : * Convert the strdup'd string at *str from the specified encoding to the
4735 itagaki.takahiro 451 ECB : * database encoding.
4735 itagaki.takahiro 452 EUB : */
2330 tgl 453 ECB : static void
2330 tgl 454 GBC 408 : db_encoding_convert(int encoding, char **str)
4735 itagaki.takahiro 455 ECB : {
4660 bruce 456 EUB : char *pstr;
4660 bruce 457 ECB : char *mstr;
4735 itagaki.takahiro 458 EUB :
4735 itagaki.takahiro 459 ECB : /* convert the string to the database encoding */
2330 tgl 460 GBC 408 : pstr = pg_any_to_server(*str, strlen(*str), encoding);
2330 tgl 461 CBC 408 : if (pstr == *str)
2330 tgl 462 GIC 408 : return; /* no conversion happened */
463 :
464 : /* need it malloc'd not palloc'd */
4735 itagaki.takahiro 465 UIC 0 : mstr = strdup(pstr);
2330 tgl 466 0 : if (mstr == NULL)
467 0 : ereport(ERROR,
468 : (errcode(ERRCODE_OUT_OF_MEMORY),
469 : errmsg("out of memory")));
2330 tgl 470 ECB :
471 : /* replace old string */
2330 tgl 472 UIC 0 : free(*str);
473 0 : *str = mstr;
474 :
475 0 : pfree(pstr);
4735 itagaki.takahiro 476 ECB : }
477 :
478 :
479 : /*
480 : * Return the POSIX lconv struct (contains number/money formatting
7676 peter_e 481 EUB : * information) with locale information for all categories.
8493 bruce 482 : */
483 : struct lconv *
8422 tgl 484 GIC 7968 : PGLC_localeconv(void)
485 : {
486 : static struct lconv CurrentLocaleConv;
487 : static bool CurrentLocaleConvAllocated = false;
7632 tgl 488 EUB : struct lconv *extlconv;
2330 489 : struct lconv worklconv;
490 : char *save_lc_monetary;
7548 peter_e 491 : char *save_lc_numeric;
492 : #ifdef WIN32
493 : char *save_lc_ctype;
494 : #endif
495 :
496 : /* Did we do it already? */
7862 tgl 497 GIC 7968 : if (CurrentLocaleConvValid)
498 7917 : return &CurrentLocaleConv;
499 :
2597 tgl 500 ECB : /* Free any already-allocated storage */
2597 tgl 501 GIC 51 : if (CurrentLocaleConvAllocated)
502 : {
503 3 : free_struct_lconv(&CurrentLocaleConv);
504 3 : CurrentLocaleConvAllocated = false;
505 : }
506 :
507 : /*
508 : * This is tricky because we really don't want to risk throwing error
509 : * while the locale is set to other than our usual settings. Therefore,
510 : * the process is: collect the usual settings, set locale to special
511 : * setting, copy relevant data into worklconv using strdup(), restore
512 : * normal settings, convert data to desired encoding, and finally stash
2330 tgl 513 ECB : * the collected data in CurrentLocaleConv. This makes it safe if we
514 : * throw an error during encoding conversion or run out of memory anywhere
515 : * in the process. All data pointed to by struct lconv members is
516 : * allocated with strdup, to avoid premature elog(ERROR) and to allow
517 : * using a single cleanup routine.
518 : */
2330 tgl 519 CBC 51 : memset(&worklconv, 0, sizeof(worklconv));
2330 tgl 520 ECB :
521 : /* Save prevailing values of monetary and numeric locales */
7548 peter_e 522 GIC 51 : save_lc_monetary = setlocale(LC_MONETARY, NULL);
1447 tgl 523 51 : if (!save_lc_monetary)
1447 tgl 524 UIC 0 : elog(ERROR, "setlocale(NULL) failed");
1447 tgl 525 GIC 51 : save_lc_monetary = pstrdup(save_lc_monetary);
526 :
7548 peter_e 527 51 : save_lc_numeric = setlocale(LC_NUMERIC, NULL);
1447 tgl 528 51 : if (!save_lc_numeric)
1447 tgl 529 UIC 0 : elog(ERROR, "setlocale(NULL) failed");
1447 tgl 530 GIC 51 : save_lc_numeric = pstrdup(save_lc_numeric);
531 :
532 : #ifdef WIN32
533 :
534 : /*
1447 tgl 535 ECB : * The POSIX standard explicitly says that it is undefined what happens if
536 : * LC_MONETARY or LC_NUMERIC imply an encoding (codeset) different from
537 : * that implied by LC_CTYPE. In practice, all Unix-ish platforms seem to
538 : * believe that localeconv() should return strings that are encoded in the
539 : * codeset implied by the LC_MONETARY or LC_NUMERIC locale name. Hence,
1447 tgl 540 EUB : * once we have successfully collected the localeconv() results, we will
1447 tgl 541 ECB : * convert them from that codeset to the desired server encoding.
542 : *
543 : * Windows, of course, resolutely does things its own way; on that
544 : * platform LC_CTYPE has to match LC_MONETARY/LC_NUMERIC to get sane
1447 tgl 545 EUB : * results. Hence, we must temporarily set that category as well.
4660 bruce 546 ECB : */
547 :
548 : /* Save prevailing value of ctype locale */
549 : save_lc_ctype = setlocale(LC_CTYPE, NULL);
550 : if (!save_lc_ctype)
551 : elog(ERROR, "setlocale(NULL) failed");
552 : save_lc_ctype = pstrdup(save_lc_ctype);
553 :
554 : /* Here begins the critical section where we must not throw error */
555 :
556 : /* use numeric to set the ctype */
557 : setlocale(LC_CTYPE, locale_numeric);
558 : #endif
559 :
560 : /* Get formatting information for numeric */
4789 bruce 561 GIC 51 : setlocale(LC_NUMERIC, locale_numeric);
4735 itagaki.takahiro 562 51 : extlconv = localeconv();
563 :
564 : /* Must copy data now in case setlocale() overwrites it */
2330 tgl 565 51 : worklconv.decimal_point = strdup(extlconv->decimal_point);
566 51 : worklconv.thousands_sep = strdup(extlconv->thousands_sep);
567 51 : worklconv.grouping = strdup(extlconv->grouping);
568 :
569 : #ifdef WIN32
570 : /* use monetary to set the ctype */
571 : setlocale(LC_CTYPE, locale_monetary);
572 : #endif
573 :
574 : /* Get formatting information for monetary */
4735 itagaki.takahiro 575 51 : setlocale(LC_MONETARY, locale_monetary);
7862 tgl 576 51 : extlconv = localeconv();
7862 tgl 577 ECB :
2330 578 : /* Must copy data now in case setlocale() overwrites it */
2330 tgl 579 GIC 51 : worklconv.int_curr_symbol = strdup(extlconv->int_curr_symbol);
580 51 : worklconv.currency_symbol = strdup(extlconv->currency_symbol);
2330 tgl 581 CBC 51 : worklconv.mon_decimal_point = strdup(extlconv->mon_decimal_point);
582 51 : worklconv.mon_thousands_sep = strdup(extlconv->mon_thousands_sep);
583 51 : worklconv.mon_grouping = strdup(extlconv->mon_grouping);
2330 tgl 584 GIC 51 : worklconv.positive_sign = strdup(extlconv->positive_sign);
585 51 : worklconv.negative_sign = strdup(extlconv->negative_sign);
586 : /* Copy scalar fields as well */
587 51 : worklconv.int_frac_digits = extlconv->int_frac_digits;
588 51 : worklconv.frac_digits = extlconv->frac_digits;
589 51 : worklconv.p_cs_precedes = extlconv->p_cs_precedes;
590 51 : worklconv.p_sep_by_space = extlconv->p_sep_by_space;
2330 tgl 591 CBC 51 : worklconv.n_cs_precedes = extlconv->n_cs_precedes;
592 51 : worklconv.n_sep_by_space = extlconv->n_sep_by_space;
2330 tgl 593 GIC 51 : worklconv.p_sign_posn = extlconv->p_sign_posn;
594 51 : worklconv.n_sign_posn = extlconv->n_sign_posn;
7548 peter_e 595 ECB :
1447 tgl 596 : /*
597 : * Restore the prevailing locale settings; failure to do so is fatal.
598 : * Possibly we could limp along with nondefault LC_MONETARY or LC_NUMERIC,
599 : * but proceeding with the wrong value of LC_CTYPE would certainly be bad
600 : * news; and considering that the prevailing LC_MONETARY and LC_NUMERIC
601 : * are almost certainly "C", there's really no reason that restoring those
602 : * should fail.
603 : */
4735 itagaki.takahiro 604 : #ifdef WIN32
1447 tgl 605 : if (!setlocale(LC_CTYPE, save_lc_ctype))
606 : elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
4735 itagaki.takahiro 607 : #endif
1447 tgl 608 CBC 51 : if (!setlocale(LC_MONETARY, save_lc_monetary))
1447 tgl 609 LBC 0 : elog(FATAL, "failed to restore LC_MONETARY to \"%s\"", save_lc_monetary);
1447 tgl 610 CBC 51 : if (!setlocale(LC_NUMERIC, save_lc_numeric))
1447 tgl 611 UIC 0 : elog(FATAL, "failed to restore LC_NUMERIC to \"%s\"", save_lc_numeric);
612 :
613 : /*
614 : * At this point we've done our best to clean up, and can call functions
615 : * that might possibly throw errors with a clean conscience. But let's
616 : * make sure we don't leak any already-strdup'd fields in worklconv.
617 : */
2330 tgl 618 GIC 51 : PG_TRY();
619 : {
620 : int encoding;
621 :
622 : /* Release the pstrdup'd locale names */
1447 623 51 : pfree(save_lc_monetary);
1447 tgl 624 CBC 51 : pfree(save_lc_numeric);
2330 tgl 625 EUB : #ifdef WIN32
1447 tgl 626 ECB : pfree(save_lc_ctype);
2330 tgl 627 EUB : #endif
628 :
629 : /* If any of the preceding strdup calls failed, complain now. */
2330 tgl 630 GIC 51 : if (!struct_lconv_is_valid(&worklconv))
2330 tgl 631 UIC 0 : ereport(ERROR,
632 : (errcode(ERRCODE_OUT_OF_MEMORY),
633 : errmsg("out of memory")));
2330 tgl 634 ECB :
635 : /*
636 : * Now we must perform encoding conversion from whatever's associated
637 : * with the locales into the database encoding. If we can't identify
638 : * the encoding implied by LC_NUMERIC or LC_MONETARY (ie we get -1),
1447 639 : * use PG_SQL_ASCII, which will result in just validating that the
640 : * strings are OK in the database encoding.
641 : */
2330 tgl 642 GIC 51 : encoding = pg_get_encoding_from_locale(locale_numeric, true);
1447 643 51 : if (encoding < 0)
1447 tgl 644 UIC 0 : encoding = PG_SQL_ASCII;
645 :
2330 tgl 646 CBC 51 : db_encoding_convert(encoding, &worklconv.decimal_point);
2330 tgl 647 GBC 51 : db_encoding_convert(encoding, &worklconv.thousands_sep);
648 : /* grouping is not text and does not require conversion */
649 :
2330 tgl 650 GIC 51 : encoding = pg_get_encoding_from_locale(locale_monetary, true);
1447 651 51 : if (encoding < 0)
1447 tgl 652 UIC 0 : encoding = PG_SQL_ASCII;
653 :
2330 tgl 654 GIC 51 : db_encoding_convert(encoding, &worklconv.int_curr_symbol);
655 51 : db_encoding_convert(encoding, &worklconv.currency_symbol);
656 51 : db_encoding_convert(encoding, &worklconv.mon_decimal_point);
657 51 : db_encoding_convert(encoding, &worklconv.mon_thousands_sep);
2330 tgl 658 ECB : /* mon_grouping is not text and does not require conversion */
2330 tgl 659 CBC 51 : db_encoding_convert(encoding, &worklconv.positive_sign);
2330 tgl 660 GBC 51 : db_encoding_convert(encoding, &worklconv.negative_sign);
661 : }
2330 tgl 662 LBC 0 : PG_CATCH();
2330 tgl 663 ECB : {
2330 tgl 664 UIC 0 : free_struct_lconv(&worklconv);
665 0 : PG_RE_THROW();
2330 tgl 666 ECB : }
2330 tgl 667 CBC 51 : PG_END_TRY();
2330 tgl 668 EUB :
669 : /*
2330 tgl 670 ECB : * Everything is good, so save the results.
671 : */
2330 tgl 672 CBC 51 : CurrentLocaleConv = worklconv;
673 51 : CurrentLocaleConvAllocated = true;
7862 tgl 674 GIC 51 : CurrentLocaleConvValid = true;
7862 tgl 675 CBC 51 : return &CurrentLocaleConv;
8493 bruce 676 ECB : }
677 :
5203 magnus 678 EUB : #ifdef WIN32
679 : /*
1447 tgl 680 : * On Windows, strftime() returns its output in encoding CP_ACP (the default
681 : * operating system codepage for the computer), which is likely different
682 : * from SERVER_ENCODING. This is especially important in Japanese versions
4733 bruce 683 ECB : * of Windows which will use SJIS encoding, which we don't support as a
684 : * server encoding.
685 : *
686 : * So, instead of using strftime(), use wcsftime() to return the value in
687 : * wide characters (internally UTF16) and then convert to UTF8, which we
1447 tgl 688 : * know how to handle directly.
5203 magnus 689 : *
690 : * Note that this only affects the calls to strftime() in this file, which are
691 : * used to get the locale-aware strings. Other parts of the backend use
692 : * pg_strftime(), which isn't locale-aware and does not need to be replaced.
693 : */
694 : static size_t
695 : strftime_win32(char *dst, size_t dstlen,
696 : const char *format, const struct tm *tm)
697 : {
698 : size_t len;
699 : wchar_t wformat[8]; /* formats used below need 3 chars */
700 : wchar_t wbuf[MAX_L10N_DATA];
701 :
702 : /*
703 : * Get a wchar_t version of the format string. We only actually use
704 : * plain-ASCII formats in this file, so we can say that they're UTF8.
705 : */
706 : len = MultiByteToWideChar(CP_UTF8, 0, format, -1,
707 : wformat, lengthof(wformat));
708 : if (len == 0)
709 : elog(ERROR, "could not convert format string from UTF-8: error code %lu",
710 : GetLastError());
711 :
712 : len = wcsftime(wbuf, MAX_L10N_DATA, wformat, tm);
713 : if (len == 0)
714 : {
715 : /*
716 : * wcsftime failed, possibly because the result would not fit in
717 : * MAX_L10N_DATA. Return 0 with the contents of dst unspecified.
718 : */
719 : return 0;
720 : }
721 :
722 : len = WideCharToMultiByte(CP_UTF8, 0, wbuf, len, dst, dstlen - 1,
723 : NULL, NULL);
724 : if (len == 0)
725 : elog(ERROR, "could not convert string to UTF-8: error code %lu",
726 : GetLastError());
727 :
728 : dst[len] = '\0';
729 :
730 : return len;
731 : }
732 :
733 : /* redefine strftime() */
734 : #define strftime(a,b,c,d) strftime_win32(a,b,c,d)
735 : #endif /* WIN32 */
736 :
737 : /*
738 : * Subroutine for cache_locale_time().
739 : * Convert the given string from encoding "encoding" to the database
740 : * encoding, and store the result at *dst, replacing any previous value.
741 : */
742 : static void
1447 tgl 743 GIC 950 : cache_single_string(char **dst, const char *src, int encoding)
744 : {
745 : char *ptr;
746 : char *olddst;
747 :
748 : /* Convert the string to the database encoding, or validate it's OK */
749 950 : ptr = pg_any_to_server(src, strlen(src), encoding);
750 :
751 : /* Store the string in long-lived storage, replacing any previous value */
752 950 : olddst = *dst;
753 950 : *dst = MemoryContextStrdup(TopMemoryContext, ptr);
754 950 : if (olddst)
1447 tgl 755 UIC 0 : pfree(olddst);
756 :
757 : /* Might as well clean up any palloc'd conversion result, too */
1447 tgl 758 GIC 950 : if (ptr != src)
1447 tgl 759 CBC 114 : pfree(ptr);
2883 noah 760 GIC 950 : }
761 :
762 : /*
763 : * Update the lc_time localization cache variables if needed.
764 : */
5438 tgl 765 ECB : void
5438 tgl 766 GIC 9277 : cache_locale_time(void)
767 : {
1447 tgl 768 ECB : char buf[(2 * 7 + 2 * 12) * MAX_L10N_DATA];
769 : char *bufptr;
5438 770 : time_t timenow;
5050 bruce 771 EUB : struct tm *timeinfo;
1447 tgl 772 GIC 9277 : bool strftimefail = false;
773 : int encoding;
5438 tgl 774 ECB : int i;
1447 775 : char *save_lc_time;
5203 magnus 776 : #ifdef WIN32
777 : char *save_lc_ctype;
778 : #endif
779 :
780 : /* did we do this already? */
5438 tgl 781 GIC 9277 : if (CurrentLCTimeValid)
5438 tgl 782 CBC 9252 : return;
783 :
5438 tgl 784 GIC 25 : elog(DEBUG3, "cache_locale_time() executed; locale: \"%s\"", locale_time);
785 :
786 : /*
787 : * As in PGLC_localeconv(), it's critical that we not throw error while
1447 tgl 788 ECB : * libc's locale settings have nondefault values. Hence, we just call
789 : * strftime() within the critical section, and then convert and save its
790 : * results afterwards.
791 : */
792 :
793 : /* Save prevailing value of time locale */
4733 bruce 794 GIC 25 : save_lc_time = setlocale(LC_TIME, NULL);
1447 tgl 795 25 : if (!save_lc_time)
1447 tgl 796 UIC 0 : elog(ERROR, "setlocale(NULL) failed");
1447 tgl 797 CBC 25 : save_lc_time = pstrdup(save_lc_time);
4733 bruce 798 ECB :
799 : #ifdef WIN32
3260 800 :
801 : /*
802 : * On Windows, it appears that wcsftime() internally uses LC_CTYPE, so we
803 : * must set it here. This code looks the same as what PGLC_localeconv()
804 : * does, but the underlying reason is different: this does NOT determine
805 : * the encoding we'll get back from strftime_win32().
806 : */
807 :
808 : /* Save prevailing value of ctype locale */
809 : save_lc_ctype = setlocale(LC_CTYPE, NULL);
1447 tgl 810 : if (!save_lc_ctype)
811 : elog(ERROR, "setlocale(NULL) failed");
1447 tgl 812 EUB : save_lc_ctype = pstrdup(save_lc_ctype);
5203 magnus 813 ECB :
814 : /* use lc_time to set the ctype */
815 : setlocale(LC_CTYPE, locale_time);
816 : #endif
817 :
5438 tgl 818 GIC 25 : setlocale(LC_TIME, locale_time);
819 :
820 : /* We use times close to current time as data for strftime(). */
821 25 : timenow = time(NULL);
822 25 : timeinfo = localtime(&timenow);
823 :
824 : /* Store the strftime results in MAX_L10N_DATA-sized portions of buf[] */
1447 825 25 : bufptr = buf;
826 :
827 : /*
828 : * MAX_L10N_DATA is sufficient buffer space for every known locale, and
829 : * POSIX defines no strftime() errors. (Buffer space exhaustion is not an
830 : * error.) An implementation might report errors (e.g. ENOMEM) by
831 : * returning 0 (or, less plausibly, a negative value) and setting errno.
832 : * Report errno just in case the implementation did that, but clear it in
833 : * advance of the calls so we don't emit a stale, unrelated errno.
1447 tgl 834 ECB : */
1447 tgl 835 GIC 25 : errno = 0;
836 :
5438 tgl 837 ECB : /* localized days */
5438 tgl 838 CBC 200 : for (i = 0; i < 7; i++)
839 : {
5438 tgl 840 GIC 175 : timeinfo->tm_wday = i;
1447 tgl 841 CBC 175 : if (strftime(bufptr, MAX_L10N_DATA, "%a", timeinfo) <= 0)
1447 tgl 842 UIC 0 : strftimefail = true;
1447 tgl 843 GIC 175 : bufptr += MAX_L10N_DATA;
844 175 : if (strftime(bufptr, MAX_L10N_DATA, "%A", timeinfo) <= 0)
1447 tgl 845 UIC 0 : strftimefail = true;
1447 tgl 846 GIC 175 : bufptr += MAX_L10N_DATA;
847 : }
848 :
849 : /* localized months */
5438 850 325 : for (i = 0; i < 12; i++)
5438 tgl 851 ECB : {
5438 tgl 852 GIC 300 : timeinfo->tm_mon = i;
853 300 : timeinfo->tm_mday = 1; /* make sure we don't have invalid date */
1447 tgl 854 CBC 300 : if (strftime(bufptr, MAX_L10N_DATA, "%b", timeinfo) <= 0)
1447 tgl 855 UIC 0 : strftimefail = true;
1447 tgl 856 CBC 300 : bufptr += MAX_L10N_DATA;
857 300 : if (strftime(bufptr, MAX_L10N_DATA, "%B", timeinfo) <= 0)
1447 tgl 858 UBC 0 : strftimefail = true;
1447 tgl 859 CBC 300 : bufptr += MAX_L10N_DATA;
5438 tgl 860 ECB : }
5438 tgl 861 EUB :
1447 tgl 862 ECB : /*
863 : * Restore the prevailing locale settings; as in PGLC_localeconv(),
864 : * failure to do so is fatal.
865 : */
866 : #ifdef WIN32
867 : if (!setlocale(LC_CTYPE, save_lc_ctype))
868 : elog(FATAL, "failed to restore LC_CTYPE to \"%s\"", save_lc_ctype);
869 : #endif
1447 tgl 870 CBC 25 : if (!setlocale(LC_TIME, save_lc_time))
1447 tgl 871 UBC 0 : elog(FATAL, "failed to restore LC_TIME to \"%s\"", save_lc_time);
1447 tgl 872 ECB :
873 : /*
1447 tgl 874 EUB : * At this point we've done our best to clean up, and can throw errors, or
1447 tgl 875 ECB : * call functions that might throw errors, with a clean conscience.
876 : */
1447 tgl 877 GIC 25 : if (strftimefail)
1447 tgl 878 UIC 0 : elog(ERROR, "strftime() failed: %m");
879 :
880 : /* Release the pstrdup'd locale names */
1447 tgl 881 GIC 25 : pfree(save_lc_time);
882 : #ifdef WIN32
883 : pfree(save_lc_ctype);
884 : #endif
885 :
1447 tgl 886 ECB : #ifndef WIN32
1447 tgl 887 EUB :
888 : /*
889 : * As in PGLC_localeconv(), we must convert strftime()'s output from the
890 : * encoding implied by LC_TIME to the database encoding. If we can't
891 : * identify the LC_TIME encoding, just perform encoding validation.
892 : */
1447 tgl 893 CBC 25 : encoding = pg_get_encoding_from_locale(locale_time, true);
1447 tgl 894 GBC 25 : if (encoding < 0)
1447 tgl 895 UIC 0 : encoding = PG_SQL_ASCII;
896 :
1447 tgl 897 ECB : #else
898 :
899 : /*
900 : * On Windows, strftime_win32() always returns UTF8 data, so convert from
901 : * that if necessary.
902 : */
903 : encoding = PG_UTF8;
904 :
905 : #endif /* WIN32 */
906 :
1447 tgl 907 GIC 25 : bufptr = buf;
908 :
1447 tgl 909 ECB : /* localized days */
1447 tgl 910 CBC 200 : for (i = 0; i < 7; i++)
5438 tgl 911 EUB : {
1447 tgl 912 GIC 175 : cache_single_string(&localized_abbrev_days[i], bufptr, encoding);
913 175 : bufptr += MAX_L10N_DATA;
914 175 : cache_single_string(&localized_full_days[i], bufptr, encoding);
915 175 : bufptr += MAX_L10N_DATA;
916 : }
1132 917 25 : localized_abbrev_days[7] = NULL;
918 25 : localized_full_days[7] = NULL;
919 :
920 : /* localized months */
1447 921 325 : for (i = 0; i < 12; i++)
922 : {
1447 tgl 923 CBC 300 : cache_single_string(&localized_abbrev_months[i], bufptr, encoding);
1447 tgl 924 GIC 300 : bufptr += MAX_L10N_DATA;
925 300 : cache_single_string(&localized_full_months[i], bufptr, encoding);
1447 tgl 926 CBC 300 : bufptr += MAX_L10N_DATA;
927 : }
1132 928 25 : localized_abbrev_months[12] = NULL;
929 25 : localized_full_months[12] = NULL;
5203 magnus 930 ECB :
5438 tgl 931 CBC 25 : CurrentLCTimeValid = true;
932 : }
5191 magnus 933 ECB :
934 :
935 : #if defined(WIN32) && defined(LC_MESSAGES)
936 : /*
3714 andrew 937 : * Convert a Windows setlocale() argument to a Unix-style one.
938 : *
939 : * Regardless of platform, we install message catalogs under a Unix-style
3260 bruce 940 : * LL[_CC][.ENCODING][@VARIANT] naming convention. Only LC_MESSAGES settings
3714 andrew 941 : * following that style will elicit localized interface strings.
942 : *
943 : * Before Visual Studio 2012 (msvcr110.dll), Windows setlocale() accepted "C"
944 : * (but not "c") and strings of the form <Language>[_<Country>][.<CodePage>],
945 : * case-insensitive. setlocale() returns the fully-qualified form; for
946 : * example, setlocale("thaI") returns "Thai_Thailand.874". Internally,
947 : * setlocale() and _create_locale() select a "locale identifier"[1] and store
948 : * it in an undocumented _locale_t field. From that LCID, we can retrieve the
949 : * ISO 639 language and the ISO 3166 country. Character encoding does not
950 : * matter, because the server and client encodings govern that.
951 : *
952 : * Windows Vista introduced the "locale name" concept[2], closely following
953 : * RFC 4646. Locale identifiers are now deprecated. Starting with Visual
954 : * Studio 2012, setlocale() accepts locale names in addition to the strings it
955 : * accepted historically. It does not standardize them; setlocale("Th-tH")
956 : * returns "Th-tH". setlocale(category, "") still returns a traditional
957 : * string. Furthermore, msvcr110.dll changed the undocumented _locale_t
958 : * content to carry locale names instead of locale identifiers.
959 : *
960 : * Visual Studio 2015 should still be able to do the same as Visual Studio
961 : * 2012, but the declaration of locale_name is missing in _locale_t, causing
962 : * this code compilation to fail, hence this falls back instead on to
963 : * enumerating all system locales by using EnumSystemLocalesEx to find the
964 : * required locale name. If the input argument is in Unix-style then we can
965 : * get ISO Locale name directly by using GetLocaleInfoEx() with LCType as
966 : * LOCALE_SNAME.
967 : *
968 : * MinGW headers declare _create_locale(), but msvcrt.dll lacks that symbol in
969 : * releases before Windows 8. IsoLocaleName() always fails in a MinGW-built
970 : * postgres.exe, so only Unix-style values of the lc_messages GUC can elicit
971 : * localized messages. In particular, every lc_messages setting that initdb
972 : * can select automatically will yield only C-locale messages. XXX This could
973 : * be fixed by running the fully-qualified locale name through a lookup table.
974 : *
975 : * This function returns a pointer to a static buffer bearing the converted
976 : * name or NULL if conversion fails.
977 : *
978 : * [1] https://docs.microsoft.com/en-us/windows/win32/intl/locale-identifiers
979 : * [2] https://docs.microsoft.com/en-us/windows/win32/intl/locale-names
980 : */
981 :
982 : #if defined(_MSC_VER)
983 :
984 : /*
985 : * Callback function for EnumSystemLocalesEx() in get_iso_localename().
986 : *
987 : * This function enumerates all system locales, searching for one that matches
988 : * an input with the format: <Language>[_<Country>], e.g.
989 : * English[_United States]
990 : *
991 : * The input is a three wchar_t array as an LPARAM. The first element is the
992 : * locale_name we want to match, the second element is an allocated buffer
993 : * where the Unix-style locale is copied if a match is found, and the third
994 : * element is the search status, 1 if a match was found, 0 otherwise.
995 : */
996 : static BOOL CALLBACK
997 : search_locale_enum(LPWSTR pStr, DWORD dwFlags, LPARAM lparam)
998 : {
999 : wchar_t test_locale[LOCALE_NAME_MAX_LENGTH];
1000 : wchar_t **argv;
1001 :
1002 : (void) (dwFlags);
1003 :
1004 : argv = (wchar_t **) lparam;
1005 : *argv[2] = (wchar_t) 0;
1006 :
1007 : memset(test_locale, 0, sizeof(test_locale));
1008 :
1009 : /* Get the name of the <Language> in English */
1010 : if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHLANGUAGENAME,
1011 : test_locale, LOCALE_NAME_MAX_LENGTH))
1012 : {
1013 : /*
1014 : * If the enumerated locale does not have a hyphen ("en") OR the
1015 : * lc_message input does not have an underscore ("English"), we only
1016 : * need to compare the <Language> tags.
1017 : */
1018 : if (wcsrchr(pStr, '-') == NULL || wcsrchr(argv[0], '_') == NULL)
1019 : {
1020 : if (_wcsicmp(argv[0], test_locale) == 0)
1021 : {
1022 : wcscpy(argv[1], pStr);
1023 : *argv[2] = (wchar_t) 1;
1024 : return FALSE;
1025 : }
1026 : }
1027 :
1028 : /*
1029 : * We have to compare a full <Language>_<Country> tag, so we append
1030 : * the underscore and name of the country/region in English, e.g.
1031 : * "English_United States".
1032 : */
1033 : else
1034 : {
1035 : size_t len;
1036 :
1037 : wcscat(test_locale, L"_");
1038 : len = wcslen(test_locale);
1039 : if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHCOUNTRYNAME,
1040 : test_locale + len,
1041 : LOCALE_NAME_MAX_LENGTH - len))
1042 : {
1043 : if (_wcsicmp(argv[0], test_locale) == 0)
1044 : {
1045 : wcscpy(argv[1], pStr);
1046 : *argv[2] = (wchar_t) 1;
1047 : return FALSE;
1048 : }
1049 : }
1050 : }
1051 : }
1052 :
1053 : return TRUE;
1054 : }
1055 :
1056 : /*
1057 : * This function converts a Windows locale name to an ISO formatted version
1058 : * for Visual Studio 2015 or greater.
1059 : *
1060 : * Returns NULL, if no valid conversion was found.
1061 : */
1062 : static char *
1063 : get_iso_localename(const char *winlocname)
1064 : {
1065 : wchar_t wc_locale_name[LOCALE_NAME_MAX_LENGTH];
1066 : wchar_t buffer[LOCALE_NAME_MAX_LENGTH];
1067 : static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1068 : char *period;
1069 : int len;
1070 : int ret_val;
1071 :
1072 : /*
1073 : * Valid locales have the following syntax:
1074 : * <Language>[_<Country>[.<CodePage>]]
1075 : *
1076 : * GetLocaleInfoEx can only take locale name without code-page and for the
1077 : * purpose of this API the code-page doesn't matter.
1078 : */
1079 : period = strchr(winlocname, '.');
1080 : if (period != NULL)
1081 : len = period - winlocname;
1082 : else
1083 : len = pg_mbstrlen(winlocname);
1084 :
1085 : memset(wc_locale_name, 0, sizeof(wc_locale_name));
1086 : memset(buffer, 0, sizeof(buffer));
1087 : MultiByteToWideChar(CP_ACP, 0, winlocname, len, wc_locale_name,
1088 : LOCALE_NAME_MAX_LENGTH);
1089 :
1090 : /*
1091 : * If the lc_messages is already a Unix-style string, we have a direct
1092 : * match with LOCALE_SNAME, e.g. en-US, en_US.
1093 : */
1094 : ret_val = GetLocaleInfoEx(wc_locale_name, LOCALE_SNAME, (LPWSTR) &buffer,
1095 : LOCALE_NAME_MAX_LENGTH);
1096 : if (!ret_val)
1097 : {
1098 : /*
1099 : * Search for a locale in the system that matches language and country
1100 : * name.
1101 : */
1102 : wchar_t *argv[3];
1103 :
1104 : argv[0] = wc_locale_name;
1105 : argv[1] = buffer;
1106 : argv[2] = (wchar_t *) &ret_val;
1107 : EnumSystemLocalesEx(search_locale_enum, LOCALE_WINDOWS, (LPARAM) argv,
1108 : NULL);
1109 : }
1110 :
1111 : if (ret_val)
1112 : {
1113 : size_t rc;
1114 : char *hyphen;
1115 :
1116 : /* Locale names use only ASCII, any conversion locale suffices. */
1117 : rc = wchar2char(iso_lc_messages, buffer, sizeof(iso_lc_messages), NULL);
1118 : if (rc == -1 || rc == sizeof(iso_lc_messages))
1119 : return NULL;
1120 :
1121 : /*
1122 : * Since the message catalogs sit on a case-insensitive filesystem, we
1123 : * need not standardize letter case here. So long as we do not ship
1124 : * message catalogs for which it would matter, we also need not
1125 : * translate the script/variant portion, e.g. uz-Cyrl-UZ to
1126 : * uz_UZ@cyrillic. Simply replace the hyphen with an underscore.
1127 : */
1128 : hyphen = strchr(iso_lc_messages, '-');
1129 : if (hyphen)
1130 : *hyphen = '_';
1131 : return iso_lc_messages;
1132 : }
1133 :
1134 : return NULL;
1135 : }
1136 :
1137 : static char *
1138 : IsoLocaleName(const char *winlocname)
1139 : {
1140 : static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
1141 :
1142 : if (pg_strcasecmp("c", winlocname) == 0 ||
1143 : pg_strcasecmp("posix", winlocname) == 0)
1144 : {
1145 : strcpy(iso_lc_messages, "C");
1146 : return iso_lc_messages;
1147 : }
1148 : else
1149 : return get_iso_localename(winlocname);
1150 : }
1151 :
1152 : #else /* !defined(_MSC_VER) */
1153 :
1154 : static char *
1155 : IsoLocaleName(const char *winlocname)
1156 : {
1157 : return NULL; /* Not supported on MinGW */
1158 : }
1159 :
1160 : #endif /* defined(_MSC_VER) */
1161 :
1162 : #endif /* WIN32 && LC_MESSAGES */
1163 :
1164 :
1165 : /*
2832 noah 1166 : * Detect aging strxfrm() implementations that, in a subset of locales, write
1167 : * past the specified buffer length. Affected users must update OS packages
1168 : * before using PostgreSQL 9.5 or later.
1169 : *
1170 : * Assume that the bug can come and go from one postmaster startup to another
1171 : * due to physical replication among diverse machines. Assume that the bug's
1172 : * presence will not change during the life of a particular postmaster. Given
1173 : * those assumptions, call this no less than once per postmaster startup per
1174 : * LC_COLLATE setting used. No known-affected system offers strxfrm_l(), so
1175 : * there is no need to consider pg_collation locales.
1176 : */
1177 : void
2832 noah 1178 GIC 12783 : check_strxfrm_bug(void)
1179 : {
1180 : char buf[32];
1181 12783 : const int canary = 0x7F;
2832 noah 1182 CBC 12783 : bool ok = true;
2832 noah 1183 ECB :
1184 : /*
2832 noah 1185 EUB : * Given a two-byte ASCII string and length limit 7, 8 or 9, Solaris 10
1186 : * 05/08 returns 18 and modifies 10 bytes. It respects limits above or
1187 : * below that range.
1188 : *
1189 : * The bug is present in Solaris 8 as well; it is absent in Solaris 10
1190 : * 01/13 and Solaris 11.2. Affected locales include is_IS.ISO8859-1,
1191 : * en_US.UTF-8, en_US.ISO8859-1, and ru_RU.KOI8-R. Unaffected locales
1192 : * include de_DE.UTF-8, de_DE.ISO8859-1, zh_TW.UTF-8, and C.
1193 : */
2832 noah 1194 GIC 12783 : buf[7] = canary;
1195 12783 : (void) strxfrm(buf, "ab", 7);
1196 12783 : if (buf[7] != canary)
2832 noah 1197 LBC 0 : ok = false;
2832 noah 1198 ECB :
1199 : /*
2832 noah 1200 EUB : * illumos bug #1594 was present in the source tree from 2010-10-11 to
1201 : * 2012-02-01. Given an ASCII string of any length and length limit 1,
2832 noah 1202 ECB : * affected systems ignore the length limit and modify a number of bytes
2832 noah 1203 EUB : * one less than the return value. The problem inputs for this bug do not
1204 : * overlap those for the Solaris bug, hence a distinct test.
1205 : *
1206 : * Affected systems include smartos-20110926T021612Z. Affected locales
1207 : * include en_US.ISO8859-1 and en_US.UTF-8. Unaffected locales include C.
2832 noah 1208 ECB : */
2832 noah 1209 GIC 12783 : buf[1] = canary;
1210 12783 : (void) strxfrm(buf, "a", 1);
1211 12783 : if (buf[1] != canary)
2832 noah 1212 UIC 0 : ok = false;
1213 :
2832 noah 1214 GIC 12783 : if (!ok)
2832 noah 1215 UIC 0 : ereport(ERROR,
1216 : (errcode(ERRCODE_SYSTEM_ERROR),
1217 : errmsg_internal("strxfrm(), in locale \"%s\", writes past the specified array length",
1218 : setlocale(LC_COLLATE, NULL)),
1219 : errhint("Apply system library package updates.")));
2832 noah 1220 GIC 12783 : }
1221 :
1222 :
1223 : /*
1224 : * Cache mechanism for collation information.
1225 : *
1226 : * We cache two flags: whether the collation's LC_COLLATE or LC_CTYPE is C
1227 : * (or POSIX), so we can optimize a few code paths in various places.
1228 : * For the built-in C and POSIX collations, we can know that without even
1229 : * doing a cache lookup, but we want to support aliases for C/POSIX too.
1230 : * For the "default" collation, there are separate static cache variables,
1231 : * since consulting the pg_collation catalog doesn't tell us what we need.
1232 : *
1233 : * Also, if a pg_locale_t has been requested for a collation, we cache that
1234 : * for the life of a backend.
1235 : *
4403 tgl 1236 ECB : * Note that some code relies on the flags not reporting false negatives
1237 : * (that is, saying it's not C when it is). For example, char2wchar()
1238 : * could fail if the locale is C, so str_tolower() shouldn't call it
1239 : * in that case.
1240 : *
3260 bruce 1241 : * Note that we currently lack any way to flush the cache. Since we don't
4403 tgl 1242 : * support ALTER COLLATION, this is OK. The worst case is that someone
1243 : * drops a collation, and a useless cache entry hangs around in existing
1244 : * backends.
1245 : */
1246 :
1247 : static collation_cache_entry *
4403 tgl 1248 GIC 21478 : lookup_collation_cache(Oid collation, bool set_flags)
4443 peter_e 1249 ECB : {
4403 tgl 1250 : collation_cache_entry *cache_entry;
1251 : bool found;
1252 :
4403 tgl 1253 GIC 21478 : Assert(OidIsValid(collation));
1254 21478 : Assert(collation != DEFAULT_COLLATION_OID);
4403 tgl 1255 ECB :
4403 tgl 1256 CBC 21478 : if (collation_cache == NULL)
1257 : {
1258 : /* First time through, initialize the hash table */
1259 : HASHCTL ctl;
1260 :
4403 tgl 1261 GIC 23 : ctl.keysize = sizeof(Oid);
4403 tgl 1262 CBC 23 : ctl.entrysize = sizeof(collation_cache_entry);
1263 23 : collation_cache = hash_create("Collation cache", 100, &ctl,
1264 : HASH_ELEM | HASH_BLOBS);
1265 : }
4403 tgl 1266 ECB :
4403 tgl 1267 GIC 21478 : cache_entry = hash_search(collation_cache, &collation, HASH_ENTER, &found);
1268 21478 : if (!found)
1269 : {
1270 : /*
1271 : * Make sure cache entry is marked invalid, in case we fail before
4403 tgl 1272 ECB : * setting things.
1273 : */
4403 tgl 1274 GBC 148 : cache_entry->flags_valid = false;
4403 tgl 1275 CBC 148 : cache_entry->locale = 0;
1276 : }
4403 tgl 1277 ECB :
4403 tgl 1278 GIC 21478 : if (set_flags && !cache_entry->flags_valid)
1279 : {
1280 : /* Attempt to set the flags */
1281 : HeapTuple tp;
1282 : Form_pg_collation collform;
4403 tgl 1283 ECB :
4403 tgl 1284 CBC 148 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collation));
1285 148 : if (!HeapTupleIsValid(tp))
4403 tgl 1286 LBC 0 : elog(ERROR, "cache lookup failed for collation %u", collation);
388 peter 1287 GIC 148 : collform = (Form_pg_collation) GETSTRUCT(tp);
4403 tgl 1288 ECB :
388 peter 1289 CBC 148 : if (collform->collprovider == COLLPROVIDER_LIBC)
388 peter 1290 ECB : {
1291 : Datum datum;
1292 : const char *collcollate;
1293 : const char *collctype;
1294 :
15 dgustafsson 1295 GNC 56 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
388 peter 1296 GIC 56 : collcollate = TextDatumGetCString(datum);
15 dgustafsson 1297 GNC 56 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
388 peter 1298 CBC 56 : collctype = TextDatumGetCString(datum);
1299 :
388 peter 1300 GIC 87 : cache_entry->collate_is_c = ((strcmp(collcollate, "C") == 0) ||
388 peter 1301 CBC 31 : (strcmp(collcollate, "POSIX") == 0));
388 peter 1302 GIC 87 : cache_entry->ctype_is_c = ((strcmp(collctype, "C") == 0) ||
1303 31 : (strcmp(collctype, "POSIX") == 0));
1304 : }
1305 : else
1306 : {
1307 92 : cache_entry->collate_is_c = false;
1308 92 : cache_entry->ctype_is_c = false;
388 peter 1309 ECB : }
1310 :
4403 tgl 1311 GIC 148 : cache_entry->flags_valid = true;
1312 :
1313 148 : ReleaseSysCache(tp);
1314 : }
4403 tgl 1315 ECB :
4403 tgl 1316 GBC 21478 : return cache_entry;
1317 : }
1318 :
1319 :
1320 : /*
1321 : * Detect whether collation's LC_COLLATE property is C
4403 tgl 1322 ECB : */
1323 : bool
4403 tgl 1324 GIC 9450429 : lc_collate_is_c(Oid collation)
1325 : {
1326 : /*
4382 bruce 1327 ECB : * If we're asked about "collation 0", return false, so that the code will
1328 : * go into the non-C path and report that the collation is bogus.
1329 : */
4403 tgl 1330 CBC 9450429 : if (!OidIsValid(collation))
4403 tgl 1331 LBC 0 : return false;
4403 tgl 1332 ECB :
1333 : /*
4382 bruce 1334 EUB : * If we're asked about the default collation, we have to inquire of the C
1335 : * library. Cache the result so we only have to compute it once.
4403 tgl 1336 ECB : */
4403 tgl 1337 CBC 9450429 : if (collation == DEFAULT_COLLATION_OID)
4403 tgl 1338 ECB : {
4403 tgl 1339 EUB : static int result = -1;
1340 : char *localeptr;
4403 tgl 1341 ECB :
388 peter 1342 CBC 6418465 : if (default_locale.provider == COLLPROVIDER_ICU)
388 peter 1343 GIC 6412117 : return false;
1344 :
4403 tgl 1345 6348 : if (result >= 0)
1346 6340 : return (bool) result;
1347 8 : localeptr = setlocale(LC_COLLATE, NULL);
4403 tgl 1348 CBC 8 : if (!localeptr)
4403 tgl 1349 UIC 0 : elog(ERROR, "invalid LC_COLLATE setting");
4403 tgl 1350 ECB :
4403 tgl 1351 GIC 8 : if (strcmp(localeptr, "C") == 0)
1352 1 : result = true;
1353 7 : else if (strcmp(localeptr, "POSIX") == 0)
4403 tgl 1354 UIC 0 : result = true;
4403 tgl 1355 ECB : else
4403 tgl 1356 GIC 7 : result = false;
1357 8 : return (bool) result;
1358 : }
1359 :
1360 : /*
1361 : * If we're asked about the built-in C/POSIX collations, we know that.
4403 tgl 1362 ECB : */
4403 tgl 1363 GIC 3031964 : if (collation == C_COLLATION_OID ||
1364 : collation == POSIX_COLLATION_OID)
1365 3022864 : return true;
1366 :
1367 : /*
4403 tgl 1368 ECB : * Otherwise, we have to consult pg_collation, but we cache that.
4403 tgl 1369 EUB : */
4403 tgl 1370 GIC 9100 : return (lookup_collation_cache(collation, true))->collate_is_c;
1371 : }
1372 :
1373 : /*
1374 : * Detect whether collation's LC_CTYPE property is C
4403 tgl 1375 ECB : */
1376 : bool
4403 tgl 1377 GIC 2989314 : lc_ctype_is_c(Oid collation)
1378 : {
1379 : /*
4382 bruce 1380 ECB : * If we're asked about "collation 0", return false, so that the code will
1381 : * go into the non-C path and report that the collation is bogus.
1382 : */
4403 tgl 1383 CBC 2989314 : if (!OidIsValid(collation))
4403 tgl 1384 LBC 0 : return false;
4403 tgl 1385 ECB :
1386 : /*
4382 bruce 1387 EUB : * If we're asked about the default collation, we have to inquire of the C
1388 : * library. Cache the result so we only have to compute it once.
4403 tgl 1389 ECB : */
4403 tgl 1390 GBC 2989314 : if (collation == DEFAULT_COLLATION_OID)
4403 tgl 1391 ECB : {
4403 tgl 1392 EUB : static int result = -1;
1393 : char *localeptr;
4403 tgl 1394 ECB :
388 peter 1395 CBC 1560268 : if (default_locale.provider == COLLPROVIDER_ICU)
388 peter 1396 GIC 1560262 : return false;
1397 :
4403 tgl 1398 6 : if (result >= 0)
1399 3 : return (bool) result;
1400 3 : localeptr = setlocale(LC_CTYPE, NULL);
4403 tgl 1401 CBC 3 : if (!localeptr)
4403 tgl 1402 UIC 0 : elog(ERROR, "invalid LC_CTYPE setting");
4403 tgl 1403 ECB :
4403 tgl 1404 GIC 3 : if (strcmp(localeptr, "C") == 0)
4403 tgl 1405 UIC 0 : result = true;
4403 tgl 1406 GIC 3 : else if (strcmp(localeptr, "POSIX") == 0)
4403 tgl 1407 UIC 0 : result = true;
4403 tgl 1408 ECB : else
4403 tgl 1409 GIC 3 : result = false;
1410 3 : return (bool) result;
1411 : }
1412 :
1413 : /*
4403 tgl 1414 ECB : * If we're asked about the built-in C/POSIX collations, we know that.
1415 : */
4403 tgl 1416 GIC 1429046 : if (collation == C_COLLATION_OID ||
1417 : collation == POSIX_COLLATION_OID)
1418 1427352 : return true;
1419 :
1420 : /*
4403 tgl 1421 ECB : * Otherwise, we have to consult pg_collation, but we cache that.
1422 : */
4403 tgl 1423 GIC 1694 : return (lookup_collation_cache(collation, true))->ctype_is_c;
1424 : }
1425 :
/*
 * Locale information for the database default collation.
 *
 * lc_collate_is_c() and lc_ctype_is_c() consult its provider field, and
 * pg_newlocale_from_collation() returns its address for
 * DEFAULT_COLLATION_OID when the provider is ICU.
 */
struct pg_locale_struct default_locale;
388 peter 1427 ECB :
1428 : void
388 peter 1429 GIC 10206 : make_icu_collator(const char *iculocstr,
1430 : const char *icurules,
1431 : struct pg_locale_struct *resultp)
1432 : {
1433 : #ifdef USE_ICU
1434 : UCollator *collator;
388 peter 1435 ECB :
17 jdavis 1436 GNC 10206 : collator = pg_ucol_open(iculocstr);
388 peter 1437 ECB :
1438 : /*
1439 : * If rules are specified, we extract the rules of the standard collation,
1440 : * add our own rules, and make a new collator with the combined rules.
1441 : */
32 peter 1442 GNC 10204 : if (icurules)
1443 : {
1444 : const UChar *default_rules;
1445 : UChar *agg_rules;
1446 : UChar *my_rules;
1447 : UErrorCode status;
1448 : int32_t length;
1449 :
1450 6 : default_rules = ucol_getRules(collator, &length);
1451 6 : icu_to_uchar(&my_rules, icurules, strlen(icurules));
1452 :
1453 6 : agg_rules = palloc_array(UChar, u_strlen(default_rules) + u_strlen(my_rules) + 1);
1454 6 : u_strcpy(agg_rules, default_rules);
1455 6 : u_strcat(agg_rules, my_rules);
1456 :
1457 6 : ucol_close(collator);
1458 :
1459 6 : status = U_ZERO_ERROR;
1460 6 : collator = ucol_openRules(agg_rules, u_strlen(agg_rules),
1461 : UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, NULL, &status);
1462 6 : if (U_FAILURE(status))
1463 3 : ereport(ERROR,
1464 : (errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
1465 : iculocstr, icurules, u_errorName(status))));
1466 : }
32 peter 1467 ECB :
1468 : /* We will leak this string if the caller errors later :-( */
388 peter 1469 CBC 10201 : resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
1470 10201 : resultp->info.icu.ucol = collator;
1471 : #else /* not USE_ICU */
1472 : /* could get here if a collation was created by a build with ICU */
1473 : ereport(ERROR,
1474 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1475 : errmsg("ICU is not supported in this build")));
388 peter 1476 ECB : #endif /* not USE_ICU */
388 peter 1477 CBC 10201 : }
1478 :
1479 :
/*
 * Report a failure of newlocale() (or _create_locale()) for the given locale
 * name.  Always raises an ERROR, so this never returns to the caller.
 *
 * The caller is expected to have set errno to zero before the failing call.
 */
#ifdef HAVE_LOCALE_T
static void
report_newlocale_failure(const char *localename)
{
	int			reported_errno;

	/*
	 * Windows doesn't provide any useful error indication from
	 * _create_locale(), and BSD-derived platforms don't seem to feel they
	 * need to set errno either (even though POSIX is pretty clear that
	 * newlocale should do so).  So, if errno hasn't been set, assume ENOENT
	 * is what to report.
	 */
	if (errno == 0)
		errno = ENOENT;

	/* remember it; auxiliary functions might change errno */
	reported_errno = errno;

	/*
	 * ENOENT means "no such locale", not "no such file", so clarify that
	 * errno with an errdetail message.
	 */
	if (reported_errno == ENOENT)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("could not create locale \"%s\": %m",
						localename),
				 errdetail("The operating system could not find any locale data for the locale name \"%s\".",
						   localename)));
	else
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("could not create locale \"%s\": %m",
						localename)));
}
#endif							/* HAVE_LOCALE_T */
1511 :
1512 : bool
45 jdavis 1513 GNC 5377091 : pg_locale_deterministic(pg_locale_t locale)
1514 : {
1515 : /* default locale must always be deterministic */
1516 5377091 : if (locale == NULL)
1517 296865 : return true;
1518 : else
1519 5080226 : return locale->deterministic;
1520 : }
1521 :
1522 : /*
1523 : * Create a locale_t from a collation OID. Results are cached for the
1524 : * lifetime of the backend. Thus, do not free the result with freelocale().
1525 : *
1526 : * As a special optimization, the default/database collation returns 0.
1527 : * Callers should then revert to the non-locale_t-enabled code path.
1528 : * Also, callers should avoid calling this before going down a C/POSIX
4403 tgl 1529 ECB : * fastpath, because such a fastpath should work even on platforms without
1530 : * locale_t support in the C library.
1531 : *
4443 peter_e 1532 : * For simplicity, we always generate COLLATE + CTYPE even though we
4403 tgl 1533 : * might only need one of them. Since this is called only once per session,
1534 : * it shouldn't cost much.
4443 peter_e 1535 : */
1536 : pg_locale_t
4443 peter_e 1537 GIC 7986982 : pg_newlocale_from_collation(Oid collid)
1538 : {
1539 : collation_cache_entry *cache_entry;
1540 :
1541 : /* Callers must pass a valid OID */
4401 tgl 1542 7986982 : Assert(OidIsValid(collid));
1543 :
4443 peter_e 1544 7986982 : if (collid == DEFAULT_COLLATION_OID)
1545 : {
388 peter 1546 7976298 : if (default_locale.provider == COLLPROVIDER_ICU)
1547 7969945 : return &default_locale;
1548 : else
1549 6353 : return (pg_locale_t) 0;
1550 : }
1551 :
4403 tgl 1552 10684 : cache_entry = lookup_collation_cache(collid, false);
4443 peter_e 1553 ECB :
4403 tgl 1554 GIC 10684 : if (cache_entry->locale == 0)
1555 : {
1556 : /* We haven't computed this yet in this session, so do it */
1557 : HeapTuple tp;
4403 tgl 1558 ECB : Form_pg_collation collform;
1559 : struct pg_locale_struct result;
2027 1560 : pg_locale_t resultp;
1561 : Datum datum;
702 tmunro 1562 : bool isnull;
4443 peter_e 1563 :
4403 tgl 1564 GIC 113 : tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
4403 tgl 1565 CBC 113 : if (!HeapTupleIsValid(tp))
4403 tgl 1566 UIC 0 : elog(ERROR, "cache lookup failed for collation %u", collid);
4403 tgl 1567 GIC 113 : collform = (Form_pg_collation) GETSTRUCT(tp);
4443 peter_e 1568 ECB :
1569 : /* We'll fill in the result struct locally before allocating memory */
2027 tgl 1570 CBC 113 : memset(&result, 0, sizeof(result));
2027 tgl 1571 GIC 113 : result.provider = collform->collprovider;
1479 peter 1572 113 : result.deterministic = collform->collisdeterministic;
1573 :
2208 peter_e 1574 113 : if (collform->collprovider == COLLPROVIDER_LIBC)
1575 : {
1576 : #ifdef HAVE_LOCALE_T
1577 : const char *collcollate;
1578 : const char *collctype pg_attribute_unused();
1579 : locale_t loc;
2208 peter_e 1580 ECB :
15 dgustafsson 1581 GNC 21 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
388 peter 1582 CBC 21 : collcollate = TextDatumGetCString(datum);
15 dgustafsson 1583 GNC 21 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
388 peter 1584 CBC 21 : collctype = TextDatumGetCString(datum);
388 peter 1585 ECB :
2208 peter_e 1586 CBC 21 : if (strcmp(collcollate, collctype) == 0)
1587 : {
2208 peter_e 1588 ECB : /* Normal case where they're the same */
2077 tgl 1589 GIC 21 : errno = 0;
1590 : #ifndef WIN32
2208 peter_e 1591 21 : loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collcollate,
1592 : NULL);
1593 : #else
1594 : loc = _create_locale(LC_ALL, collcollate);
4382 peter_e 1595 ECB : #endif
2208 peter_e 1596 CBC 21 : if (!loc)
1597 3 : report_newlocale_failure(collcollate);
2208 peter_e 1598 ECB : }
1599 : else
1600 : {
1601 : #ifndef WIN32
1602 : /* We need two newlocale() steps */
1603 : locale_t loc1;
1604 :
2077 tgl 1605 LBC 0 : errno = 0;
2208 peter_e 1606 UIC 0 : loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL);
1607 0 : if (!loc1)
1608 0 : report_newlocale_failure(collcollate);
2077 tgl 1609 0 : errno = 0;
2208 peter_e 1610 LBC 0 : loc = newlocale(LC_CTYPE_MASK, collctype, loc1);
1611 0 : if (!loc)
2208 peter_e 1612 UIC 0 : report_newlocale_failure(collctype);
1613 : #else
1614 :
1615 : /*
1616 : * XXX The _create_locale() API doesn't appear to support
1617 : * this. Could perhaps be worked around by changing
1618 : * pg_locale_t to contain two separate fields.
2208 peter_e 1619 EUB : */
1620 : ereport(ERROR,
1621 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1622 : errmsg("collations with different collate and ctype values are not supported on this platform")));
1623 : #endif
1624 : }
1625 :
2027 tgl 1626 GBC 18 : result.info.lt = loc;
1627 : #else /* not HAVE_LOCALE_T */
1628 : /* platform that doesn't support locale_t */
1629 : ereport(ERROR,
1630 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1631 : errmsg("collation provider LIBC is not supported on this platform")));
1632 : #endif /* not HAVE_LOCALE_T */
1633 : }
2208 peter_e 1634 GIC 92 : else if (collform->collprovider == COLLPROVIDER_ICU)
1635 : {
1636 : const char *iculocstr;
1637 : const char *icurules;
1638 :
15 dgustafsson 1639 GNC 92 : datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colliculocale);
388 peter 1640 CBC 92 : iculocstr = TextDatumGetCString(datum);
1641 :
32 peter 1642 GNC 92 : datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
1643 92 : if (!isnull)
1644 6 : icurules = TextDatumGetCString(datum);
1645 : else
1646 86 : icurules = NULL;
1647 :
1648 92 : make_icu_collator(iculocstr, icurules, &result);
1649 : }
1650 :
437 peter 1651 GIC 107 : datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
1652 : &isnull);
702 tmunro 1653 107 : if (!isnull)
1654 : {
702 tmunro 1655 ECB : char *actual_versionstr;
1656 : char *collversionstr;
1657 :
437 peter 1658 GIC 104 : collversionstr = TextDatumGetCString(datum);
1659 :
15 dgustafsson 1660 GNC 104 : datum = SysCacheGetAttrNotNull(COLLOID, tp, collform->collprovider == COLLPROVIDER_ICU ? Anum_pg_collation_colliculocale : Anum_pg_collation_collcollate);
1661 :
388 peter 1662 CBC 104 : actual_versionstr = get_collation_actual_version(collform->collprovider,
1663 104 : TextDatumGetCString(datum));
702 tmunro 1664 104 : if (!actual_versionstr)
1665 : {
702 tmunro 1666 ECB : /*
1667 : * This could happen when specifying a version in CREATE
423 peter 1668 : * COLLATION but the provider does not support versioning, or
1669 : * manually creating a mess in the catalogs.
1670 : */
702 tmunro 1671 LBC 0 : ereport(ERROR,
1672 : (errmsg("collation \"%s\" has no actual version, but a version was recorded",
702 tmunro 1673 ECB : NameStr(collform->collname))));
1674 : }
1675 :
702 tmunro 1676 GIC 104 : if (strcmp(actual_versionstr, collversionstr) != 0)
702 tmunro 1677 UIC 0 : ereport(WARNING,
702 tmunro 1678 ECB : (errmsg("collation \"%s\" has version mismatch",
1679 : NameStr(collform->collname)),
1680 : errdetail("The collation in the database was created using version %s, "
1681 : "but the operating system provides version %s.",
1682 : collversionstr, actual_versionstr),
1683 : errhint("Rebuild all objects affected by this collation and run "
1684 : "ALTER COLLATION %s REFRESH VERSION, "
1685 : "or build PostgreSQL with the right library version.",
1686 : quote_qualified_identifier(get_namespace_name(collform->collnamespace),
1687 : NameStr(collform->collname)))));
1688 : }
1689 :
4403 tgl 1690 GIC 107 : ReleaseSysCache(tp);
4403 tgl 1691 EUB :
1692 : /* We'll keep the pg_locale_t structures in TopMemoryContext */
2027 tgl 1693 GIC 107 : resultp = MemoryContextAlloc(TopMemoryContext, sizeof(*resultp));
1694 107 : *resultp = result;
1695 :
2027 tgl 1696 CBC 107 : cache_entry->locale = resultp;
4403 tgl 1697 EUB : }
1698 :
4403 tgl 1699 GIC 10678 : return cache_entry->locale;
1700 : }
1701 :
1702 : /*
1703 : * Get provider-specific collation version string for the given collation from
1704 : * the operating system/library.
1705 : */
1706 : char *
772 tmunro 1707 452360 : get_collation_actual_version(char collprovider, const char *collcollate)
1708 : {
1271 1709 452360 : char *collversion = NULL;
2208 peter_e 1710 ECB :
1711 : #ifdef USE_ICU
2208 peter_e 1712 GIC 452360 : if (collprovider == COLLPROVIDER_ICU)
2208 peter_e 1713 ECB : {
1714 : UCollator *collator;
1715 : UVersionInfo versioninfo;
1716 : char buf[U_MAX_VERSION_STRING_LENGTH];
1717 :
17 jdavis 1718 GNC 255271 : collator = pg_ucol_open(collcollate);
1719 :
2208 peter_e 1720 GIC 255271 : ucol_getVersion(collator, versioninfo);
1721 255271 : ucol_close(collator);
2208 peter_e 1722 ECB :
2208 peter_e 1723 GIC 255271 : u_versionToString(versioninfo, buf);
2208 peter_e 1724 CBC 255271 : collversion = pstrdup(buf);
1725 : }
1726 : else
2208 peter_e 1727 ECB : #endif
697 tgl 1728 GIC 394178 : if (collprovider == COLLPROVIDER_LIBC &&
1729 394145 : pg_strcasecmp("C", collcollate) != 0 &&
1730 393506 : pg_strncasecmp("C.", collcollate, 2) != 0 &&
1731 196450 : pg_strcasecmp("POSIX", collcollate) != 0)
1732 : {
1271 tmunro 1733 ECB : #if defined(__GLIBC__)
1734 : /* Use the glibc version because we don't have anything better. */
1271 tmunro 1735 CBC 196438 : collversion = pstrdup(gnu_get_libc_version());
870 tmunro 1736 ECB : #elif defined(LC_VERSION_MASK)
1737 : locale_t loc;
1738 :
1739 : /* Look up FreeBSD collation version. */
1740 : loc = newlocale(LC_COLLATE, collcollate, NULL);
1741 : if (loc)
1742 : {
1743 : collversion =
1744 : pstrdup(querylocale(LC_COLLATE_MASK | LC_VERSION_MASK, loc));
1745 : freelocale(loc);
1746 : }
1747 : else
1748 : ereport(ERROR,
1749 : (errmsg("could not load locale \"%s\"", collcollate)));
1750 : #elif defined(WIN32)
1751 : /*
1752 : * If we are targeting Windows Vista and above, we can ask for a name
1753 : * given a collation name (earlier versions required a location code
1754 : * that we don't have).
1755 : */
1756 : NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
1757 : WCHAR wide_collcollate[LOCALE_NAME_MAX_LENGTH];
1758 :
1759 : MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
1760 : LOCALE_NAME_MAX_LENGTH);
1761 : if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
1762 : {
1763 : /*
1764 : * GetNLSVersionEx() wants a language tag such as "en-US", not a
1765 : * locale name like "English_United States.1252". Until those
1766 : * values can be prevented from entering the system, or 100%
1767 : * reliably converted to the more useful tag format, tolerate the
1768 : * resulting error and report that we have no version data.
1769 : */
1770 : if (GetLastError() == ERROR_INVALID_PARAMETER)
1771 : return NULL;
1772 :
1773 : ereport(ERROR,
1774 : (errmsg("could not get collation version for locale \"%s\": error code %lu",
1775 : collcollate,
1776 : GetLastError())));
1777 : }
1778 : collversion = psprintf("%lu.%lu,%lu.%lu",
1779 : (version.dwNLSVersion >> 8) & 0xFFFF,
1780 : version.dwNLSVersion & 0xFF,
1781 : (version.dwDefinedVersion >> 8) & 0xFFFF,
1782 : version.dwDefinedVersion & 0xFF);
1783 : #endif
1784 : }
1785 :
2208 peter_e 1786 GIC 452360 : return collversion;
1787 : }
1788 :
/*
 * pg_strncoll_libc_win32_utf8
 *
 * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
 * invoke wcscoll() or wcscoll_l().
 *
 * 'arg1'/'arg2' are byte strings of length 'len1'/'len2' (no terminator
 * required).  If 'locale' is NULL, wcscoll() is used; otherwise
 * wcscoll_l() is called with the locale's locale_t.  Returns the comparison
 * result; raises an ERROR if conversion or comparison fails.
 */
#ifdef WIN32
static int
pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
							size_t len2, pg_locale_t locale)
{
	char		sbuf[TEXTBUFLEN];
	char	   *buf = sbuf;
	char	   *a1p,
			   *a2p;

	/* two bytes per input byte, plus room for a wide terminator */
	int			a1len = len1 * 2 + 2;
	int			a2len = len2 * 2 + 2;
	int			r;
	int			result;

	Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
	Assert(GetDatabaseEncoding() == PG_UTF8);

	/* fall back to the heap only when the stack buffer is too small */
	if (a1len + a2len > TEXTBUFLEN)
		buf = palloc(a1len + a2len);

	a1p = buf;
	a2p = buf + a1len;

	/* API does not work for zero-length input */
	if (len1 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
								(LPWSTR) a1p, a1len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	/* 'r' is the number of wide characters written; terminate after them */
	((LPWSTR) a1p)[r] = 0;

	if (len2 == 0)
		r = 0;
	else
	{
		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
								(LPWSTR) a2p, a2len / 2);
		if (!r)
			ereport(ERROR,
					(errmsg("could not convert string to UTF-16: error code %lu",
							GetLastError())));
	}
	((LPWSTR) a2p)[r] = 0;

	/* cleared so that %m below reflects the comparison call only */
	errno = 0;
#ifdef HAVE_LOCALE_T
	if (locale)
		result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
	else
#endif
		result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
								 * headers */
		ereport(ERROR,
				(errmsg("could not compare Unicode strings: %m")));

	if (buf != sbuf)
		pfree(buf);

	return result;
}
#endif							/* WIN32 */
1866 :
1867 : /*
1868 : * pg_strcoll_libc
1869 : *
1870 : * Call strcoll(), strcoll_l(), wcscoll(), or wcscoll_l() as appropriate for
1871 : * the given locale, platform, and database encoding. If the locale is NULL,
1872 : * use the database collation.
1873 : *
1874 : * Arguments must be encoded in the database encoding and nul-terminated.
1875 : */
1876 : static int
45 jdavis 1877 GNC 975 : pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
1878 : {
1879 : int result;
1880 :
1881 975 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
1882 : #ifdef WIN32
1883 : if (GetDatabaseEncoding() == PG_UTF8)
1884 : {
1885 : size_t len1 = strlen(arg1);
1886 : size_t len2 = strlen(arg2);
1887 : result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1888 : }
1889 : else
1890 : #endif /* WIN32 */
1891 975 : if (locale)
1892 : {
1893 : #ifdef HAVE_LOCALE_T
1894 975 : result = strcoll_l(arg1, arg2, locale->info.lt);
1895 : #else
1896 : /* shouldn't happen */
1897 : elog(ERROR, "unsupported collprovider: %c", locale->provider);
1898 : #endif
1899 : }
1900 : else
45 jdavis 1901 UNC 0 : result = strcoll(arg1, arg2);
1902 :
45 jdavis 1903 GNC 975 : return result;
1904 : }
1905 :
1906 : /*
1907 : * pg_strncoll_libc
1908 : *
1909 : * Nul-terminate the arguments and call pg_strcoll_libc().
1910 : */
1911 : static int
1912 252 : pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
1913 : pg_locale_t locale)
1914 : {
1915 : char sbuf[TEXTBUFLEN];
1916 252 : char *buf = sbuf;
1917 252 : size_t bufsize1 = len1 + 1;
1918 252 : size_t bufsize2 = len2 + 1;
1919 : char *arg1n;
1920 : char *arg2n;
1921 : int result;
1922 :
1923 252 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
1924 :
1925 : #ifdef WIN32
1926 : /* check for this case before doing the work for nul-termination */
1927 : if (GetDatabaseEncoding() == PG_UTF8)
1928 : return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
1929 : #endif /* WIN32 */
1930 :
1931 252 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
45 jdavis 1932 UNC 0 : buf = palloc(bufsize1 + bufsize2);
1933 :
45 jdavis 1934 GNC 252 : arg1n = buf;
1935 252 : arg2n = buf + bufsize1;
1936 :
1937 : /* nul-terminate arguments */
1938 252 : memcpy(arg1n, arg1, len1);
1939 252 : arg1n[len1] = '\0';
1940 252 : memcpy(arg2n, arg2, len2);
1941 252 : arg2n[len2] = '\0';
1942 :
1943 252 : result = pg_strcoll_libc(arg1n, arg2n, locale);
1944 :
1945 252 : if (buf != sbuf)
45 jdavis 1946 UNC 0 : pfree(buf);
1947 :
45 jdavis 1948 GNC 252 : return result;
1949 : }
1950 :
1951 : #ifdef USE_ICU
1952 :
1953 : /*
1954 : * pg_strncoll_icu_no_utf8
1955 : *
1956 : * Convert the arguments from the database encoding to UChar strings, then
1957 : * call ucol_strcoll(). An argument length of -1 means that the string is
1958 : * NUL-terminated.
1959 : *
1960 : * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
1961 : * caller should call that instead.
1962 : */
1963 : static int
45 jdavis 1964 UNC 0 : pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
1965 : const char *arg2, int32_t len2, pg_locale_t locale)
1966 : {
1967 : char sbuf[TEXTBUFLEN];
1968 0 : char *buf = sbuf;
1969 : int32_t ulen1;
1970 : int32_t ulen2;
1971 : size_t bufsize1;
1972 : size_t bufsize2;
1973 : UChar *uchar1,
1974 : *uchar2;
1975 : int result;
1976 :
1977 0 : Assert(locale->provider == COLLPROVIDER_ICU);
1978 : #ifdef HAVE_UCOL_STRCOLLUTF8
1979 0 : Assert(GetDatabaseEncoding() != PG_UTF8);
1980 : #endif
1981 :
1982 0 : init_icu_converter();
1983 :
1984 0 : ulen1 = uchar_length(icu_converter, arg1, len1);
1985 0 : ulen2 = uchar_length(icu_converter, arg2, len2);
1986 :
1987 0 : bufsize1 = (ulen1 + 1) * sizeof(UChar);
1988 0 : bufsize2 = (ulen2 + 1) * sizeof(UChar);
1989 :
1990 0 : if (bufsize1 + bufsize2 > TEXTBUFLEN)
1991 0 : buf = palloc(bufsize1 + bufsize2);
1992 :
1993 0 : uchar1 = (UChar *) buf;
1994 0 : uchar2 = (UChar *) (buf + bufsize1);
1995 :
1996 0 : ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
1997 0 : ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
1998 :
1999 0 : result = ucol_strcoll(locale->info.icu.ucol,
2000 : uchar1, ulen1,
2001 : uchar2, ulen2);
2002 :
2003 0 : if (buf != sbuf)
2004 0 : pfree(buf);
2005 :
2006 0 : return result;
2007 : }
2008 :
2009 : /*
2010 : * pg_strncoll_icu
2011 : *
2012 : * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
2013 : * database encoding. An argument length of -1 means the string is
2014 : * NUL-terminated.
2015 : *
2016 : * Arguments must be encoded in the database encoding.
2017 : */
2018 : static int
45 jdavis 2019 GNC 12027680 : pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
2020 : pg_locale_t locale)
2021 : {
2022 : int result;
2023 :
2024 12027680 : Assert(locale->provider == COLLPROVIDER_ICU);
2025 :
2026 : #ifdef HAVE_UCOL_STRCOLLUTF8
2027 12027680 : if (GetDatabaseEncoding() == PG_UTF8)
2028 : {
2029 : UErrorCode status;
2030 :
2031 12027680 : status = U_ZERO_ERROR;
2032 12027680 : result = ucol_strcollUTF8(locale->info.icu.ucol,
2033 : arg1, len1,
2034 : arg2, len2,
2035 : &status);
2036 12027680 : if (U_FAILURE(status))
45 jdavis 2037 UNC 0 : ereport(ERROR,
2038 : (errmsg("collation failed: %s", u_errorName(status))));
2039 : }
2040 : else
2041 : #endif
2042 : {
2043 0 : result = pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
2044 : }
2045 :
45 jdavis 2046 GNC 12027680 : return result;
2047 : }
2048 :
2049 : #endif /* USE_ICU */
2050 :
2051 : /*
2052 : * pg_strcoll
2053 : *
2054 : * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
2055 : * or wcscoll_l() as appropriate for the given locale, platform, and database
2056 : * encoding. If the locale is not specified, use the database collation.
2057 : *
2058 : * Arguments must be encoded in the database encoding and nul-terminated.
2059 : *
2060 : * The caller is responsible for breaking ties if the collation is
2061 : * deterministic; this maintains consistency with pg_strxfrm(), which cannot
2062 : * easily account for deterministic collations.
2063 : */
2064 : int
2065 10692953 : pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
2066 : {
2067 : int result;
2068 :
2069 10692953 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2070 723 : result = pg_strcoll_libc(arg1, arg2, locale);
2071 : #ifdef USE_ICU
2072 10692230 : else if (locale->provider == COLLPROVIDER_ICU)
2073 10692230 : result = pg_strncoll_icu(arg1, -1, arg2, -1, locale);
2074 : #endif
2075 : else
2076 : /* shouldn't happen */
45 jdavis 2077 UNC 0 : elog(ERROR, "unsupported collprovider: %c", locale->provider);
2078 :
45 jdavis 2079 GNC 10692953 : return result;
2080 : }
2081 :
2082 : /*
2083 : * pg_strncoll
2084 : *
2085 : * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll(), strcoll_l(), wcscoll(),
2086 : * or wcscoll_l() as appropriate for the given locale, platform, and database
2087 : * encoding. If the locale is not specified, use the database collation.
2088 : *
2089 : * Arguments must be encoded in the database encoding.
2090 : *
2091 : * This function may need to nul-terminate the arguments for libc functions;
2092 : * so if the caller already has nul-terminated strings, it should call
2093 : * pg_strcoll() instead.
2094 : *
2095 : * The caller is responsible for breaking ties if the collation is
2096 : * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
2097 : * easily account for deterministic collations.
2098 : */
2099 : int
2100 1335702 : pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
2101 : pg_locale_t locale)
2102 : {
2103 : int result;
2104 :
2105 1335702 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2106 252 : result = pg_strncoll_libc(arg1, len1, arg2, len2, locale);
2107 : #ifdef USE_ICU
2108 1335450 : else if (locale->provider == COLLPROVIDER_ICU)
2109 1335450 : result = pg_strncoll_icu(arg1, len1, arg2, len2, locale);
2110 : #endif
2111 : else
2112 : /* shouldn't happen */
45 jdavis 2113 UNC 0 : elog(ERROR, "unsupported collprovider: %c", locale->provider);
2114 :
45 jdavis 2115 GNC 1335702 : return result;
2116 : }
2117 :
2118 :
2119 : static size_t
45 jdavis 2120 UNC 0 : pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
2121 : pg_locale_t locale)
2122 : {
2123 0 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2124 :
2125 : #ifdef TRUST_STRXFRM
2126 : #ifdef HAVE_LOCALE_T
2127 : if (locale)
2128 : return strxfrm_l(dest, src, destsize, locale->info.lt);
2129 : else
2130 : #endif
2131 : return strxfrm(dest, src, destsize);
2132 : #else
2133 : /* shouldn't happen */
2134 0 : elog(ERROR, "unsupported collprovider: %c", locale->provider);
2135 : return 0; /* keep compiler quiet */
2136 : #endif
2137 : }
2138 :
2139 : static size_t
2140 0 : pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
2141 : pg_locale_t locale)
2142 : {
2143 : char sbuf[TEXTBUFLEN];
2144 0 : char *buf = sbuf;
2145 0 : size_t bufsize = srclen + 1;
2146 : size_t result;
2147 :
2148 0 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
2149 :
2150 0 : if (bufsize > TEXTBUFLEN)
2151 0 : buf = palloc(bufsize);
2152 :
2153 : /* nul-terminate arguments */
2154 0 : memcpy(buf, src, srclen);
2155 0 : buf[srclen] = '\0';
2156 :
2157 0 : result = pg_strxfrm_libc(dest, buf, destsize, locale);
2158 :
2159 0 : if (buf != sbuf)
2160 0 : pfree(buf);
2161 :
2162 : /* if dest is defined, it should be nul-terminated */
2163 0 : Assert(result >= destsize || dest[result] == '\0');
2164 :
2165 0 : return result;
2166 : }
2167 :
2168 : #ifdef USE_ICU
2169 :
2170 : /* 'srclen' of -1 means the strings are NUL-terminated */
2171 : static size_t
45 jdavis 2172 GNC 498 : pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
2173 : pg_locale_t locale)
2174 : {
2175 : char sbuf[TEXTBUFLEN];
2176 498 : char *buf = sbuf;
2177 : UChar *uchar;
2178 : int32_t ulen;
2179 : size_t uchar_bsize;
2180 : Size result_bsize;
2181 :
2182 498 : Assert(locale->provider == COLLPROVIDER_ICU);
2183 :
2184 498 : init_icu_converter();
2185 :
2186 498 : ulen = uchar_length(icu_converter, src, srclen);
2187 :
2188 498 : uchar_bsize = (ulen + 1) * sizeof(UChar);
2189 :
2190 498 : if (uchar_bsize > TEXTBUFLEN)
45 jdavis 2191 UNC 0 : buf = palloc(uchar_bsize);
2192 :
45 jdavis 2193 GNC 498 : uchar = (UChar *) buf;
2194 :
2195 498 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
2196 :
2197 498 : result_bsize = ucol_getSortKey(locale->info.icu.ucol,
2198 : uchar, ulen,
2199 : (uint8_t *) dest, destsize);
2200 :
2201 : /*
2202 : * ucol_getSortKey() counts the nul-terminator in the result length, but
2203 : * this function should not.
2204 : */
2205 498 : Assert(result_bsize > 0);
2206 498 : result_bsize--;
2207 :
2208 498 : if (buf != sbuf)
45 jdavis 2209 UNC 0 : pfree(buf);
2210 :
2211 : /* if dest is defined, it should be nul-terminated */
45 jdavis 2212 GNC 498 : Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
2213 :
2214 498 : return result_bsize;
2215 : }
2216 :
2217 : /* 'srclen' of -1 means the strings are NUL-terminated */
2218 : static size_t
45 jdavis 2219 UNC 0 : pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
2220 : int32_t destsize, pg_locale_t locale)
2221 : {
2222 : char sbuf[TEXTBUFLEN];
2223 0 : char *buf = sbuf;
2224 : UCharIterator iter;
2225 : uint32_t state[2];
2226 : UErrorCode status;
2227 0 : int32_t ulen = -1;
2228 0 : UChar *uchar = NULL;
2229 : size_t uchar_bsize;
2230 : Size result_bsize;
2231 :
2232 0 : Assert(locale->provider == COLLPROVIDER_ICU);
2233 0 : Assert(GetDatabaseEncoding() != PG_UTF8);
2234 :
2235 0 : init_icu_converter();
2236 :
2237 0 : ulen = uchar_length(icu_converter, src, srclen);
2238 :
2239 0 : uchar_bsize = (ulen + 1) * sizeof(UChar);
2240 :
2241 0 : if (uchar_bsize > TEXTBUFLEN)
2242 0 : buf = palloc(uchar_bsize);
2243 :
2244 0 : uchar = (UChar *) buf;
2245 :
2246 0 : ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
2247 :
2248 0 : uiter_setString(&iter, uchar, ulen);
2249 0 : state[0] = state[1] = 0; /* won't need that again */
2250 0 : status = U_ZERO_ERROR;
2251 0 : result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
2252 : &iter,
2253 : state,
2254 : (uint8_t *) dest,
2255 : destsize,
2256 : &status);
2257 0 : if (U_FAILURE(status))
2258 0 : ereport(ERROR,
2259 : (errmsg("sort key generation failed: %s",
2260 : u_errorName(status))));
2261 :
2262 0 : return result_bsize;
2263 : }
2264 :
2265 : /* 'srclen' of -1 means the strings are NUL-terminated */
2266 : static size_t
45 jdavis 2267 GNC 166790 : pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
2268 : int32_t destsize, pg_locale_t locale)
2269 : {
2270 : size_t result;
2271 :
2272 166790 : Assert(locale->provider == COLLPROVIDER_ICU);
2273 :
2274 166790 : if (GetDatabaseEncoding() == PG_UTF8)
2275 : {
2276 : UCharIterator iter;
2277 : uint32_t state[2];
2278 : UErrorCode status;
2279 :
2280 166790 : uiter_setUTF8(&iter, src, srclen);
2281 166790 : state[0] = state[1] = 0; /* won't need that again */
2282 166790 : status = U_ZERO_ERROR;
2283 166790 : result = ucol_nextSortKeyPart(locale->info.icu.ucol,
2284 : &iter,
2285 : state,
2286 : (uint8_t *) dest,
2287 : destsize,
2288 : &status);
2289 166790 : if (U_FAILURE(status))
45 jdavis 2290 UNC 0 : ereport(ERROR,
2291 : (errmsg("sort key generation failed: %s",
2292 : u_errorName(status))));
2293 : }
2294 : else
2295 0 : result = pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize,
2296 : locale);
2297 :
45 jdavis 2298 GNC 166790 : return result;
2299 : }
2300 :
2301 : #endif
2302 :
2303 : /*
2304 : * Return true if the collation provider supports pg_strxfrm() and
2305 : * pg_strnxfrm(); otherwise false.
2306 : *
2307 : * Unfortunately, it seems that strxfrm() for non-C collations is broken on
2308 : * many common platforms; testing of multiple versions of glibc reveals that,
2309 : * for many locales, strcoll() and strxfrm() do not return consistent
2310 : * results. While no other libc other than Cygwin has so far been shown to
2311 : * have a problem, we take the conservative course of action for right now and
2312 : * disable this categorically. (Users who are certain this isn't a problem on
2313 : * their system can define TRUST_STRXFRM.)
2314 : *
2315 : * No similar problem is known for the ICU provider.
2316 : */
2317 : bool
2318 26140 : pg_strxfrm_enabled(pg_locale_t locale)
2319 : {
2320 26140 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2321 : #ifdef TRUST_STRXFRM
2322 : return true;
2323 : #else
2324 105 : return false;
2325 : #endif
2326 26035 : else if (locale->provider == COLLPROVIDER_ICU)
2327 26035 : return true;
2328 : else
2329 : /* shouldn't happen */
45 jdavis 2330 UNC 0 : elog(ERROR, "unsupported collprovider: %c", locale->provider);
2331 :
2332 : return false; /* keep compiler quiet */
2333 : }
2334 :
2335 : /*
2336 : * pg_strxfrm
2337 : *
2338 : * Transforms 'src' to a nul-terminated string stored in 'dest' such that
2339 : * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
2340 : * untransformed strings.
2341 : *
2342 : * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
2343 : * may be NULL.
2344 : *
2345 : * Returns the number of bytes needed to store the transformed string,
2346 : * excluding the terminating nul byte. If the value returned is 'destsize' or
2347 : * greater, the resulting contents of 'dest' are undefined.
2348 : */
2349 : size_t
2350 0 : pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
2351 : {
42 tgl 2352 0 : size_t result = 0; /* keep compiler quiet */
2353 :
45 jdavis 2354 0 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2355 0 : result = pg_strxfrm_libc(dest, src, destsize, locale);
2356 : #ifdef USE_ICU
2357 0 : else if (locale->provider == COLLPROVIDER_ICU)
2358 0 : result = pg_strnxfrm_icu(dest, src, -1, destsize, locale);
2359 : #endif
2360 : else
2361 : /* shouldn't happen */
2362 0 : elog(ERROR, "unsupported collprovider: %c", locale->provider);
2363 :
2364 0 : return result;
2365 : }
2366 :
2367 : /*
2368 : * pg_strnxfrm
2369 : *
2370 : * Transforms 'src' to a nul-terminated string stored in 'dest' such that
2371 : * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
2372 : * untransformed strings.
2373 : *
2374 : * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
2375 : * be NULL.
2376 : *
2377 : * Returns the number of bytes needed to store the transformed string,
2378 : * excluding the terminating nul byte. If the value returned is 'destsize' or
2379 : * greater, the resulting contents of 'dest' are undefined.
2380 : *
2381 : * This function may need to nul-terminate the argument for libc functions;
2382 : * so if the caller already has a nul-terminated string, it should call
2383 : * pg_strxfrm() instead.
2384 : */
2385 : size_t
45 jdavis 2386 GNC 498 : pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
2387 : pg_locale_t locale)
2388 : {
42 tgl 2389 498 : size_t result = 0; /* keep compiler quiet */
2390 :
45 jdavis 2391 498 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
45 jdavis 2392 UNC 0 : result = pg_strnxfrm_libc(dest, src, srclen, destsize, locale);
2393 : #ifdef USE_ICU
45 jdavis 2394 GNC 498 : else if (locale->provider == COLLPROVIDER_ICU)
2395 498 : result = pg_strnxfrm_icu(dest, src, srclen, destsize, locale);
2396 : #endif
2397 : else
2398 : /* shouldn't happen */
45 jdavis 2399 UNC 0 : elog(ERROR, "unsupported collprovider: %c", locale->provider);
2400 :
45 jdavis 2401 GNC 498 : return result;
2402 : }
2403 :
2404 : /*
2405 : * Return true if the collation provider supports pg_strxfrm_prefix() and
2406 : * pg_strnxfrm_prefix(); otherwise false.
2407 : */
2408 : bool
2409 166790 : pg_strxfrm_prefix_enabled(pg_locale_t locale)
2410 : {
2411 166790 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
45 jdavis 2412 UNC 0 : return false;
45 jdavis 2413 GNC 166790 : else if (locale->provider == COLLPROVIDER_ICU)
2414 166790 : return true;
2415 : else
2416 : /* shouldn't happen */
45 jdavis 2417 UNC 0 : elog(ERROR, "unsupported collprovider: %c", locale->provider);
2418 :
2419 : return false; /* keep compiler quiet */
2420 : }
2421 :
2422 : /*
2423 : * pg_strxfrm_prefix
2424 : *
2425 : * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
2426 : * memcmp() on the byte sequence is equivalent to pg_strcoll() on
2427 : * untransformed strings. The result is not nul-terminated.
2428 : *
2429 : * The provided 'src' must be nul-terminated.
2430 : *
2431 : * If destsize is not large enough to hold the resulting byte sequence, stores
2432 : * only the first destsize bytes in 'dest'. Returns the number of bytes
2433 : * actually copied to 'dest'.
2434 : */
2435 : size_t
45 jdavis 2436 GNC 166790 : pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
2437 : pg_locale_t locale)
2438 : {
44 2439 166790 : size_t result = 0; /* keep compiler quiet */
2440 :
45 2441 166790 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
45 jdavis 2442 UNC 0 : elog(ERROR, "collprovider '%c' does not support pg_strxfrm_prefix()",
2443 : locale->provider);
2444 : #ifdef USE_ICU
45 jdavis 2445 GNC 166790 : else if (locale->provider == COLLPROVIDER_ICU)
2446 166790 : result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
2447 : #endif
2448 : else
2449 : /* shouldn't happen */
45 jdavis 2450 UNC 0 : elog(ERROR, "unsupported collprovider: %c", locale->provider);
2451 :
45 jdavis 2452 GNC 166790 : return result;
2453 : }
2454 :
2455 : /*
2456 : * pg_strnxfrm_prefix
2457 : *
2458 : * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
2459 : * memcmp() on the byte sequence is equivalent to pg_strcoll() on
2460 : * untransformed strings. The result is not nul-terminated.
2461 : *
2462 : * The provided 'src' must be nul-terminated.
2463 : *
2464 : * If destsize is not large enough to hold the resulting byte sequence, stores
2465 : * only the first destsize bytes in 'dest'. Returns the number of bytes
2466 : * actually copied to 'dest'.
2467 : *
2468 : * This function may need to nul-terminate the argument for libc functions;
2469 : * so if the caller already has a nul-terminated string, it should call
2470 : * pg_strxfrm_prefix() instead.
2471 : */
2472 : size_t
45 jdavis 2473 UNC 0 : pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
2474 : size_t srclen, pg_locale_t locale)
2475 : {
44 2476 0 : size_t result = 0; /* keep compiler quiet */
2477 :
45 2478 0 : if (!locale || locale->provider == COLLPROVIDER_LIBC)
2479 0 : elog(ERROR, "collprovider '%c' does not support pg_strnxfrm_prefix()",
2480 : locale->provider);
2481 : #ifdef USE_ICU
2482 0 : else if (locale->provider == COLLPROVIDER_ICU)
2483 0 : result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
2484 : #endif
2485 : else
2486 : /* shouldn't happen */
2487 0 : elog(ERROR, "unsupported collprovider: %c", locale->provider);
2488 :
2489 0 : return result;
2490 : }
2491 :
2492 : #ifdef USE_ICU
2493 :
2494 : /*
2495 : * Wrapper around ucol_open() to handle API differences for older ICU
2496 : * versions.
2497 : */
2498 : static UCollator *
17 jdavis 2499 GNC 266329 : pg_ucol_open(const char *loc_str)
2500 : {
2501 : UCollator *collator;
2502 : UErrorCode status;
12 2503 266329 : const char *orig_str = loc_str;
17 2504 266329 : char *fixed_str = NULL;
2505 :
2506 : /*
2507 : * Must never open default collator, because it depends on the environment
2508 : * and may change at any time. Should not happen, but check here to catch
2509 : * bugs that might be hard to catch otherwise.
2510 : *
2511 : * NB: the default collator is not the same as the collator for the root
2512 : * locale. The root locale may be specified as the empty string, "und", or
2513 : * "root". The default collator is opened by passing NULL to ucol_open().
2514 : */
2515 266329 : if (loc_str == NULL)
16 jdavis 2516 UNC 0 : elog(ERROR, "opening default collator is not supported");
2517 :
2518 : /*
2519 : * In ICU versions 54 and earlier, "und" is not a recognized spelling of
2520 : * the root locale. If the first component of the locale is "und", replace
2521 : * with "root" before opening.
2522 : */
2523 : if (U_ICU_VERSION_MAJOR_NUM < 55)
2524 : {
2525 : char lang[ULOC_LANG_CAPACITY];
2526 :
2527 : status = U_ZERO_ERROR;
2528 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2529 : if (U_FAILURE(status))
2530 : {
2531 : ereport(ERROR,
2532 : (errmsg("could not get language from locale \"%s\": %s",
2533 : loc_str, u_errorName(status))));
2534 : }
2535 :
2536 : if (strcmp(lang, "und") == 0)
2537 : {
2538 : const char *remainder = loc_str + strlen("und");
2539 :
2540 : fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
2541 : strcpy(fixed_str, "root");
2542 : strcat(fixed_str, remainder);
2543 :
2544 : loc_str = fixed_str;
2545 : }
2546 : }
2547 :
17 jdavis 2548 GNC 266329 : status = U_ZERO_ERROR;
2549 266329 : collator = ucol_open(loc_str, &status);
2550 266329 : if (U_FAILURE(status))
2551 6 : ereport(ERROR,
2552 : /* use original string for error report */
2553 : (errmsg("could not open collator for locale \"%s\": %s",
2554 : orig_str, u_errorName(status))));
2555 :
2556 : if (U_ICU_VERSION_MAJOR_NUM < 54)
2557 : {
2558 : status = U_ZERO_ERROR;
2559 : icu_set_collation_attributes(collator, loc_str, &status);
2560 :
2561 : /*
2562 : * Pretend the error came from ucol_open(), for consistent error
2563 : * message across ICU versions.
2564 : */
2565 : if (U_FAILURE(status))
2566 : {
2567 : ucol_close(collator);
2568 : ereport(ERROR,
2569 : (errmsg("could not open collator for locale \"%s\": %s",
2570 : orig_str, u_errorName(status))));
2571 : }
2572 : }
2573 :
2574 266323 : if (fixed_str != NULL)
17 jdavis 2575 UNC 0 : pfree(fixed_str);
2576 :
17 jdavis 2577 GNC 266323 : return collator;
2578 : }
2579 :
2580 : static void
2208 peter_e 2581 GIC 1178278 : init_icu_converter(void)
2582 : {
2208 peter_e 2583 ECB : const char *icu_encoding_name;
2584 : UErrorCode status;
2585 : UConverter *conv;
2586 :
2208 peter_e 2587 GIC 1178278 : if (icu_converter)
1179 tgl 2588 1178223 : return; /* already done */
2589 :
2208 peter_e 2590 55 : icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
1179 tgl 2591 55 : if (!icu_encoding_name)
1179 tgl 2592 UIC 0 : ereport(ERROR,
2593 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2594 : errmsg("encoding \"%s\" not supported by ICU",
2595 : pg_encoding_to_char(GetDatabaseEncoding()))));
2596 :
2208 peter_e 2597 GIC 55 : status = U_ZERO_ERROR;
2598 55 : conv = ucnv_open(icu_encoding_name, &status);
2599 55 : if (U_FAILURE(status))
2208 peter_e 2600 UIC 0 : ereport(ERROR,
2601 : (errmsg("could not open ICU converter for encoding \"%s\": %s",
2602 : icu_encoding_name, u_errorName(status))));
2603 :
2208 peter_e 2604 GIC 55 : icu_converter = conv;
2605 : }
2606 :
2607 : /*
2608 : * Find length, in UChars, of given string if converted to UChar string.
2609 : */
2610 : static size_t
45 jdavis 2611 GNC 589391 : uchar_length(UConverter *converter, const char *str, int32_t len)
2612 : {
2613 589391 : UErrorCode status = U_ZERO_ERROR;
2614 : int32_t ulen;
2615 589391 : ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
2616 589391 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
45 jdavis 2617 UNC 0 : ereport(ERROR,
2618 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
45 jdavis 2619 GNC 589391 : return ulen;
2620 : }
2621 :
2622 : /*
2623 : * Convert the given source string into a UChar string, stored in dest, and
2624 : * return the length (in UChars).
2625 : */
2626 : static int32_t
2627 589391 : uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
2628 : const char *src, int32_t srclen)
2629 : {
2630 589391 : UErrorCode status = U_ZERO_ERROR;
2631 : int32_t ulen;
2632 589391 : status = U_ZERO_ERROR;
2633 589391 : ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
2634 589391 : if (U_FAILURE(status))
45 jdavis 2635 UNC 0 : ereport(ERROR,
2636 : (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
45 jdavis 2637 GNC 589391 : return ulen;
2638 : }
2639 :
2640 : /*
2641 : * Convert a string in the database encoding into a string of UChars.
2642 : *
2643 : * The source string at buff is of length nbytes
2644 : * (it needn't be nul-terminated)
2645 : *
2646 : * *buff_uchar receives a pointer to the palloc'd result string, and
2647 : * the function's result is the number of UChars generated.
2648 : *
2649 : * The result string is nul-terminated, though most callers rely on the
2650 : * result length instead.
2651 : */
2652 : int32_t
2208 peter_e 2653 GIC 588893 : icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
2654 : {
2655 : int32_t len_uchar;
2656 :
2657 588893 : init_icu_converter();
2658 :
45 jdavis 2659 GNC 588893 : len_uchar = uchar_length(icu_converter, buff, nbytes);
2660 :
2115 peter_e 2661 GIC 588893 : *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
45 jdavis 2662 GNC 588893 : len_uchar = uchar_convert(icu_converter,
2663 : *buff_uchar, len_uchar + 1, buff, nbytes);
2664 :
2208 peter_e 2665 GIC 588893 : return len_uchar;
2666 : }
2667 :
2668 : /*
2669 : * Convert a string of UChars into the database encoding.
2670 : *
2671 : * The source string at buff_uchar is of length len_uchar
2672 : * (it needn't be nul-terminated)
2673 : *
2674 : * *result receives a pointer to the palloc'd result string, and the
2675 : * function's result is the number of bytes generated (not counting nul).
2676 : *
2677 : * The result string is nul-terminated.
2678 : */
2679 : int32_t
2116 tgl 2680 588887 : icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
2681 : {
2682 : UErrorCode status;
2683 : int32_t len_result;
2684 :
2208 peter_e 2685 588887 : init_icu_converter();
2686 :
2115 2687 588887 : status = U_ZERO_ERROR;
2688 588887 : len_result = ucnv_fromUChars(icu_converter, NULL, 0,
2689 : buff_uchar, len_uchar, &status);
2690 588887 : if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
2115 peter_e 2691 UIC 0 : ereport(ERROR,
2692 : (errmsg("%s failed: %s", "ucnv_fromUChars",
2693 : u_errorName(status))));
2694 :
2208 peter_e 2695 GIC 588887 : *result = palloc(len_result + 1);
2115 peter_e 2696 ECB :
2208 peter_e 2697 GIC 588887 : status = U_ZERO_ERROR;
2116 tgl 2698 588887 : len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
2699 : buff_uchar, len_uchar, &status);
2208 peter_e 2700 CBC 588887 : if (U_FAILURE(status))
2208 peter_e 2701 UIC 0 : ereport(ERROR,
2702 : (errmsg("%s failed: %s", "ucnv_fromUChars",
2703 : u_errorName(status))));
2704 :
2208 peter_e 2705 GIC 588887 : return len_result;
2706 : }
2707 :
2708 : /*
2709 : * Parse collation attributes from the given locale string and apply them to
2710 : * the open collator.
2711 : *
2712 : * First, the locale string is canonicalized to an ICU format locale ID such
2713 : * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
2714 : * the key-value arguments.
2715 : *
1484 peter 2716 ECB : * Starting with ICU version 54, the attributes are processed automatically by
2717 : * ucol_open(), so this is only necessary for emulating this behavior on older
2718 : * versions.
2719 : */
2720 : pg_attribute_unused()
2721 : static void
12 jdavis 2722 UNC 0 : icu_set_collation_attributes(UCollator *collator, const char *loc,
2723 : UErrorCode *status)
1484 peter 2724 EUB : {
2725 : int32_t len;
2726 : char *icu_locale_id;
2727 : char *lower_str;
2728 : char *str;
19 jdavis 2729 ECB :
2730 : /*
2731 : * The input locale may be a BCP 47 language tag, e.g.
2732 : * "und-u-kc-ks-level1", which expresses the same attributes in a
2733 : * different form. It will be converted to the equivalent ICU format
2734 : * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
2735 : * uloc_canonicalize().
2736 : */
12 jdavis 2737 UNC 0 : *status = U_ZERO_ERROR;
2738 0 : len = uloc_canonicalize(loc, NULL, 0, status);
19 2739 0 : icu_locale_id = palloc(len + 1);
12 2740 0 : *status = U_ZERO_ERROR;
2741 0 : len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
2742 0 : if (U_FAILURE(*status))
2743 0 : return;
2744 :
19 2745 0 : lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
2746 :
2747 0 : pfree(icu_locale_id);
2748 :
2749 0 : str = strchr(lower_str, '@');
1484 peter 2750 UIC 0 : if (!str)
2751 0 : return;
2752 0 : str++;
2753 :
2754 0 : for (char *token = strtok(str, ";"); token; token = strtok(NULL, ";"))
2755 : {
2756 0 : char *e = strchr(token, '=');
1484 peter 2757 ECB :
1484 peter 2758 UIC 0 : if (e)
2759 : {
2760 : char *name;
1484 peter 2761 ECB : char *value;
1482 2762 : UColAttribute uattr;
2763 : UColAttributeValue uvalue;
2764 :
12 jdavis 2765 UNC 0 : *status = U_ZERO_ERROR;
2766 :
1484 peter 2767 LBC 0 : *e = '\0';
1484 peter 2768 UIC 0 : name = token;
2769 0 : value = e + 1;
2770 :
2771 : /*
2772 : * See attribute name and value lists in ICU i18n/coll.cpp
2773 : */
2774 0 : if (strcmp(name, "colstrength") == 0)
1484 peter 2775 LBC 0 : uattr = UCOL_STRENGTH;
1484 peter 2776 UBC 0 : else if (strcmp(name, "colbackwards") == 0)
1484 peter 2777 UIC 0 : uattr = UCOL_FRENCH_COLLATION;
1484 peter 2778 LBC 0 : else if (strcmp(name, "colcaselevel") == 0)
2779 0 : uattr = UCOL_CASE_LEVEL;
1484 peter 2780 UIC 0 : else if (strcmp(name, "colcasefirst") == 0)
2781 0 : uattr = UCOL_CASE_FIRST;
1484 peter 2782 LBC 0 : else if (strcmp(name, "colalternate") == 0)
2783 0 : uattr = UCOL_ALTERNATE_HANDLING;
2784 0 : else if (strcmp(name, "colnormalization") == 0)
2785 0 : uattr = UCOL_NORMALIZATION_MODE;
1484 peter 2786 UIC 0 : else if (strcmp(name, "colnumeric") == 0)
1484 peter 2787 LBC 0 : uattr = UCOL_NUMERIC_COLLATION;
2788 : else
1482 peter 2789 ECB : /* ignore if unknown */
1482 peter 2790 UBC 0 : continue;
2791 :
1484 peter 2792 LBC 0 : if (strcmp(value, "primary") == 0)
1484 peter 2793 UIC 0 : uvalue = UCOL_PRIMARY;
2794 0 : else if (strcmp(value, "secondary") == 0)
2795 0 : uvalue = UCOL_SECONDARY;
2796 0 : else if (strcmp(value, "tertiary") == 0)
2797 0 : uvalue = UCOL_TERTIARY;
2798 0 : else if (strcmp(value, "quaternary") == 0)
2799 0 : uvalue = UCOL_QUATERNARY;
2800 0 : else if (strcmp(value, "identical") == 0)
2801 0 : uvalue = UCOL_IDENTICAL;
2802 0 : else if (strcmp(value, "no") == 0)
2803 0 : uvalue = UCOL_OFF;
2804 0 : else if (strcmp(value, "yes") == 0)
2805 0 : uvalue = UCOL_ON;
2806 0 : else if (strcmp(value, "shifted") == 0)
2807 0 : uvalue = UCOL_SHIFTED;
1484 peter 2808 UBC 0 : else if (strcmp(value, "non-ignorable") == 0)
1484 peter 2809 UIC 0 : uvalue = UCOL_NON_IGNORABLE;
2810 0 : else if (strcmp(value, "lower") == 0)
2811 0 : uvalue = UCOL_LOWER_FIRST;
1484 peter 2812 UBC 0 : else if (strcmp(value, "upper") == 0)
1484 peter 2813 UIC 0 : uvalue = UCOL_UPPER_FIRST;
2814 : else
2815 : {
12 jdavis 2816 UNC 0 : *status = U_ILLEGAL_ARGUMENT_ERROR;
2817 0 : break;
2818 : }
2819 :
2820 0 : ucol_setAttribute(collator, uattr, uvalue, status);
1484 peter 2821 EUB : }
2822 : }
2823 :
19 jdavis 2824 UNC 0 : pfree(lower_str);
2825 : }
1484 peter 2826 EUB :
2827 : #endif
2828 :
5 jdavis 2829 : /*
2830 : * Return the BCP47 language tag representation of the requested locale.
2831 : *
2832 : * This function should be called before passing the string to ucol_open(),
2833 : * because conversion to a language tag also performs "level 2
2834 : * canonicalization". In addition to producing a consistent format, level 2
2835 : * canonicalization is able to more accurately interpret different input
2836 : * locale string formats, such as POSIX and .NET IDs.
2837 : */
2838 : char *
5 jdavis 2839 GNC 244303 : icu_language_tag(const char *loc_str, int elevel)
2840 : {
5 jdavis 2841 EUB : #ifdef USE_ICU
2842 : UErrorCode status;
2843 : char lang[ULOC_LANG_CAPACITY];
2844 : char *langtag;
5 jdavis 2845 GNC 244303 : size_t buflen = 32; /* arbitrary starting buffer size */
2846 244303 : const bool strict = true;
5 jdavis 2847 EUB :
5 jdavis 2848 GIC 244303 : status = U_ZERO_ERROR;
5 jdavis 2849 GNC 244303 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
5 jdavis 2850 GIC 244303 : if (U_FAILURE(status))
2851 : {
5 jdavis 2852 UNC 0 : if (elevel > 0)
2853 0 : ereport(elevel,
2854 : (errmsg("could not get language from locale \"%s\": %s",
2855 : loc_str, u_errorName(status))));
2856 0 : return NULL;
2857 : }
5 jdavis 2858 EUB :
2859 : /* C/POSIX locales aren't handled by uloc_getLanguageTag() */
5 jdavis 2860 GNC 244303 : if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
2861 2 : return pstrdup("en-US-u-va-posix");
2862 :
2863 : /*
2864 : * A BCP47 language tag doesn't have a clearly-defined upper limit
2865 : * (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
2866 : * uloc_toLanguageTag() doesn't always return the ultimate length on the
2867 : * first call, necessitating a loop.
2868 : */
2869 244301 : langtag = palloc(buflen);
2870 : while (true)
5 jdavis 2871 UNC 0 : {
2872 : int32_t len;
2873 :
5 jdavis 2874 GNC 244301 : status = U_ZERO_ERROR;
2875 244301 : len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
2876 :
2877 : /*
2878 : * If the result fits in the buffer exactly (len == buflen),
2879 : * uloc_toLanguageTag() will return success without nul-terminating
2880 : * the result. Check for either U_BUFFER_OVERFLOW_ERROR or len >=
2881 : * buflen and try again.
2882 : */
2883 244301 : if ((status == U_BUFFER_OVERFLOW_ERROR ||
2884 244301 : (U_SUCCESS(status) && len >= buflen)) &&
2885 : buflen < MaxAllocSize)
2886 : {
5 jdavis 2887 UNC 0 : buflen = Min(buflen * 2, MaxAllocSize);
2888 0 : langtag = repalloc(langtag, buflen);
2889 0 : continue;
2890 : }
2891 :
5 jdavis 2892 GNC 244301 : break;
2893 : }
2894 :
2895 244301 : if (U_FAILURE(status))
2896 : {
2897 6 : pfree(langtag);
2898 :
2899 6 : if (elevel > 0)
2900 6 : ereport(elevel,
2901 : (errmsg("could not convert locale name \"%s\" to language tag: %s",
2902 : loc_str, u_errorName(status))));
2903 3 : return NULL;
2904 : }
2905 :
2906 244295 : return langtag;
2907 : #else /* not USE_ICU */
2908 : ereport(ERROR,
2909 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2910 : errmsg("ICU is not supported in this build")));
2911 : return NULL; /* keep compiler quiet */
2912 : #endif /* not USE_ICU */
2913 : }
2914 :
2915 : /*
2916 : * Perform best-effort check that the locale is a valid one.
2917 : */
2918 : void
12 2919 855 : icu_validate_locale(const char *loc_str)
2920 : {
2921 : #ifdef USE_ICU
2922 : UCollator *collator;
2923 : UErrorCode status;
2924 : char lang[ULOC_LANG_CAPACITY];
2925 855 : bool found = false;
2926 855 : int elevel = icu_validation_level;
2927 :
2928 : /* no validation */
2929 855 : if (elevel < 0)
12 jdavis 2930 UNC 0 : return;
2931 :
2932 : /* downgrade to WARNING during pg_upgrade */
12 jdavis 2933 GNC 855 : if (IsBinaryUpgrade && elevel > WARNING)
2934 7 : elevel = WARNING;
2935 :
2936 : /* validate that we can extract the language */
2937 855 : status = U_ZERO_ERROR;
2938 855 : uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2939 855 : if (U_FAILURE(status))
2940 : {
12 jdavis 2941 UNC 0 : ereport(elevel,
2942 : (errmsg("could not get language from ICU locale \"%s\": %s",
2943 : loc_str, u_errorName(status)),
2944 : errhint("To disable ICU locale validation, set parameter icu_validation_level to DISABLED.")));
2945 0 : return;
2946 : }
2947 :
2948 : /* check for special language name */
12 jdavis 2949 GNC 855 : if (strcmp(lang, "") == 0 ||
2950 804 : strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0 ||
2951 804 : strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
2952 51 : found = true;
2953 :
2954 : /* search for matching language within ICU */
2955 118707 : for (int32_t i = 0; !found && i < uloc_countAvailable(); i++)
2956 : {
2957 117852 : const char *otherloc = uloc_getAvailable(i);
2958 : char otherlang[ULOC_LANG_CAPACITY];
2959 :
2960 117852 : status = U_ZERO_ERROR;
2961 117852 : uloc_getLanguage(otherloc, otherlang, ULOC_LANG_CAPACITY, &status);
2962 117852 : if (U_FAILURE(status))
12 jdavis 2963 UNC 0 : continue;
2964 :
12 jdavis 2965 GNC 117852 : if (strcmp(lang, otherlang) == 0)
2966 798 : found = true;
2967 : }
2968 :
2969 855 : if (!found)
2970 6 : ereport(elevel,
2971 : (errmsg("ICU locale \"%s\" has unknown language \"%s\"",
2972 : loc_str, lang),
2973 : errhint("To disable ICU locale validation, set parameter icu_validation_level to DISABLED.")));
2974 :
2975 : /* check that it can be opened */
2976 852 : collator = pg_ucol_open(loc_str);
385 peter 2977 848 : ucol_close(collator);
2978 : #else /* not USE_ICU */
2979 : /* could get here if a collation was created by a build with ICU */
2980 : ereport(ERROR,
2981 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2982 : errmsg("ICU is not supported in this build")));
2983 : #endif /* not USE_ICU */
2984 : }
2985 :
2986 : /*
2987 : * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
4369 tgl 2988 ECB : * Therefore we keep them here rather than with the mbutils code.
2989 : */
2990 :
2991 : /*
2992 : * wchar2char --- convert wide characters to multibyte format
2993 : *
2994 : * This has the same API as the standard wcstombs_l() function; in particular,
2995 : * tolen is the maximum number of bytes to store at *to, and *from must be
2996 : * zero-terminated. The output will be zero-terminated iff there is room.
2997 : */
2998 : size_t
4369 tgl 2999 GIC 136185 : wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale)
4369 tgl 3000 ECB : {
3001 : size_t result;
3002 :
2208 peter_e 3003 GIC 136185 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
3004 :
4369 tgl 3005 CBC 136185 : if (tolen == 0)
4369 tgl 3006 UBC 0 : return 0;
3007 :
3008 : #ifdef WIN32
3009 :
3010 : /*
3011 : * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
4369 tgl 3012 EUB : * for some reason mbstowcs and wcstombs won't do this for us, so we use
3013 : * MultiByteToWideChar().
3014 : */
4369 tgl 3015 ECB : if (GetDatabaseEncoding() == PG_UTF8)
3016 : {
3017 : result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
3018 : NULL, NULL);
3019 : /* A zero return is failure */
3020 : if (result <= 0)
3021 : result = -1;
3022 : else
3023 : {
3024 : Assert(result <= tolen);
3025 : /* Microsoft counts the zero terminator in the result */
3026 : result--;
3027 : }
3028 : }
3029 : else
3030 : #endif /* WIN32 */
4369 tgl 3031 GIC 136185 : if (locale == (pg_locale_t) 0)
3032 : {
3033 : /* Use wcstombs directly for the default locale */
4369 tgl 3034 CBC 135993 : result = wcstombs(to, from, tolen);
3035 : }
3036 : else
3037 : {
4369 tgl 3038 ECB : #ifdef HAVE_LOCALE_T
3039 : #ifdef HAVE_WCSTOMBS_L
3040 : /* Use wcstombs_l for nondefault locales */
2208 peter_e 3041 : result = wcstombs_l(to, from, tolen, locale->info.lt);
4322 bruce 3042 : #else /* !HAVE_WCSTOMBS_L */
3043 : /* We have to temporarily set the locale as current ... ugh */
2208 peter_e 3044 GIC 192 : locale_t save_locale = uselocale(locale->info.lt);
3045 :
4369 tgl 3046 GBC 192 : result = wcstombs(to, from, tolen);
3047 :
4369 tgl 3048 CBC 192 : uselocale(save_locale);
3049 : #endif /* HAVE_WCSTOMBS_L */
3050 : #else /* !HAVE_LOCALE_T */
3051 : /* Can't have locale != 0 without HAVE_LOCALE_T */
3052 : elog(ERROR, "wcstombs_l is not available");
3053 : result = 0; /* keep compiler quiet */
3054 : #endif /* HAVE_LOCALE_T */
3055 : }
3056 :
4369 tgl 3057 GIC 136185 : return result;
3058 : }
3059 :
3060 : /*
3061 : * char2wchar --- convert multibyte characters to wide characters
3062 : *
3063 : * This has almost the API of mbstowcs_l(), except that *from need not be
3064 : * null-terminated; instead, the number of input bytes is specified as
3065 : * fromlen. Also, we ereport() rather than returning -1 for invalid
3066 : * input encoding. tolen is the maximum number of wchar_t's to store at *to.
3067 : * The output will be zero-terminated iff there is room.
3068 : */
4369 tgl 3069 ECB : size_t
4369 tgl 3070 GIC 137763 : char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen,
3071 : pg_locale_t locale)
3072 : {
3073 : size_t result;
4369 tgl 3074 ECB :
2208 peter_e 3075 CBC 137763 : Assert(!locale || locale->provider == COLLPROVIDER_LIBC);
3076 :
4369 tgl 3077 137763 : if (tolen == 0)
4369 tgl 3078 LBC 0 : return 0;
3079 :
3080 : #ifdef WIN32
3081 : /* See WIN32 "Unicode" comment above */
4369 tgl 3082 EUB : if (GetDatabaseEncoding() == PG_UTF8)
3083 : {
4369 tgl 3084 ECB : /* Win32 API does not work for zero-length input */
3085 : if (fromlen == 0)
3086 : result = 0;
3087 : else
3088 : {
4369 tgl 3089 EUB : result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
3090 : /* A zero return is failure */
3091 : if (result == 0)
3092 : result = -1;
3093 : }
3094 :
3095 : if (result != -1)
3096 : {
3097 : Assert(result < tolen);
3098 : /* Append trailing null wchar (MultiByteToWideChar() does not) */
3099 : to[result] = 0;
3100 : }
3101 : }
3102 : else
2118 3103 : #endif /* WIN32 */
3104 : {
3105 : /* mbstowcs requires ending '\0' */
4369 tgl 3106 GIC 137763 : char *str = pnstrdup(from, fromlen);
3107 :
3108 137763 : if (locale == (pg_locale_t) 0)
4369 tgl 3109 EUB : {
3110 : /* Use mbstowcs directly for the default locale */
4369 tgl 3111 GIC 137571 : result = mbstowcs(to, str, tolen);
3112 : }
4369 tgl 3113 EUB : else
3114 : {
3115 : #ifdef HAVE_LOCALE_T
3116 : #ifdef HAVE_MBSTOWCS_L
3117 : /* Use mbstowcs_l for nondefault locales */
3118 : result = mbstowcs_l(to, str, tolen, locale->info.lt);
3873 3119 : #else /* !HAVE_MBSTOWCS_L */
4369 3120 : /* We have to temporarily set the locale as current ... ugh */
2208 peter_e 3121 GIC 192 : locale_t save_locale = uselocale(locale->info.lt);
3122 :
4369 tgl 3123 GBC 192 : result = mbstowcs(to, str, tolen);
4369 tgl 3124 EUB :
4369 tgl 3125 GIC 192 : uselocale(save_locale);
2118 tgl 3126 EUB : #endif /* HAVE_MBSTOWCS_L */
3127 : #else /* !HAVE_LOCALE_T */
4369 3128 : /* Can't have locale != 0 without HAVE_LOCALE_T */
3129 : elog(ERROR, "mbstowcs_l is not available");
3130 : result = 0; /* keep compiler quiet */
3131 : #endif /* HAVE_LOCALE_T */
3132 : }
3133 :
4369 tgl 3134 GBC 137763 : pfree(str);
3135 : }
3136 :
4369 tgl 3137 GIC 137763 : if (result == -1)
3138 : {
3139 : /*
3140 : * Invalid multibyte character encountered. We try to give a useful
4369 tgl 3141 ECB : * error message by letting pg_verifymbstr check the string. But it's
3142 : * possible that the string is OK to us, and not OK to mbstowcs ---
3143 : * this suggests that the LC_CTYPE locale is different from the
3144 : * database encoding. Give a generic error message if pg_verifymbstr
3145 : * can't find anything wrong.
3146 : */
4369 tgl 3147 UIC 0 : pg_verifymbstr(from, fromlen, false); /* might not return */
3148 : /* but if it does ... */
3149 0 : ereport(ERROR,
3150 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
4369 tgl 3151 ECB : errmsg("invalid multibyte character for locale"),
3152 : errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
3153 : }
3154 :
4369 tgl 3155 CBC 137763 : return result;
3156 : }
|