Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * mbutils.c
4 : * This file contains functions for encoding conversion.
5 : *
6 : * The string-conversion functions in this file share some API quirks.
7 : * Note the following:
8 : *
9 : * The functions return a palloc'd, null-terminated string if conversion
10 : * is required. However, if no conversion is performed, the given source
11 : * string pointer is returned as-is.
12 : *
13 : * Although the presence of a length argument means that callers can pass
14 : * non-null-terminated strings, care is required because the same string
15 : * will be passed back if no conversion occurs. Such callers *must* check
16 : * whether result == src and handle that case differently.
17 : *
18 : * If the source and destination encodings are the same, the source string
19 : * is returned without any verification; it's assumed to be valid data.
20 : * If that might not be the case, the caller is responsible for validating
21 : * the string using a separate call to pg_verify_mbstr(). Whenever the
22 : * source and destination encodings are different, the functions ensure that
23 : * the result is validly encoded according to the destination encoding.
24 : *
25 : *
26 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : *
30 : * IDENTIFICATION
31 : * src/backend/utils/mb/mbutils.c
32 : *
33 : *-------------------------------------------------------------------------
34 : */
35 : #include "postgres.h"
36 :
37 : #include "access/xact.h"
38 : #include "catalog/namespace.h"
39 : #include "mb/pg_wchar.h"
40 : #include "utils/builtins.h"
41 : #include "utils/memutils.h"
42 : #include "utils/syscache.h"
43 : #include "varatt.h"
44 :
45 : /*
46 : * We maintain a simple linked list caching the fmgr lookup info for the
47 : * currently selected conversion functions, as well as any that have been
48 : * selected previously in the current session. (We remember previous
49 : * settings because we must be able to restore a previous setting during
50 : * transaction rollback, without doing any fresh catalog accesses.)
51 : *
52 : * Since we'll never release this data, we just keep it in TopMemoryContext.
53 : */
54 : typedef struct ConvProcInfo
55 : {
56 : int s_encoding; /* server (database) encoding ID this entry was built for */
57 : int c_encoding; /* client encoding ID this entry was built for */
58 : FmgrInfo to_server_info; /* fmgr lookup info: client -> server conversion proc */
59 : FmgrInfo to_client_info; /* fmgr lookup info: server -> client conversion proc */
60 : } ConvProcInfo;
61 :
62 : static List *ConvProcList = NIL; /* List of ConvProcInfo; new entries are pushed at the head */
63 :
64 : /*
65 : * These variables point to the currently active conversion functions,
66 : * or are NULL when no conversion is needed.
67 : */
68 : static FmgrInfo *ToServerConvProc = NULL;
69 : static FmgrInfo *ToClientConvProc = NULL;
70 :
71 : /*
72 : * This variable stores the conversion function to convert from UTF-8
73 : * to the server encoding. It's NULL if the server encoding *is* UTF-8,
74 : * or if we lack a conversion function for this.
75 : */
76 : static FmgrInfo *Utf8ToServerConvProc = NULL;
77 :
78 : /*
79 : * These variables track the currently-selected encodings.
80 : */
81 : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
82 : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
83 : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
84 :
85 : /*
86 : * During backend startup we can't set client encoding because we (a)
87 : * can't look up the conversion functions, and (b) may not know the database
88 : * encoding yet either. So SetClientEncoding() just accepts anything and
89 : * remembers it for InitializeClientEncoding() to apply later.
90 : */
91 : static bool backend_startup_complete = false; /* set true by InitializeClientEncoding() */
92 : static int pending_client_encoding = PG_SQL_ASCII; /* saved by SetClientEncoding() while startup is incomplete */
93 :
94 :
95 : /* Internal functions */
96 : static char *perform_default_encoding_conversion(const char *src,
97 : int len, bool is_client_to_server);
98 : static int cliplen(const char *str, int len, int limit);
99 :
100 :
101 : /*
102 : * Prepare for a future call to SetClientEncoding. Success should mean
103 : * that SetClientEncoding is guaranteed to succeed for this encoding request.
104 : *
105 : * (But note that success before backend_startup_complete does not guarantee
106 : * success after ...)
107 : *
108 : * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
109 : */
110 : int
4385 tgl 111 GIC 26562 : PrepareClientEncoding(int encoding)
7907 ishii 112 ECB : {
113 : int current_server_encoding;
114 : ListCell *lc;
115 :
7885 ishii 116 GIC 26562 : if (!PG_VALID_FE_ENCODING(encoding))
6297 neilc 117 LBC 0 : return -1;
7907 ishii 118 EUB :
119 : /* Can't do anything during startup, per notes above */
7287 tgl 120 GIC 26562 : if (!backend_startup_complete)
7570 ishii 121 CBC 12948 : return 0;
7885 ishii 122 ECB :
7287 tgl 123 GIC 13614 : current_server_encoding = GetDatabaseEncoding();
7287 tgl 124 ECB :
125 : /*
126 : * Check for cases that require no conversion function.
127 : */
7287 tgl 128 GIC 13614 : if (current_server_encoding == encoding ||
6297 neilc 129 CBC 2601 : current_server_encoding == PG_SQL_ASCII ||
6297 neilc 130 ECB : encoding == PG_SQL_ASCII)
7287 tgl 131 GIC 13610 : return 0;
7885 ishii 132 ECB :
5120 tgl 133 GIC 4 : if (IsTransactionState())
5120 tgl 134 ECB : {
135 : /*
136 : * If we're in a live transaction, it's safe to access the catalogs,
137 : * so look up the functions. We repeat the lookup even if the info is
138 : * already cached, so that we can react to changes in the contents of
139 : * pg_conversion.
140 : */
141 : Oid to_server_proc,
142 : to_client_proc;
143 : ConvProcInfo *convinfo;
144 : MemoryContext oldcontext;
145 :
5120 tgl 146 GIC 4 : to_server_proc = FindDefaultConversionProc(encoding,
5120 tgl 147 ECB : current_server_encoding);
5120 tgl 148 GIC 4 : if (!OidIsValid(to_server_proc))
5120 tgl 149 LBC 0 : return -1;
5120 tgl 150 GBC 4 : to_client_proc = FindDefaultConversionProc(current_server_encoding,
5120 tgl 151 ECB : encoding);
5120 tgl 152 GIC 4 : if (!OidIsValid(to_client_proc))
5120 tgl 153 LBC 0 : return -1;
7287 tgl 154 EUB :
155 : /*
156 : * Load the fmgr info into TopMemoryContext (could still fail here)
157 : */
5120 tgl 158 GIC 4 : convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
5120 tgl 159 ECB : sizeof(ConvProcInfo));
5120 tgl 160 GIC 4 : convinfo->s_encoding = current_server_encoding;
5120 tgl 161 CBC 4 : convinfo->c_encoding = encoding;
162 4 : fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
5120 tgl 163 ECB : TopMemoryContext);
5120 tgl 164 GIC 4 : fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
5120 tgl 165 ECB : TopMemoryContext);
166 :
167 : /* Attach new info to head of list */
5120 tgl 168 GIC 4 : oldcontext = MemoryContextSwitchTo(TopMemoryContext);
5120 tgl 169 CBC 4 : ConvProcList = lcons(convinfo, ConvProcList);
170 4 : MemoryContextSwitchTo(oldcontext);
7570 ishii 171 ECB :
172 : /*
173 : * We cannot yet remove any older entry for the same encoding pair,
174 : * since it could still be in use. SetClientEncoding will clean up.
175 : */
176 :
5120 tgl 177 GIC 4 : return 0; /* success */
6296 neilc 178 ECB : }
179 : else
180 : {
181 : /*
182 : * If we're not in a live transaction, the only thing we can do is
183 : * restore a previous setting using the cache. This covers all
184 : * transaction-rollback cases. The only case it might not work for is
185 : * trying to change client_encoding on the fly by editing
186 : * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
187 : * thing to do anyway.
188 : */
5120 tgl 189 UIC 0 : foreach(lc, ConvProcList)
5120 tgl 190 EUB : {
5120 tgl 191 UIC 0 : ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
7543 ishii 192 EUB :
5120 tgl 193 UIC 0 : if (oldinfo->s_encoding == current_server_encoding &&
5120 tgl 194 UBC 0 : oldinfo->c_encoding == encoding)
195 0 : return 0;
5120 tgl 196 EUB : }
197 :
5120 tgl 198 UIC 0 : return -1; /* it's not cached, so fail */
5120 tgl 199 EUB : }
200 : }
201 :
202 : /*
203 : * Set the active client encoding and set up the conversion-function pointers.
204 : * PrepareClientEncoding should have been called previously for this encoding.
205 : *
206 : * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
207 : */
208 : int
4385 tgl 209 GIC 23873 : SetClientEncoding(int encoding)
4385 tgl 210 ECB : {
211 : int current_server_encoding;
212 : bool found;
213 : ListCell *lc;
214 :
4385 tgl 215 GIC 23873 : if (!PG_VALID_FE_ENCODING(encoding))
4385 tgl 216 LBC 0 : return -1;
4385 tgl 217 EUB :
218 : /* Can't do anything during startup, per notes above */
4385 tgl 219 GIC 23873 : if (!backend_startup_complete)
4385 tgl 220 ECB : {
4385 tgl 221 GIC 11559 : pending_client_encoding = encoding;
4385 tgl 222 CBC 11559 : return 0;
4385 tgl 223 ECB : }
224 :
4385 tgl 225 GIC 12314 : current_server_encoding = GetDatabaseEncoding();
4385 tgl 226 ECB :
227 : /*
228 : * Check for cases that require no conversion function.
229 : */
4385 tgl 230 GIC 12314 : if (current_server_encoding == encoding ||
4385 tgl 231 CBC 1303 : current_server_encoding == PG_SQL_ASCII ||
4385 tgl 232 ECB : encoding == PG_SQL_ASCII)
233 : {
4385 tgl 234 GIC 12310 : ClientEncoding = &pg_enc2name_tbl[encoding];
4385 tgl 235 CBC 12310 : ToServerConvProc = NULL;
236 12310 : ToClientConvProc = NULL;
237 12310 : return 0;
4385 tgl 238 ECB : }
239 :
240 : /*
241 : * Search the cache for the entry previously prepared by
242 : * PrepareClientEncoding; if there isn't one, we lose. While at it,
243 : * release any duplicate entries so that repeated Prepare/Set cycles don't
244 : * leak memory.
245 : */
4385 tgl 246 GIC 4 : found = false;
1364 tgl 247 CBC 8 : foreach(lc, ConvProcList)
4385 tgl 248 ECB : {
4385 tgl 249 GIC 4 : ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
4385 tgl 250 ECB :
4385 tgl 251 GIC 4 : if (convinfo->s_encoding == current_server_encoding &&
4385 tgl 252 CBC 4 : convinfo->c_encoding == encoding)
4385 tgl 253 ECB : {
4385 tgl 254 GIC 4 : if (!found)
4385 tgl 255 ECB : {
256 : /* Found newest entry, so set up */
4385 tgl 257 GIC 4 : ClientEncoding = &pg_enc2name_tbl[encoding];
4385 tgl 258 CBC 4 : ToServerConvProc = &convinfo->to_server_info;
259 4 : ToClientConvProc = &convinfo->to_client_info;
260 4 : found = true;
4385 tgl 261 ECB : }
262 : else
263 : {
264 : /* Duplicate entry, release it */
1364 tgl 265 UIC 0 : ConvProcList = foreach_delete_current(ConvProcList, lc);
4385 tgl 266 UBC 0 : pfree(convinfo);
4385 tgl 267 EUB : }
268 : }
269 : }
270 :
4385 tgl 271 GIC 4 : if (found)
4385 tgl 272 CBC 4 : return 0; /* success */
4385 tgl 273 ECB : else
4385 tgl 274 UIC 0 : return -1; /* it's not cached, so fail */
4385 tgl 275 EUB : }
276 :
277 : /*
278 : * Initialize client encoding conversions.
279 : * Called from InitPostgres() once during backend startup.
280 : */
281 : void
7287 tgl 282 GIC 10866 : InitializeClientEncoding(void)
7354 ishii 283 ECB : {
284 : int current_server_encoding;
285 :
7287 tgl 286 GIC 10866 : Assert(!backend_startup_complete);
7287 tgl 287 CBC 10866 : backend_startup_complete = true;
7287 tgl 288 ECB :
4385 tgl 289 GIC 21732 : if (PrepareClientEncoding(pending_client_encoding) < 0 ||
4385 tgl 290 CBC 10866 : SetClientEncoding(pending_client_encoding) < 0)
7354 ishii 291 ECB : {
292 : /*
293 : * Oops, the requested conversion is not available. We couldn't fail
294 : * before, but we can now.
295 : */
7198 tgl 296 UIC 0 : ereport(FATAL,
7198 tgl 297 EUB : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
298 : errmsg("conversion between %s and %s is not supported",
299 : pg_enc2name_tbl[pending_client_encoding].name,
300 : GetDatabaseEncodingName())));
301 : }
302 :
303 : /*
304 : * Also look up the UTF8-to-server conversion function if needed. Since
305 : * the server encoding is fixed within any one backend process, we don't
306 : * have to do this more than once.
307 : */
1129 tgl 308 GIC 10866 : current_server_encoding = GetDatabaseEncoding();
1129 tgl 309 CBC 10866 : if (current_server_encoding != PG_UTF8 &&
1129 tgl 310 ECB : current_server_encoding != PG_SQL_ASCII)
311 : {
312 : Oid utf8_to_server_proc;
313 :
1129 tgl 314 GIC 101 : Assert(IsTransactionState());
1129 tgl 315 ECB : utf8_to_server_proc =
1129 tgl 316 GIC 101 : FindDefaultConversionProc(PG_UTF8,
1129 tgl 317 ECB : current_server_encoding);
318 : /* If there's no such conversion, just leave the pointer as NULL */
1129 tgl 319 GIC 101 : if (OidIsValid(utf8_to_server_proc))
1129 tgl 320 ECB : {
321 : FmgrInfo *finfo;
322 :
1129 tgl 323 GIC 101 : finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
1129 tgl 324 ECB : sizeof(FmgrInfo));
1129 tgl 325 GIC 101 : fmgr_info_cxt(utf8_to_server_proc, finfo,
1129 tgl 326 ECB : TopMemoryContext);
327 : /* Set Utf8ToServerConvProc only after data is fully valid */
1129 tgl 328 GIC 101 : Utf8ToServerConvProc = finfo;
1129 tgl 329 ECB : }
330 : }
7354 ishii 331 GIC 10866 : }
7354 ishii 332 ECB :
333 : /*
334 : * returns the current client encoding
335 : */
336 : int
7870 tgl 337 GIC 4449 : pg_get_client_encoding(void)
9025 scrappy 338 ECB : {
6297 neilc 339 GIC 4449 : return ClientEncoding->encoding;
7885 ishii 340 ECB : }
341 :
342 : /*
343 : * returns the current client encoding name
344 : */
345 : const char *
7870 tgl 346 UIC 0 : pg_get_client_encoding_name(void)
7885 ishii 347 EUB : {
6297 neilc 348 UIC 0 : return ClientEncoding->name;
9025 scrappy 349 EUB : }
350 :
351 : /*
352 : * Convert src string to another encoding (general case).
353 : *
354 : * See the notes about string conversion functions at the top of this file.
355 : */
356 : unsigned char *
7870 tgl 357 GIC 1417 : pg_do_encoding_conversion(unsigned char *src, int len,
7570 ishii 358 ECB : int src_encoding, int dest_encoding)
359 : {
360 : unsigned char *result;
361 : Oid proc;
362 :
3332 tgl 363 GIC 1417 : if (len <= 0)
3332 tgl 364 CBC 15 : return src; /* empty string is always valid */
7522 bruce 365 ECB :
7570 ishii 366 GIC 1402 : if (src_encoding == dest_encoding)
3332 tgl 367 CBC 896 : return src; /* no conversion required, assume valid */
8986 bruce 368 ECB :
3332 tgl 369 GIC 506 : if (dest_encoding == PG_SQL_ASCII)
3332 tgl 370 LBC 0 : return src; /* any string is valid in SQL_ASCII */
7563 ishii 371 EUB :
3332 tgl 372 GIC 506 : if (src_encoding == PG_SQL_ASCII)
3332 tgl 373 ECB : {
374 : /* No conversion is possible, but we must validate the result */
3332 tgl 375 GIC 8 : (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
7439 ishii 376 CBC 8 : return src;
3332 tgl 377 ECB : }
378 :
3332 tgl 379 GIC 498 : if (!IsTransactionState()) /* shouldn't happen */
3332 tgl 380 LBC 0 : elog(ERROR, "cannot perform encoding conversion outside a transaction");
7439 ishii 381 EUB :
7570 ishii 382 GIC 498 : proc = FindDefaultConversionProc(src_encoding, dest_encoding);
7570 ishii 383 CBC 498 : if (!OidIsValid(proc))
3332 tgl 384 LBC 0 : ereport(ERROR,
7198 tgl 385 EUB : (errcode(ERRCODE_UNDEFINED_FUNCTION),
386 : errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
387 : pg_encoding_to_char(src_encoding),
388 : pg_encoding_to_char(dest_encoding))));
389 :
390 : /*
391 : * Allocate space for conversion result, being wary of integer overflow.
392 : *
393 : * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
394 : * required space, so it might exceed MaxAllocSize even though the result
395 : * would actually fit. We do not want to hand back a result string that
396 : * exceeds MaxAllocSize, because callers might not cope gracefully --- but
397 : * if we just allocate more than that, and don't use it, that's fine.
398 : */
1284 tgl 399 GIC 498 : if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
5795 tgl 400 LBC 0 : ereport(ERROR,
5795 tgl 401 EUB : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
402 : errmsg("out of memory"),
403 : errdetail("String of %d bytes is too long for encoding conversion.",
404 : len)));
405 :
406 : result = (unsigned char *)
1284 tgl 407 GIC 498 : MemoryContextAllocHuge(CurrentMemoryContext,
1284 tgl 408 CBC 498 : (Size) len * MAX_CONVERSION_GROWTH + 1);
7570 ishii 409 ECB :
738 heikki.linnakangas 410 GIC 498 : (void) OidFunctionCall6(proc,
738 heikki.linnakangas 411 ECB : Int32GetDatum(src_encoding),
412 : Int32GetDatum(dest_encoding),
413 : CStringGetDatum((char *) src),
414 : CStringGetDatum((char *) result),
415 : Int32GetDatum(len),
416 : BoolGetDatum(false));
417 :
418 : /*
419 : * If the result is large, it's worth repalloc'ing to release any extra
420 : * space we asked for. The cutoff here is somewhat arbitrary, but we
421 : * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
422 : */
1284 tgl 423 GIC 498 : if (len > 1000000)
1284 tgl 424 ECB : {
1284 tgl 425 UIC 0 : Size resultlen = strlen((char *) result);
1284 tgl 426 EUB :
1284 tgl 427 UIC 0 : if (resultlen >= MaxAllocSize)
1284 tgl 428 UBC 0 : ereport(ERROR,
1284 tgl 429 EUB : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
430 : errmsg("out of memory"),
431 : errdetail("String of %d bytes is too long for encoding conversion.",
432 : len)));
433 :
1284 tgl 434 UIC 0 : result = (unsigned char *) repalloc(result, resultlen + 1);
1284 tgl 435 EUB : }
436 :
8611 tgl 437 GIC 498 : return result;
9025 scrappy 438 ECB : }
439 :
440 : /*
441 : * Convert src string to another encoding.
442 : *
443 : * This function has a different API than the other conversion functions.
444 : * The caller should've looked up the conversion function using
445 : * FindDefaultConversionProc(). Unlike the other functions, the converted
446 : * result is not palloc'd. It is written to the caller-supplied buffer
447 : * instead.
448 : *
449 : * src_encoding - encoding to convert from
450 : * dest_encoding - encoding to convert to
451 : * src, srclen - input buffer and its length in bytes
452 : * dest, destlen - destination buffer and its size in bytes
453 : *
454 : * The output is null-terminated.
455 : *
456 : * If destlen < srclen * MAX_CONVERSION_LENGTH + 1, the converted output
457 : * wouldn't necessarily fit in the output buffer, and the function will not
458 : * convert the whole input.
459 : *
460 : * TODO: The conversion function interface is not great. Firstly, it
461 : * would be nice to pass through the destination buffer size to the
462 : * conversion function, so that if you pass a shorter destination buffer, it
463 : * could still continue to fill up the whole buffer. Currently, we have to
464 : * assume worst case expansion and stop the conversion short, even if there
465 : * is in fact space left in the destination buffer. Secondly, it would be
466 : * nice to return the number of bytes written to the caller, to avoid a call
467 : * to strlen().
468 : */
469 : int
738 heikki.linnakangas 470 GIC 2853 : pg_do_encoding_conversion_buf(Oid proc,
738 heikki.linnakangas 471 ECB : int src_encoding,
472 : int dest_encoding,
473 : unsigned char *src, int srclen,
474 : unsigned char *dest, int destlen,
475 : bool noError)
476 : {
477 : Datum result;
478 :
479 : /*
480 : * If the destination buffer is not large enough to hold the result in the
481 : * worst case, limit the input size passed to the conversion function.
482 : */
738 heikki.linnakangas 483 GIC 2853 : if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
738 heikki.linnakangas 484 CBC 2853 : srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
738 heikki.linnakangas 485 ECB :
738 heikki.linnakangas 486 GIC 2853 : result = OidFunctionCall6(proc,
738 heikki.linnakangas 487 ECB : Int32GetDatum(src_encoding),
488 : Int32GetDatum(dest_encoding),
489 : CStringGetDatum((char *) src),
490 : CStringGetDatum((char *) dest),
491 : Int32GetDatum(srclen),
492 : BoolGetDatum(noError));
738 heikki.linnakangas 493 GIC 1683 : return DatumGetInt32(result);
738 heikki.linnakangas 494 ECB : }
495 :
496 : /*
497 : * Convert string to encoding encoding_name. The source
498 : * encoding is the DB encoding.
499 : *
500 : * BYTEA convert_to(TEXT string, NAME encoding_name) */
501 : Datum
5682 andrew 502 GIC 24 : pg_convert_to(PG_FUNCTION_ARGS)
7907 ishii 503 ECB : {
7522 bruce 504 GIC 24 : Datum string = PG_GETARG_DATUM(0);
7522 bruce 505 CBC 24 : Datum dest_encoding_name = PG_GETARG_DATUM(1);
5569 tgl 506 24 : Datum src_encoding_name = DirectFunctionCall1(namein,
2118 tgl 507 ECB : CStringGetDatum(DatabaseEncoding->name));
508 : Datum result;
509 :
510 : /*
511 : * pg_convert expects a bytea as its first argument. We're passing it a
512 : * text argument here, relying on the fact that they are both in fact
513 : * varlena types, and thus structurally identical.
514 : */
5569 tgl 515 GIC 24 : result = DirectFunctionCall3(pg_convert, string,
5569 tgl 516 ECB : src_encoding_name, dest_encoding_name);
517 :
5475 tgl 518 GIC 24 : PG_RETURN_DATUM(result);
5682 andrew 519 ECB : }
520 :
521 : /*
522 : * Convert string from encoding encoding_name. The destination
523 : * encoding is the DB encoding.
524 : *
525 : * TEXT convert_from(BYTEA string, NAME encoding_name) */
526 : Datum
5682 andrew 527 GIC 281 : pg_convert_from(PG_FUNCTION_ARGS)
5682 andrew 528 ECB : {
5682 andrew 529 GIC 281 : Datum string = PG_GETARG_DATUM(0);
5682 andrew 530 CBC 281 : Datum src_encoding_name = PG_GETARG_DATUM(1);
5569 tgl 531 281 : Datum dest_encoding_name = DirectFunctionCall1(namein,
2118 tgl 532 ECB : CStringGetDatum(DatabaseEncoding->name));
533 : Datum result;
534 :
5569 tgl 535 GIC 281 : result = DirectFunctionCall3(pg_convert, string,
5569 tgl 536 ECB : src_encoding_name, dest_encoding_name);
537 :
538 : /*
539 : * pg_convert returns a bytea, which we in turn return as text, relying on
540 : * the fact that they are both in fact varlena types, and thus
541 : * structurally identical. Although not all bytea values are valid text,
542 : * in this case it will be because we've told pg_convert to return one
543 : * that is valid as text in the current database encoding.
544 : */
5475 tgl 545 GIC 278 : PG_RETURN_DATUM(result);
7907 ishii 546 ECB : }
547 :
548 : /*
549 : * Convert string between two arbitrary encodings.
550 : *
551 : * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
552 : */
553 : Datum
5682 andrew 554 GIC 689 : pg_convert(PG_FUNCTION_ARGS)
7907 ishii 555 ECB : {
4510 itagaki.takahiro 556 GIC 689 : bytea *string = PG_GETARG_BYTEA_PP(0);
7836 bruce 557 CBC 689 : char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
558 689 : int src_encoding = pg_char_to_encoding(src_encoding_name);
559 689 : char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
560 689 : int dest_encoding = pg_char_to_encoding(dest_encoding_name);
4510 itagaki.takahiro 561 ECB : const char *src_str;
562 : char *dest_str;
563 : bytea *retval;
564 : int len;
565 :
7907 ishii 566 GIC 689 : if (src_encoding < 0)
7198 tgl 567 LBC 0 : ereport(ERROR,
7198 tgl 568 EUB : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
569 : errmsg("invalid source encoding name \"%s\"",
570 : src_encoding_name)));
7907 ishii 571 GIC 689 : if (dest_encoding < 0)
7198 tgl 572 LBC 0 : ereport(ERROR,
7198 tgl 573 EUB : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
574 : errmsg("invalid destination encoding name \"%s\"",
575 : dest_encoding_name)));
576 :
577 : /* make sure that source string is valid */
4510 itagaki.takahiro 578 GIC 689 : len = VARSIZE_ANY_EXHDR(string);
4510 itagaki.takahiro 579 CBC 689 : src_str = VARDATA_ANY(string);
801 heikki.linnakangas 580 689 : (void) pg_verify_mbstr(src_encoding, src_str, len, false);
7811 ishii 581 ECB :
582 : /* perform conversion */
1531 peter 583 GIC 686 : dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
3332 tgl 584 ECB : len,
585 : src_encoding,
586 : dest_encoding);
587 :
588 : /* update len if conversion actually happened */
4510 itagaki.takahiro 589 GIC 686 : if (dest_str != src_str)
4510 itagaki.takahiro 590 CBC 384 : len = strlen(dest_str);
7907 ishii 591 ECB :
592 : /*
593 : * build bytea data type structure.
594 : */
4510 itagaki.takahiro 595 GIC 686 : retval = (bytea *) palloc(len + VARHDRSZ);
4510 itagaki.takahiro 596 CBC 686 : SET_VARSIZE(retval, len + VARHDRSZ);
597 686 : memcpy(VARDATA(retval), dest_str, len);
4510 itagaki.takahiro 598 ECB :
4510 itagaki.takahiro 599 GIC 686 : if (dest_str != src_str)
4510 itagaki.takahiro 600 CBC 384 : pfree(dest_str);
7907 ishii 601 ECB :
602 : /* free memory if allocated by the toaster */
7907 ishii 603 GIC 686 : PG_FREE_IF_COPY(string, 0);
7907 ishii 604 ECB :
5682 andrew 605 GIC 686 : PG_RETURN_BYTEA_P(retval);
5682 andrew 606 ECB : }
607 :
608 : /*
609 : * get the length of the string considered as text in the specified
610 : * encoding. Raises an error if the data is not valid in that
611 : * encoding.
612 : *
613 : * INT4 length (BYTEA string, NAME src_encoding_name)
614 : */
615 : Datum
5682 andrew 616 UIC 0 : length_in_encoding(PG_FUNCTION_ARGS)
5682 andrew 617 EUB : {
3332 tgl 618 UIC 0 : bytea *string = PG_GETARG_BYTEA_PP(0);
5682 andrew 619 UBC 0 : char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
620 0 : int src_encoding = pg_char_to_encoding(src_encoding_name);
3332 tgl 621 EUB : const char *src_str;
622 : int len;
623 : int retval;
624 :
5657 tgl 625 UIC 0 : if (src_encoding < 0)
5657 tgl 626 UBC 0 : ereport(ERROR,
5657 tgl 627 EUB : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
628 : errmsg("invalid encoding name \"%s\"",
629 : src_encoding_name)));
630 :
3332 tgl 631 UIC 0 : len = VARSIZE_ANY_EXHDR(string);
3332 tgl 632 UBC 0 : src_str = VARDATA_ANY(string);
3332 tgl 633 EUB :
3332 tgl 634 UIC 0 : retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
5624 bruce 635 EUB :
3332 tgl 636 UIC 0 : PG_RETURN_INT32(retval);
7907 ishii 637 EUB : }
638 :
639 : /*
640 : * Get maximum multibyte character length in the specified encoding.
641 : *
642 : * Note encoding is specified numerically, not by name as above.
643 : */
644 : Datum
5024 peter_e 645 UIC 0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
5024 peter_e 646 EUB : {
4790 bruce 647 UIC 0 : int encoding = PG_GETARG_INT32(0);
5024 peter_e 648 EUB :
5024 peter_e 649 UIC 0 : if (PG_VALID_ENCODING(encoding))
5024 tgl 650 UBC 0 : PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
5024 peter_e 651 EUB : else
5024 peter_e 652 UIC 0 : PG_RETURN_NULL();
5024 peter_e 653 EUB : }
654 :
655 : /*
656 : * Convert client encoding to server encoding.
657 : *
658 : * See the notes about string conversion functions at the top of this file.
659 : */
660 : char *
6406 tgl 661 GIC 629604 : pg_client_to_server(const char *s, int len)
4430 itagaki.takahiro 662 ECB : {
4430 itagaki.takahiro 663 GIC 629604 : return pg_any_to_server(s, len, ClientEncoding->encoding);
4430 itagaki.takahiro 664 ECB : }
665 :
666 : /*
667 : * Convert any encoding to server encoding.
668 : *
669 : * See the notes about string conversion functions at the top of this file.
670 : *
671 : * Unlike the other string conversion functions, this will apply validation
672 : * even if encoding == DatabaseEncoding->encoding. This is because this is
673 : * used to process data coming in from outside the database, and we never
674 : * want to just assume validity.
675 : */
676 : char *
4430 itagaki.takahiro 677 GIC 668076 : pg_any_to_server(const char *s, int len, int encoding)
7907 ishii 678 ECB : {
6167 tgl 679 GIC 668076 : if (len <= 0)
1418 tgl 680 CBC 36008 : return unconstify(char *, s); /* empty string is always valid */
6167 tgl 681 ECB :
4430 itagaki.takahiro 682 GIC 632068 : if (encoding == DatabaseEncoding->encoding ||
4430 itagaki.takahiro 683 ECB : encoding == PG_SQL_ASCII)
684 : {
685 : /*
686 : * No conversion is needed, but we must still validate the data.
687 : */
6167 tgl 688 GIC 631910 : (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
1531 peter 689 CBC 631909 : return unconstify(char *, s);
6167 tgl 690 ECB : }
691 :
6167 tgl 692 GIC 158 : if (DatabaseEncoding->encoding == PG_SQL_ASCII)
6167 tgl 693 ECB : {
694 : /*
695 : * No conversion is possible, but we must still validate the data,
696 : * because the client-side code might have done string escaping using
697 : * the selected client_encoding. If the client encoding is ASCII-safe
698 : * then we just do a straight validation under that encoding. For an
699 : * ASCII-unsafe encoding we have a problem: we dare not pass such data
700 : * to the parser but we have no way to convert it. We compromise by
701 : * rejecting the data if it contains any non-ASCII characters.
702 : */
4430 itagaki.takahiro 703 GIC 26 : if (PG_VALID_BE_ENCODING(encoding))
4430 itagaki.takahiro 704 CBC 26 : (void) pg_verify_mbstr(encoding, s, len, false);
6167 tgl 705 ECB : else
706 : {
707 : int i;
708 :
6167 tgl 709 UIC 0 : for (i = 0; i < len; i++)
6167 tgl 710 EUB : {
6167 tgl 711 UIC 0 : if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
6167 tgl 712 UBC 0 : ereport(ERROR,
6167 tgl 713 EUB : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
714 : errmsg("invalid byte value for encoding \"%s\": 0x%02x",
715 : pg_enc2name_tbl[PG_SQL_ASCII].name,
716 : (unsigned char) s[i])));
717 : }
718 : }
1531 peter 719 GIC 26 : return unconstify(char *, s);
6167 tgl 720 ECB : }
721 :
722 : /* Fast path if we can use cached conversion function */
3332 tgl 723 GIC 132 : if (encoding == ClientEncoding->encoding)
4430 itagaki.takahiro 724 CBC 18 : return perform_default_encoding_conversion(s, len, true);
3332 tgl 725 ECB :
726 : /* General case ... will not work outside transactions */
1531 peter 727 GIC 114 : return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
3332 tgl 728 ECB : len,
729 : encoding,
3332 tgl 730 GIC 114 : DatabaseEncoding->encoding);
7907 ishii 731 ECB : }
732 :
733 : /*
734 : * Convert server encoding to client encoding.
735 : *
736 : * See the notes about string conversion functions at the top of this file.
737 : */
738 : char *
6406 tgl 739 GIC 14303212 : pg_server_to_client(const char *s, int len)
4430 itagaki.takahiro 740 ECB : {
4430 itagaki.takahiro 741 GIC 14303212 : return pg_server_to_any(s, len, ClientEncoding->encoding);
4430 itagaki.takahiro 742 ECB : }
743 :
744 : /*
745 : * Convert server encoding to any encoding.
746 : *
747 : * See the notes about string conversion functions at the top of this file.
748 : */
749 : char *
4430 itagaki.takahiro 750 GIC 19099607 : pg_server_to_any(const char *s, int len, int encoding)
9025 scrappy 751 ECB : {
6167 tgl 752 GIC 19099607 : if (len <= 0)
1418 tgl 753 CBC 93680 : return unconstify(char *, s); /* empty string is always valid */
7907 ishii 754 ECB :
4430 itagaki.takahiro 755 GIC 19005927 : if (encoding == DatabaseEncoding->encoding ||
3332 tgl 756 ECB : encoding == PG_SQL_ASCII)
1418 tgl 757 GIC 19005777 : return unconstify(char *, s); /* assume data is valid */
6167 tgl 758 ECB :
3332 tgl 759 GIC 150 : if (DatabaseEncoding->encoding == PG_SQL_ASCII)
3332 tgl 760 ECB : {
761 : /* No conversion is possible, but we must validate the result */
3332 tgl 762 GIC 6 : (void) pg_verify_mbstr(encoding, s, len, false);
1531 peter 763 CBC 6 : return unconstify(char *, s);
3332 tgl 764 ECB : }
765 :
766 : /* Fast path if we can use cached conversion function */
3332 tgl 767 GIC 144 : if (encoding == ClientEncoding->encoding)
4430 itagaki.takahiro 768 CBC 144 : return perform_default_encoding_conversion(s, len, false);
3332 tgl 769 ECB :
770 : /* General case ... will not work outside transactions */
1531 peter 771 UIC 0 : return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
3332 tgl 772 EUB : len,
3332 tgl 773 UIC 0 : DatabaseEncoding->encoding,
3332 tgl 774 EUB : encoding);
775 : }
776 :
777 : /*
778 : * Perform default encoding conversion using cached FmgrInfo. Since
779 : * this function does not access database at all, it is safe to call
780 : * outside transactions. If the conversion has not been set up by
781 : * SetClientEncoding(), no conversion is performed.
782 : */
783 : static char *
3332 tgl 784 GIC 162 : perform_default_encoding_conversion(const char *src, int len,
3332 tgl 785 ECB : bool is_client_to_server)
786 : {
787 : char *result;
788 : int src_encoding,
789 : dest_encoding;
790 : FmgrInfo *flinfo;
791 :
7549 ishii 792 GIC 162 : if (is_client_to_server)
7549 ishii 793 ECB : {
7549 ishii 794 GIC 18 : src_encoding = ClientEncoding->encoding;
7549 ishii 795 CBC 18 : dest_encoding = DatabaseEncoding->encoding;
7463 tgl 796 18 : flinfo = ToServerConvProc;
7549 ishii 797 ECB : }
798 : else
799 : {
7549 ishii 800 GIC 144 : src_encoding = DatabaseEncoding->encoding;
7549 ishii 801 CBC 144 : dest_encoding = ClientEncoding->encoding;
7463 tgl 802 144 : flinfo = ToClientConvProc;
7549 ishii 803 ECB : }
804 :
7549 ishii 805 GIC 162 : if (flinfo == NULL)
1531 peter 806 LBC 0 : return unconstify(char *, src);
7549 ishii 807 EUB :
808 : /*
809 : * Allocate space for conversion result, being wary of integer overflow.
810 : * See comments in pg_do_encoding_conversion.
811 : */
1284 tgl 812 GIC 162 : if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
5795 tgl 813 LBC 0 : ereport(ERROR,
5795 tgl 814 EUB : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
815 : errmsg("out of memory"),
816 : errdetail("String of %d bytes is too long for encoding conversion.",
817 : len)));
818 :
819 : result = (char *)
1284 tgl 820 GIC 162 : MemoryContextAllocHuge(CurrentMemoryContext,
1284 tgl 821 CBC 162 : (Size) len * MAX_CONVERSION_GROWTH + 1);
7549 ishii 822 ECB :
738 heikki.linnakangas 823 GIC 162 : FunctionCall6(flinfo,
7549 ishii 824 ECB : Int32GetDatum(src_encoding),
825 : Int32GetDatum(dest_encoding),
826 : CStringGetDatum(src),
827 : CStringGetDatum(result),
828 : Int32GetDatum(len),
829 : BoolGetDatum(false));
830 :
831 : /*
832 : * Release extra space if there might be a lot --- see comments in
833 : * pg_do_encoding_conversion.
834 : */
1284 tgl 835 GIC 162 : if (len > 1000000)
1284 tgl 836 ECB : {
1284 tgl 837 UIC 0 : Size resultlen = strlen(result);
1284 tgl 838 EUB :
1284 tgl 839 UIC 0 : if (resultlen >= MaxAllocSize)
1284 tgl 840 UBC 0 : ereport(ERROR,
1284 tgl 841 EUB : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
842 : errmsg("out of memory"),
843 : errdetail("String of %d bytes is too long for encoding conversion.",
844 : len)));
845 :
1284 tgl 846 UIC 0 : result = (char *) repalloc(result, resultlen + 1);
1284 tgl 847 EUB : }
848 :
7549 ishii 849 GIC 162 : return result;
9025 scrappy 850 ECB : }
851 :
852 : /*
853 : * Convert a single Unicode code point into a string in the server encoding.
854 : *
855 : * The code point given by "c" is converted and stored at *s, which must
856 : * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
857 : * The output will have a trailing '\0'. Throws error if the conversion
858 : * cannot be performed.
859 : *
860 : * Note that this relies on having previously looked up any required
861 : * conversion function. That's partly for speed but mostly because the parser
862 : * may call this outside any transaction, or in an aborted transaction.
863 : */
864 : void
1129 tgl 865 GIC 300 : pg_unicode_to_server(pg_wchar c, unsigned char *s)
1129 tgl 866 ECB : {
867 : unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
868 : int c_as_utf8_len;
869 : int server_encoding;
870 :
871 : /*
872 : * Complain if invalid Unicode code point. The choice of errcode here is
873 : * debatable, but really our caller should have checked this anyway.
874 : */
1129 tgl 875 GIC 300 : if (!is_valid_unicode_codepoint(c))
1129 tgl 876 LBC 0 : ereport(ERROR,
1129 tgl 877 EUB : (errcode(ERRCODE_SYNTAX_ERROR),
878 : errmsg("invalid Unicode code point")));
879 :
880 : /* Otherwise, if it's in ASCII range, conversion is trivial */
1129 tgl 881 GIC 300 : if (c <= 0x7F)
1129 tgl 882 ECB : {
1129 tgl 883 GIC 117 : s[0] = (unsigned char) c;
1129 tgl 884 CBC 117 : s[1] = '\0';
885 300 : return;
1129 tgl 886 ECB : }
887 :
888 : /* If the server encoding is UTF-8, we just need to reformat the code */
1129 tgl 889 GIC 183 : server_encoding = GetDatabaseEncoding();
1129 tgl 890 CBC 183 : if (server_encoding == PG_UTF8)
1129 tgl 891 ECB : {
1129 tgl 892 GIC 183 : unicode_to_utf8(c, s);
1129 tgl 893 CBC 183 : s[pg_utf_mblen(s)] = '\0';
894 183 : return;
1129 tgl 895 ECB : }
896 :
897 : /* For all other cases, we must have a conversion function available */
1129 tgl 898 UIC 0 : if (Utf8ToServerConvProc == NULL)
1129 tgl 899 UBC 0 : ereport(ERROR,
1129 tgl 900 EUB : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
901 : errmsg("conversion between %s and %s is not supported",
902 : pg_enc2name_tbl[PG_UTF8].name,
903 : GetDatabaseEncodingName())));
904 :
905 : /* Construct UTF-8 source string */
1129 tgl 906 UIC 0 : unicode_to_utf8(c, c_as_utf8);
1129 tgl 907 UBC 0 : c_as_utf8_len = pg_utf_mblen(c_as_utf8);
908 0 : c_as_utf8[c_as_utf8_len] = '\0';
1129 tgl 909 EUB :
910 : /* Convert, or throw error if we can't */
738 heikki.linnakangas 911 UIC 0 : FunctionCall6(Utf8ToServerConvProc,
1129 tgl 912 EUB : Int32GetDatum(PG_UTF8),
913 : Int32GetDatum(server_encoding),
914 : CStringGetDatum((char *) c_as_utf8),
915 : CStringGetDatum((char *) s),
916 : Int32GetDatum(c_as_utf8_len),
917 : BoolGetDatum(false));
918 : }
919 :
920 : /*
921 : * Convert a single Unicode code point into a string in the server encoding.
922 : *
923 : * Same as pg_unicode_to_server(), except that we don't throw errors,
924 : * but simply return false on conversion failure.
925 : */
926 : bool
119 tgl 927 GNC 42 : pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
928 : {
929 : unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
930 : int c_as_utf8_len;
931 : int converted_len;
932 : int server_encoding;
933 :
934 : /* Fail if invalid Unicode code point */
935 42 : if (!is_valid_unicode_codepoint(c))
119 tgl 936 UNC 0 : return false;
937 :
938 : /* Otherwise, if it's in ASCII range, conversion is trivial */
119 tgl 939 GNC 42 : if (c <= 0x7F)
940 : {
941 12 : s[0] = (unsigned char) c;
942 12 : s[1] = '\0';
943 12 : return true;
944 : }
945 :
946 : /* If the server encoding is UTF-8, we just need to reformat the code */
947 30 : server_encoding = GetDatabaseEncoding();
948 30 : if (server_encoding == PG_UTF8)
949 : {
950 30 : unicode_to_utf8(c, s);
951 30 : s[pg_utf_mblen(s)] = '\0';
952 30 : return true;
953 : }
954 :
955 : /* For all other cases, we must have a conversion function available */
119 tgl 956 UNC 0 : if (Utf8ToServerConvProc == NULL)
957 0 : return false;
958 :
959 : /* Construct UTF-8 source string */
960 0 : unicode_to_utf8(c, c_as_utf8);
961 0 : c_as_utf8_len = pg_utf_mblen(c_as_utf8);
962 0 : c_as_utf8[c_as_utf8_len] = '\0';
963 :
964 : /* Convert, but without throwing error if we can't */
965 0 : converted_len = DatumGetInt32(FunctionCall6(Utf8ToServerConvProc,
966 : Int32GetDatum(PG_UTF8),
967 : Int32GetDatum(server_encoding),
968 : CStringGetDatum((char *) c_as_utf8),
969 : CStringGetDatum((char *) s),
970 : Int32GetDatum(c_as_utf8_len),
971 : BoolGetDatum(true)));
972 :
973 : /* Conversion was successful iff it consumed the whole input */
974 0 : return (converted_len == c_as_utf8_len);
975 : }
976 :
977 :
978 : /* convert a multibyte string to a wchar */
979 : int
6406 tgl 980 UIC 0 : pg_mb2wchar(const char *from, pg_wchar *to)
981 : {
2040 peter_e 982 0 : return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
983 : }
984 :
7523 peter_e 985 ECB : /* convert a multibyte string to a wchar with a limited length */
986 : int
6406 tgl 987 GIC 458799 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
988 : {
2040 peter_e 989 458799 : return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
990 : }
991 :
992 : /* same, with any encoding */
5950 tgl 993 ECB : int
5950 tgl 994 GBC 9140 : pg_encoding_mb2wchar_with_len(int encoding,
995 : const char *from, pg_wchar *to, int len)
996 : {
2040 peter_e 997 CBC 9140 : return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
998 : }
5950 tgl 999 ECB :
3931 rhaas 1000 : /* convert a wchar string to a multibyte */
1001 : int
3931 rhaas 1002 UIC 0 : pg_wchar2mb(const pg_wchar *from, char *to)
1003 : {
2040 peter_e 1004 0 : return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
3931 rhaas 1005 ECB : }
1006 :
1007 : /* convert a wchar string to a multibyte with a limited length */
1008 : int
3931 rhaas 1009 CBC 555741 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
3931 rhaas 1010 ECB : {
2040 peter_e 1011 GIC 555741 : return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
1012 : }
1013 :
3931 rhaas 1014 EUB : /* same, with any encoding */
1015 : int
3931 rhaas 1016 UIC 0 : pg_encoding_wchar2mb_with_len(int encoding,
1017 : const pg_wchar *from, char *to, int len)
3931 rhaas 1018 EUB : {
2040 peter_e 1019 UBC 0 : return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
3931 rhaas 1020 EUB : }
1021 :
1022 : /* returns the byte length of a multibyte character */
8986 bruce 1023 : int
6406 tgl 1024 GIC 111073808 : pg_mblen(const char *mbstr)
1025 : {
2040 peter_e 1026 111073808 : return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
1027 : }
1028 :
1029 : /* returns the display length of a multibyte character */
1030 : int
6406 tgl 1031 4362 : pg_dsplen(const char *mbstr)
6964 ishii 1032 EUB : {
2040 peter_e 1033 GIC 4362 : return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
1034 : }
1035 :
/*
 * Returns the length, counted in characters, of a null-terminated
 * multibyte string in the database encoding.
 */
int
pg_mbstrlen(const char *mbstr)
{
	const char *cur;
	int			nchars = 0;

	/* In a single-byte encoding the character count equals the byte count. */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* Step over one character at a time until the terminator. */
	for (cur = mbstr; *cur != '\0'; cur += pg_mblen(cur))
		nchars++;

	return nchars;
}
1053 :
/*
 * Returns the length, counted in characters, of a multibyte string that is
 * not necessarily null-terminated.  "limit" is the number of bytes to look at.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			nchars;

	/* Single-byte encoding: the byte limit is returned as the count. */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	/* Count characters until the byte budget is used up or a NUL is seen. */
	for (nchars = 0; limit > 0 && *mbstr != '\0'; nchars++)
	{
		int			charbytes = pg_mblen(mbstr);

		mbstr += charbytes;
		limit -= charbytes;
	}

	return nchars;
}
1076 :
8962 bruce 1077 EUB : /*
1078 : * returns the byte length of a multibyte string
1079 : * (not necessarily NULL terminated)
1080 : * that is no longer than limit.
1081 : * this function does not break multibyte character boundary.
8962 bruce 1082 ECB : */
1083 : int
6406 tgl 1084 CBC 214148 : pg_mbcliplen(const char *mbstr, int len, int limit)
1085 : {
5208 tgl 1086 GIC 214148 : return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
1087 : len, limit);
1088 : }
5208 tgl 1089 ECB :
1090 : /*
1091 : * pg_mbcliplen with specified encoding
1092 : */
1093 : int
5208 tgl 1094 GIC 214148 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
1095 : int len, int limit)
5208 tgl 1096 ECB : {
1097 : mblen_converter mblen_fn;
8962 bruce 1098 CBC 214148 : int clen = 0;
1099 : int l;
1100 :
7528 ishii 1101 ECB : /* optimization for single byte encoding */
5208 tgl 1102 GBC 214148 : if (pg_encoding_max_length(encoding) == 1)
7528 ishii 1103 GIC 41840 : return cliplen(mbstr, len, limit);
7528 ishii 1104 ECB :
5208 tgl 1105 GIC 172308 : mblen_fn = pg_wchar_table[encoding].mblen;
5208 tgl 1106 ECB :
8067 tgl 1107 CBC 1693040 : while (len > 0 && *mbstr)
1108 : {
5208 1109 1604605 : l = (*mblen_fn) ((const unsigned char *) mbstr);
8720 bruce 1110 GIC 1604605 : if ((clen + l) > limit)
8962 1111 41 : break;
1112 1604564 : clen += l;
8720 1113 1604564 : if (clen == limit)
8962 1114 83832 : break;
1115 1520732 : len -= l;
8962 bruce 1116 CBC 1520732 : mbstr += l;
1117 : }
6297 neilc 1118 172308 : return clen;
1119 : }
1120 :
/*
 * Similar to pg_mbcliplen, except that "limit" is a number of characters
 * rather than a number of bytes.  The return value is still a byte length.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			nbytes = 0;
	int			nchars = 0;

	/* In a single-byte encoding characters and bytes coincide. */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr != '\0')
	{
		int			charbytes = pg_mblen(mbstr);

		/* Stop before the character that would exceed the limit. */
		if (++nchars > limit)
			break;

		nbytes += charbytes;
		mbstr += charbytes;
		len -= charbytes;
	}

	return nbytes;
}
1148 :
/*
 * mbcliplen for any single-byte encoding: count bytes up to the smaller of
 * "len" and "limit", stopping early at a NUL byte.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			maxbytes = (len < limit) ? len : limit;
	int			n;

	for (n = 0; n < maxbytes && str[n] != '\0'; n++)
		;

	return n;
}
5208 tgl 1160 ECB :
9025 scrappy 1161 : void
9025 scrappy 1162 GIC 10213 : SetDatabaseEncoding(int encoding)
9025 scrappy 1163 ECB : {
7885 ishii 1164 GIC 10213 : if (!PG_VALID_BE_ENCODING(encoding))
5953 peter_e 1165 LBC 0 : elog(ERROR, "invalid database encoding: %d", encoding);
1166 :
7836 bruce 1167 CBC 10213 : DatabaseEncoding = &pg_enc2name_tbl[encoding];
7885 ishii 1168 10213 : Assert(DatabaseEncoding->encoding == encoding);
5145 alvherre 1169 10213 : }
5145 alvherre 1170 ECB :
1171 : void
3574 noah 1172 CBC 12832 : SetMessageEncoding(int encoding)
5145 alvherre 1173 ECB : {
3574 noah 1174 : /* Some calls happen before we can elog()! */
3574 noah 1175 GIC 12832 : Assert(PG_VALID_ENCODING(encoding));
5190 magnus 1176 ECB :
3574 noah 1177 GIC 12832 : MessageEncoding = &pg_enc2name_tbl[encoding];
1178 12832 : Assert(MessageEncoding->encoding == encoding);
1179 12832 : }
1180 :
1181 : #ifdef ENABLE_NLS
1182 : /*
1183 : * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
3574 noah 1184 ECB : * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
1185 : * fail for gettext-internal causes like out-of-memory.
1186 : */
1187 : static bool
3574 noah 1188 GIC 1743 : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
1189 : {
1190 1743 : bool elog_ok = (CurrentMemoryContext != NULL);
3574 noah 1191 ECB : int i;
5114 heikki.linnakangas 1192 EUB :
5098 magnus 1193 GIC 3592 : for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
5145 alvherre 1194 ECB : {
5098 magnus 1195 GIC 3592 : if (pg_enc2gettext_tbl[i].encoding == encoding)
5190 magnus 1196 ECB : {
5145 alvherre 1197 CBC 1743 : if (bind_textdomain_codeset(domainname,
3574 noah 1198 1743 : pg_enc2gettext_tbl[i].name) != NULL)
1199 1743 : return true;
3574 noah 1200 ECB :
3574 noah 1201 LBC 0 : if (elog_ok)
5145 alvherre 1202 0 : elog(LOG, "bind_textdomain_codeset failed");
1203 : else
3574 noah 1204 0 : write_stderr("bind_textdomain_codeset failed");
1205 :
5145 alvherre 1206 UIC 0 : break;
1207 : }
1208 : }
3574 noah 1209 ECB :
3574 noah 1210 UIC 0 : return false;
3574 noah 1211 ECB : }
1212 :
1213 : /*
1214 : * Bind a gettext message domain to the codeset corresponding to the database
1215 : * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
1216 : * Return the MessageEncoding implied by the new settings.
1217 : *
1218 : * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
1219 : * When that matches the database encoding, we don't need to do anything. In
1220 : * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
1221 : * database encoding, except for the C locale. (On Windows, we also permit a
1222 : * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
3574 noah 1223 EUB : * gettext to the right codeset.
1224 : *
3260 bruce 1225 ECB : * On Windows, gettext defaults to the Windows ANSI code page. This is a
3574 noah 1226 : * convenient departure for software that passes the strings to Windows ANSI
1227 : * APIs, but we don't do that. Compel gettext to use database encoding or,
1228 : * failing that, the LC_CTYPE encoding as it would on other platforms.
1229 : *
1230 : * This function is called before elog() and palloc() are usable.
1231 : */
1232 : int
3574 noah 1233 CBC 14655 : pg_bind_textdomain_codeset(const char *domainname)
1234 : {
1235 14655 : bool elog_ok = (CurrentMemoryContext != NULL);
1236 14655 : int encoding = GetDatabaseEncoding();
3574 noah 1237 ECB : int new_msgenc;
1238 :
1239 : #ifndef WIN32
3574 noah 1240 GIC 14655 : const char *ctype = setlocale(LC_CTYPE, NULL);
1241 :
1242 14655 : if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
1243 : #endif
1244 3532 : if (encoding != PG_SQL_ASCII &&
1245 1743 : raw_pg_bind_textdomain_codeset(domainname, encoding))
3574 noah 1246 CBC 1743 : return encoding;
1247 :
1248 12912 : new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
3574 noah 1249 GIC 12912 : if (new_msgenc < 0)
3574 noah 1250 UIC 0 : new_msgenc = PG_SQL_ASCII;
3574 noah 1251 ECB :
1252 : #ifdef WIN32
1253 : if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1254 : /* On failure, the old message encoding remains valid. */
1255 : return GetMessageEncoding();
1256 : #endif
1257 :
3574 noah 1258 GIC 12912 : return new_msgenc;
9025 scrappy 1259 EUB : }
3574 noah 1260 : #endif
1261 :
1262 : /*
1263 : * The database encoding, also called the server encoding, represents the
1264 : * encoding of data stored in text-like data types. Affected types include
1265 : * cstring, text, varchar, name, xml, and json.
1266 : */
1267 : int
7870 tgl 1268 GBC 23467668 : GetDatabaseEncoding(void)
1269 : {
6297 neilc 1270 GIC 23467668 : return DatabaseEncoding->encoding;
1271 : }
1272 :
1273 : const char *
7870 tgl 1274 21313 : GetDatabaseEncodingName(void)
1275 : {
6297 neilc 1276 21313 : return DatabaseEncoding->name;
1277 : }
1278 :
1279 : Datum
8335 tgl 1280 37 : getdatabaseencoding(PG_FUNCTION_ARGS)
1281 : {
7885 ishii 1282 37 : return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1283 : }
1284 :
1285 : Datum
7849 ishii 1286 UIC 0 : pg_client_encoding(PG_FUNCTION_ARGS)
1287 : {
1288 0 : return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1289 : }
1290 :
1179 tgl 1291 ECB : Datum
1179 tgl 1292 GIC 18 : PG_char_to_encoding(PG_FUNCTION_ARGS)
1179 tgl 1293 ECB : {
1179 tgl 1294 CBC 18 : Name s = PG_GETARG_NAME(0);
1295 :
1179 tgl 1296 GIC 18 : PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
1297 : }
1179 tgl 1298 ECB :
1299 : Datum
1179 tgl 1300 CBC 1658 : PG_encoding_to_char(PG_FUNCTION_ARGS)
1301 : {
1302 1658 : int32 encoding = PG_GETARG_INT32(0);
1303 1658 : const char *encoding_name = pg_encoding_to_char(encoding);
1179 tgl 1304 ECB :
1179 tgl 1305 GIC 1658 : return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
1179 tgl 1306 ECB : }
1307 :
3574 noah 1308 EUB : /*
1309 : * gettext() returns messages in this encoding. This often matches the
1310 : * database encoding, but it differs for SQL_ASCII databases, for processes
1311 : * not attached to a database, and under a database encoding lacking iconv
1312 : * support (MULE_INTERNAL).
1313 : */
1314 : int
3574 noah 1315 UIC 0 : GetMessageEncoding(void)
4922 magnus 1316 ECB : {
3574 noah 1317 UIC 0 : return MessageEncoding->encoding;
1318 : }
1319 :
1320 :
1321 : /*
1322 : * Generic character incrementer function.
1323 : *
1324 : * Not knowing anything about the properties of the encoding in use, we just
1325 : * keep incrementing the last byte until we get a validly-encoded result,
1179 tgl 1326 ECB : * or we run out of values to try. We don't bother to try incrementing
1327 : * higher-order bytes, so there's no growth in runtime for wider characters.
1328 : * (If we did try to do that, we'd need to consider the likelihood that 255
1329 : * is not a valid final byte in the encoding.)
1330 : */
1331 : static bool
1179 tgl 1332 CBC 51 : pg_generic_charinc(unsigned char *charptr, int len)
1333 : {
1334 51 : unsigned char *lastbyte = charptr + len - 1;
1335 : mbchar_verifier mbverify;
1336 :
1337 : /* We can just invoke the character verifier directly. */
801 heikki.linnakangas 1338 51 : mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
1339 :
1179 tgl 1340 51 : while (*lastbyte < (unsigned char) 255)
1341 : {
1179 tgl 1342 GIC 51 : (*lastbyte)++;
1343 51 : if ((*mbverify) (charptr, len) == len)
1179 tgl 1344 GBC 51 : return true;
1345 : }
1179 tgl 1346 EUB :
1179 tgl 1347 UIC 0 : return false;
1348 : }
1349 :
/*
 * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
 * the last byte that's not already at its maximum value.  If we can't find a
 * byte that's less than the maximum allowable value, we simply fail.  We also
 * need some special-case logic to skip regions used for surrogate pair
 * handling, as those should not occur in valid UTF-8.
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 *
 * Returns true if a byte was incremented, false otherwise.
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char lead;
	int			idx;

	/* Only lengths 1..4 are handled; reject lengths 5 and 6 for now. */
	if (length < 1 || length > 4)
		return false;

	/* Third and fourth bytes: plain continuation bytes, maximum 0xBF. */
	for (idx = length - 1; idx >= 2; idx--)
	{
		if (charptr[idx] < 0xBF)
		{
			charptr[idx]++;
			return true;
		}
	}

	/* Second byte: its maximum depends on the lead byte. */
	if (length >= 2)
	{
		unsigned char limit;

		if (*charptr == 0xED)
			limit = 0x9F;
		else if (*charptr == 0xF4)
			limit = 0x8F;
		else
			limit = 0xBF;

		if (charptr[1] < limit)
		{
			charptr[1]++;
			return true;
		}
	}

	/* Lead byte: refuse to step past the top of each length class. */
	lead = *charptr;
	if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
		return false;

	charptr[0]++;
	return true;
}
1422 :
1423 : /*
1179 tgl 1424 ECB : * EUC-JP character incrementer function.
1425 : *
1426 : * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1427 : * representing JIS X 0201 characters with the second byte ranging between
1428 : * 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1429 : * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1430 : *
1179 tgl 1431 EUB : * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1432 : * in which the last two bytes range between 0xa1 and 0xfe. The last byte
1433 : * is incremented if possible, otherwise the second-to-last byte.
1434 : *
1435 : * If the sequence starts with a value other than the above and its MSB
1436 : * is set, it must be a two-byte sequence representing JIS X 0208 characters
1437 : * with both bytes ranging between 0xa1 and 0xfe. The last byte is
1438 : * incremented if possible, otherwise the second-to-last byte.
1439 : *
1440 : * Otherwise, the sequence is a single-byte ASCII character. It is
1441 : * incremented up to 0x7f.
1442 : */
1443 : static bool
1179 tgl 1444 UBC 0 : pg_eucjp_increment(unsigned char *charptr, int length)
1445 : {
1179 tgl 1446 EUB : unsigned char c1,
1447 : c2;
1448 : int i;
1449 :
1179 tgl 1450 UIC 0 : c1 = *charptr;
1179 tgl 1451 EUB :
1179 tgl 1452 UBC 0 : switch (c1)
1453 : {
1454 0 : case SS2: /* JIS X 0201 */
1455 0 : if (length != 2)
1456 0 : return false;
1179 tgl 1457 EUB :
1179 tgl 1458 UBC 0 : c2 = charptr[1];
1179 tgl 1459 EUB :
1179 tgl 1460 UBC 0 : if (c2 >= 0xdf)
1461 0 : charptr[0] = charptr[1] = 0xa1;
1462 0 : else if (c2 < 0xa1)
1179 tgl 1463 UIC 0 : charptr[1] = 0xa1;
1179 tgl 1464 EUB : else
1179 tgl 1465 UIC 0 : charptr[1]++;
1179 tgl 1466 UBC 0 : break;
1179 tgl 1467 EUB :
1179 tgl 1468 UIC 0 : case SS3: /* JIS X 0212 */
1469 0 : if (length != 3)
1470 0 : return false;
1179 tgl 1471 ECB :
1179 tgl 1472 LBC 0 : for (i = 2; i > 0; i--)
1179 tgl 1473 EUB : {
1179 tgl 1474 LBC 0 : c2 = charptr[i];
1475 0 : if (c2 < 0xa1)
1476 : {
1179 tgl 1477 UIC 0 : charptr[i] = 0xa1;
1179 tgl 1478 LBC 0 : return true;
1479 : }
1179 tgl 1480 UIC 0 : else if (c2 < 0xfe)
1481 : {
1482 0 : charptr[i]++;
1483 0 : return true;
1484 : }
1485 : }
1486 :
1487 : /* Out of 3-byte code region */
1488 0 : return false;
1489 :
1490 0 : default:
1491 0 : if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1492 : {
1493 0 : if (length != 2)
1494 0 : return false;
1495 :
1496 0 : for (i = 1; i >= 0; i--)
1497 : {
1498 0 : c2 = charptr[i];
1499 0 : if (c2 < 0xa1)
1500 : {
1501 0 : charptr[i] = 0xa1;
1179 tgl 1502 UBC 0 : return true;
1503 : }
1179 tgl 1504 UIC 0 : else if (c2 < 0xfe)
1505 : {
1506 0 : charptr[i]++;
1507 0 : return true;
1179 tgl 1508 EUB : }
1509 : }
1510 :
1511 : /* Out of 2 byte code region */
1179 tgl 1512 UBC 0 : return false;
1179 tgl 1513 EUB : }
1514 : else
1515 : { /* ASCII, single byte */
1179 tgl 1516 UBC 0 : if (c1 > 0x7e)
1179 tgl 1517 UIC 0 : return false;
1179 tgl 1518 UBC 0 : (*charptr)++;
1179 tgl 1519 EUB : }
1179 tgl 1520 UBC 0 : break;
1179 tgl 1521 EUB : }
1522 :
1179 tgl 1523 UBC 0 : return true;
1179 tgl 1524 EUB : }
1525 :
1526 : /*
1527 : * get the character incrementer for the encoding for the current database
1528 : */
1529 : mbcharacter_incrementer
1179 tgl 1530 GBC 980 : pg_database_encoding_character_incrementer(void)
1531 : {
1179 tgl 1532 EUB : /*
1533 : * Eventually it might be best to add a field to pg_wchar_table[], but for
1534 : * now we just use a switch.
1535 : */
1179 tgl 1536 GBC 980 : switch (GetDatabaseEncoding())
1537 : {
1538 929 : case PG_UTF8:
1179 tgl 1539 GIC 929 : return pg_utf8_increment;
1179 tgl 1540 EUB :
1179 tgl 1541 UBC 0 : case PG_EUC_JP:
1179 tgl 1542 UIC 0 : return pg_eucjp_increment;
1543 :
1179 tgl 1544 GIC 51 : default:
1545 51 : return pg_generic_charinc;
1179 tgl 1546 EUB : }
1547 : }
1548 :
1549 : /*
1550 : * fetch maximum length of the encoding for the current database
1551 : */
1552 : int
1179 tgl 1553 GIC 8841119 : pg_database_encoding_max_length(void)
1179 tgl 1554 EUB : {
1179 tgl 1555 GIC 8841119 : return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1179 tgl 1556 EUB : }
1557 :
/*
 * Verify mbstr to make sure that it is validly encoded in the current
 * database encoding.  Otherwise same as pg_verify_mbstr().
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	int			enc = GetDatabaseEncoding();

	return pg_verify_mbstr(enc, mbstr, len, noError);
}
1567 :
1568 : /*
1569 : * Verify mbstr to make sure that it is validly encoded in the specified
1179 tgl 1570 EUB : * encoding.
1571 : */
1572 : bool
1179 tgl 1573 GIC 1054176 : pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
1179 tgl 1574 EUB : {
801 heikki.linnakangas 1575 : int oklen;
1576 :
801 heikki.linnakangas 1577 GIC 1054176 : Assert(PG_VALID_ENCODING(encoding));
801 heikki.linnakangas 1578 EUB :
801 heikki.linnakangas 1579 GIC 1054176 : oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
1580 1054176 : if (oklen != len)
801 heikki.linnakangas 1581 EUB : {
801 heikki.linnakangas 1582 GIC 4 : if (noError)
801 heikki.linnakangas 1583 UIC 0 : return false;
801 heikki.linnakangas 1584 GIC 4 : report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
1585 : }
1586 1054172 : return true;
1587 : }
1179 tgl 1588 ECB :
1589 : /*
1590 : * Verify mbstr to make sure that it is validly encoded in the specified
1591 : * encoding.
1592 : *
1593 : * mbstr is not necessarily zero terminated; length of mbstr is
1594 : * specified by len.
1595 : *
1596 : * If OK, return length of string in the encoding.
1597 : * If a problem is found, return -1 when noError is
1598 : * true; when noError is false, ereport() a descriptive message.
801 heikki.linnakangas 1599 EUB : *
1600 : * Note: We cannot use the faster encoding-specific mbverifystr() function
1601 : * here, because we need to count the number of characters in the string.
1179 tgl 1602 ECB : */
1603 : int
1179 tgl 1604 UIC 0 : pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1605 : {
1606 : mbchar_verifier mbverifychar;
1607 : int mb_len;
1608 :
1609 0 : Assert(PG_VALID_ENCODING(encoding));
1610 :
1179 tgl 1611 ECB : /*
1612 : * In single-byte encodings, we need only reject nulls (\0).
1613 : */
1179 tgl 1614 UIC 0 : if (pg_encoding_max_length(encoding) <= 1)
1615 : {
1616 0 : const char *nullpos = memchr(mbstr, 0, len);
1617 :
1618 0 : if (nullpos == NULL)
1619 0 : return len;
1620 0 : if (noError)
1179 tgl 1621 LBC 0 : return -1;
1179 tgl 1622 UIC 0 : report_invalid_encoding(encoding, nullpos, 1);
1179 tgl 1623 ECB : }
1624 :
1625 : /* fetch function pointer just once */
801 heikki.linnakangas 1626 UIC 0 : mbverifychar = pg_wchar_table[encoding].mbverifychar;
1627 :
1179 tgl 1628 0 : mb_len = 0;
1629 :
1630 0 : while (len > 0)
1179 tgl 1631 ECB : {
1632 : int l;
1633 :
1634 : /* fast path for ASCII-subset characters */
1179 tgl 1635 LBC 0 : if (!IS_HIGHBIT_SET(*mbstr))
1636 : {
1637 0 : if (*mbstr != '\0')
1179 tgl 1638 ECB : {
1179 tgl 1639 UIC 0 : mb_len++;
1179 tgl 1640 LBC 0 : mbstr++;
1179 tgl 1641 UBC 0 : len--;
1179 tgl 1642 LBC 0 : continue;
1643 : }
1644 0 : if (noError)
1179 tgl 1645 UIC 0 : return -1;
1646 0 : report_invalid_encoding(encoding, mbstr, len);
1647 : }
1648 :
801 heikki.linnakangas 1649 0 : l = (*mbverifychar) ((const unsigned char *) mbstr, len);
1650 :
1179 tgl 1651 0 : if (l < 0)
1652 : {
1653 0 : if (noError)
1654 0 : return -1;
1655 0 : report_invalid_encoding(encoding, mbstr, len);
1656 : }
1657 :
1658 0 : mbstr += l;
1659 0 : len -= l;
1660 0 : mb_len++;
1661 : }
1179 tgl 1662 UBC 0 : return mb_len;
1663 : }
1664 :
1665 : /*
1666 : * check_encoding_conversion_args: check arguments of a conversion function
1179 tgl 1667 EUB : *
1668 : * "expected" arguments can be either an encoding ID or -1 to indicate that
1669 : * the caller will check whether it accepts the ID.
1670 : *
1671 : * Note: the errors here are not really user-facing, so elog instead of
1672 : * ereport seems sufficient. Also, we trust that the "expected" encoding
1673 : * arguments are valid encoding IDs, but we don't trust the actuals.
1674 : */
1675 : void
1179 tgl 1676 GBC 3545 : check_encoding_conversion_args(int src_encoding,
1179 tgl 1677 EUB : int dest_encoding,
1678 : int len,
1679 : int expected_src_encoding,
1680 : int expected_dest_encoding)
1681 : {
1179 tgl 1682 GIC 3545 : if (!PG_VALID_ENCODING(src_encoding))
1179 tgl 1683 UIC 0 : elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1179 tgl 1684 GBC 3545 : if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1179 tgl 1685 UIC 0 : elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1179 tgl 1686 EUB : pg_enc2name_tbl[expected_src_encoding].name,
1687 : pg_enc2name_tbl[src_encoding].name);
1179 tgl 1688 GBC 3545 : if (!PG_VALID_ENCODING(dest_encoding))
1179 tgl 1689 UIC 0 : elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1179 tgl 1690 GIC 3545 : if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1179 tgl 1691 UIC 0 : elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1692 : pg_enc2name_tbl[expected_dest_encoding].name,
1179 tgl 1693 EUB : pg_enc2name_tbl[dest_encoding].name);
1179 tgl 1694 GIC 3545 : if (len < 0)
1179 tgl 1695 UBC 0 : elog(ERROR, "encoding conversion length must not be negative");
1179 tgl 1696 GIC 3545 : }
1179 tgl 1697 EUB :
1698 : /*
1699 : * report_invalid_encoding: complain about invalid multibyte character
1700 : *
1701 : * note: len is remaining length of string, not length of character;
1702 : * len must be greater than zero, as we always examine the first byte.
1703 : */
1704 : void
1179 tgl 1705 GIC 1471 : report_invalid_encoding(int encoding, const char *mbstr, int len)
1706 : {
1179 tgl 1707 GBC 1471 : int l = pg_encoding_mblen(encoding, mbstr);
1708 : char buf[8 * 5 + 1];
1709 1471 : char *p = buf;
1710 : int j,
1179 tgl 1711 EUB : jlimit;
1712 :
1179 tgl 1713 GBC 1471 : jlimit = Min(l, len);
1179 tgl 1714 GIC 1471 : jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1715 :
1179 tgl 1716 GBC 4559 : for (j = 0; j < jlimit; j++)
1179 tgl 1717 EUB : {
1179 tgl 1718 GBC 3088 : p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1179 tgl 1719 GIC 3088 : if (j < jlimit - 1)
1179 tgl 1720 GBC 1617 : p += sprintf(p, " ");
1721 : }
1722 :
1179 tgl 1723 GIC 1471 : ereport(ERROR,
1724 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
1725 : errmsg("invalid byte sequence for encoding \"%s\": %s",
1726 : pg_enc2name_tbl[encoding].name,
1727 : buf)));
1728 : }
1729 :
1730 : /*
1731 : * report_untranslatable_char: complain about untranslatable character
1732 : *
1733 : * note: len is remaining length of string, not length of character;
1179 tgl 1734 ECB : * len must be greater than zero, as we always examine the first byte.
1735 : */
1736 : void
1179 tgl 1737 GIC 468 : report_untranslatable_char(int src_encoding, int dest_encoding,
1738 : const char *mbstr, int len)
1739 : {
1179 tgl 1740 CBC 468 : int l = pg_encoding_mblen(src_encoding, mbstr);
1179 tgl 1741 EUB : char buf[8 * 5 + 1];
1179 tgl 1742 CBC 468 : char *p = buf;
1179 tgl 1743 EUB : int j,
1744 : jlimit;
1745 :
1179 tgl 1746 CBC 468 : jlimit = Min(l, len);
1179 tgl 1747 GBC 468 : jlimit = Min(jlimit, 8); /* prevent buffer overrun */
1179 tgl 1748 ECB :
1179 tgl 1749 GBC 1764 : for (j = 0; j < jlimit; j++)
1750 : {
1179 tgl 1751 GIC 1296 : p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
1179 tgl 1752 CBC 1296 : if (j < jlimit - 1)
1179 tgl 1753 GBC 828 : p += sprintf(p, " ");
1179 tgl 1754 ECB : }
1755 :
1179 tgl 1756 GIC 468 : ereport(ERROR,
1757 : (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
1758 : errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
1759 : buf,
1760 : pg_enc2name_tbl[src_encoding].name,
1761 : pg_enc2name_tbl[dest_encoding].name)));
1762 : }
1179 tgl 1763 ECB :
1764 :
4922 magnus 1765 : #ifdef WIN32
1766 : /*
1974 noah 1767 : * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
1768 : * string. The character length is also passed to utf16len if not
1769 : * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
1770 : * should be ASCII-only; this will function as though MessageEncoding is UTF8.
4922 magnus 1771 : */
1772 : WCHAR *
1773 : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1774 : {
1775 : int msgenc = GetMessageEncoding();
1776 : WCHAR *utf16;
1777 : int dstlen;
1778 : UINT codepage;
1779 :
1780 : if (msgenc == PG_SQL_ASCII)
1428 noah 1781 : /* No conversion is possible, and SQL_ASCII is never utf16. */
1782 : return NULL;
1783 :
1784 : codepage = pg_enc2name_tbl[msgenc].codepage;
1785 :
1786 : /*
1787 : * Use MultiByteToWideChar directly if there is a corresponding codepage,
1788 : * or double conversion through UTF8 if not. Double conversion is needed,
1789 : * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1790 : */
1791 : if (codepage != 0)
1792 : {
1793 : utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1794 : dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
4659 tgl 1795 : utf16[dstlen] = (WCHAR) 0;
1796 : }
1797 : else
4922 magnus 1798 : {
1799 : char *utf8;
1800 :
1801 : /*
1802 : * XXX pg_do_encoding_conversion() requires a transaction. In the
1803 : * absence of one, hope for the input to be valid UTF8.
2795 noah 1804 : */
1805 : if (IsTransactionState())
1806 : {
1807 : utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1808 : len,
1428 1809 : msgenc,
2795 1810 : PG_UTF8);
1811 : if (utf8 != str)
1812 : len = strlen(utf8);
1813 : }
1814 : else
1815 : utf8 = (char *) str;
1816 :
1817 : utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1818 : dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1819 : utf16[dstlen] = (WCHAR) 0;
1820 :
1821 : if (utf8 != str)
1822 : pfree(utf8);
1823 : }
1824 :
1825 : if (dstlen == 0 && len > 0)
1826 : {
1827 : pfree(utf16);
1828 : return NULL; /* error */
1829 : }
1830 :
1831 : if (utf16len)
1832 : *utf16len = dstlen;
1833 : return utf16;
1834 : }
1835 :
1836 : #endif /* WIN32 */
|