Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * scansup.c
4 : * scanner support routines used by the core lexer
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/parser/scansup.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include <ctype.h>
18 :
19 : #include "mb/pg_wchar.h"
20 : #include "parser/scansup.h"
21 :
22 :
/*
 * downcase_truncate_identifier() --- fold an unquoted identifier to lower
 * case and truncate it if it is too long.  Optionally warn of truncation.
 *
 * ident: the identifier text.  In some usages it is not null-terminated,
 *		so only the first len bytes are looked at.
 * len: number of bytes of ident to process.
 * warn: forwarded to downcase_identifier(); controls whether a truncation
 *		notice is issued.
 *
 * Returns a palloc'd string containing the adjusted identifier.
 *
 * Note: the API of this function is designed to allow for downcasing
 * transformations that increase the string length, but we don't yet
 * support that.  If you want to implement it, you'll need to fix
 * SplitIdentifierString() in utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
	char	   *folded;

	/* Identical to downcase_identifier() with truncation always enabled. */
	folded = downcase_identifier(ident, len, warn, true);

	return folded;
}
41 :
42 : /*
43 : * a workhorse for downcase_truncate_identifier
44 : */
45 : char *
46 5146766 : downcase_identifier(const char *ident, int len, bool warn, bool truncate)
47 : {
48 : char *result;
49 : int i;
50 : bool enc_is_single_byte;
51 :
6987 tgl 52 5146766 : result = palloc(len + 1);
3592 andrew 53 5146766 : enc_is_single_byte = pg_database_encoding_max_length() == 1;
54 :
55 : /*
56 : * SQL99 specifies Unicode-aware case normalization, which we don't yet
57 : * have the infrastructure for. Instead we use tolower() to provide a
58 : * locale-aware translation. However, there are some locales where this
59 : * is not right either (eg, Turkish may do strange things with 'i' and
60 : * 'I'). Our current compromise is to use tolower() for characters with
61 : * the high bit set, as long as they aren't part of a multi-byte
62 : * character, and use an ASCII-only downcasing for 7-bit characters.
63 : */
6987 tgl 64 44713542 : for (i = 0; i < len; i++)
65 : {
6797 bruce 66 39566776 : unsigned char ch = (unsigned char) ident[i];
67 :
6987 tgl 68 39566776 : if (ch >= 'A' && ch <= 'Z')
69 907543 : ch += 'a' - 'A';
3592 andrew 70 38659233 : else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
6987 tgl 71 UBC 0 : ch = tolower(ch);
6987 tgl 72 CBC 39566776 : result[i] = (char) ch;
73 : }
74 5146766 : result[i] = '\0';
75 :
2578 teodor 76 5146766 : if (i >= NAMEDATALEN && truncate)
6987 tgl 77 6 : truncate_identifier(result, i, warn);
78 :
79 5146766 : return result;
80 : }
81 :
82 :
83 : /*
84 : * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
85 : *
86 : * The given string is modified in-place, if necessary. A warning is
87 : * issued if requested.
88 : *
89 : * We require the caller to pass in the string length since this saves a
90 : * strlen() call in some common usages.
91 : */
92 : void
93 147100 : truncate_identifier(char *ident, int len, bool warn)
94 : {
95 147100 : if (len >= NAMEDATALEN)
96 : {
6797 bruce 97 7 : len = pg_mbcliplen(ident, len, NAMEDATALEN - 1);
6987 tgl 98 7 : if (warn)
99 7 : ereport(NOTICE,
100 : (errcode(ERRCODE_NAME_TOO_LONG),
101 : errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
102 : ident, len, ident)));
103 7 : ident[len] = '\0';
104 : }
105 147100 : }
106 :
/*
 * scanner_isspace() --- does the flex scanner treat this char as whitespace?
 *
 * Use this rather than isspace(), whose answer may depend on the locale,
 * whenever it matters that the result agrees with the lexer.
 *
 * Returns true for space, tab, newline, carriage return and form feed;
 * false for every other character.
 *
 * In principle similar functions might be needed for isalnum etc, but for
 * the moment only isspace seems to be required.
 */
bool
scanner_isspace(char ch)
{
	/* This must match scan.l's list of {space} characters */
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
			return true;
		default:
			return false;
	}
}
|