Age Owner TLA Line data Source code
1 : /*-----------------------------------------------------------------------
2 : * ascii.c
3 : * The PostgreSQL routine for string to ascii conversion.
4 : *
5 : * Portions Copyright (c) 1999-2023, PostgreSQL Global Development Group
6 : *
7 : * IDENTIFICATION
8 : * src/backend/utils/adt/ascii.c
9 : *
10 : *-----------------------------------------------------------------------
11 : */
12 : #include "postgres.h"
13 :
14 : #include "mb/pg_wchar.h"
15 : #include "utils/ascii.h"
16 : #include "utils/builtins.h"
17 : #include "varatt.h"
18 :
19 : static void pg_to_ascii(unsigned char *src, unsigned char *src_end,
20 : unsigned char *dest, int enc);
21 : static text *encode_to_ascii(text *data, int enc);
22 :
23 :
24 : /* ----------
25 : * to_ascii
 *
 * pg_to_ascii translates every byte in [src, src_end) into one ASCII byte
 * and stores it through dest.  Exactly one byte is written per byte read,
 * so the output has the same length as the input.
 *
 * src      first byte to convert
 * src_end  one past the last byte to convert
 * dest     where the converted bytes are stored; each input byte is read
 *          before the output byte at the same offset is written
 * enc      encoding id of the input; only PG_LATIN1, PG_LATIN2, PG_LATIN9
 *          and PG_WIN1250 are handled, any other value raises an ERROR
 *          with ERRCODE_FEATURE_NOT_SUPPORTED
 *
 * Each supported encoding supplies a lookup string "ascii" and a "range":
 * the byte value that corresponds to ascii[0].
 *
 * NOTE(review): the loop indexes ascii[*x - range] for byte values up to
 * 255, so each table needs (256 - range) entries.  The literals as shown in
 * this listing look shorter than that -- possibly runs of blanks were
 * collapsed when the listing was produced.  Verify against the pristine
 * source file before relying on them.
26 : * ----------
27 : */
28 : static void
7209 tgl 29 UIC 0 : pg_to_ascii(unsigned char *src, unsigned char *src_end, unsigned char *dest, int enc)
8282 bruce 30 EUB : {
31 : unsigned char *x;
32 : const unsigned char *ascii;
33 : int range;
34 :
35 : /*
36 : * relevant start for an encoding
 *
 * i.e. the first byte value that has an entry in the lookup string
37 : */
38 : #define RANGE_128 128
39 : #define RANGE_160 160
40 :
7885 ishii 41 UIC 0 : if (enc == PG_LATIN1)
8282 bruce 42 EUB : {
43 : /*
44 : * ISO-8859-1 <range: 160 -- 255>
45 : */
6406 tgl 46 UIC 0 : ascii = (const unsigned char *) " cL Y \"Ca -R 'u ., ?AAAAAAACEEEEIIII NOOOOOxOUUUUYTBaaaaaaaceeeeiiii nooooo/ouuuuyty";
8282 bruce 47 UBC 0 : range = RANGE_160;
8282 bruce 48 EUB : }
7885 ishii 49 UIC 0 : else if (enc == PG_LATIN2)
8282 bruce 50 EUB : {
51 : /*
52 : * ISO-8859-2 <range: 160 -- 255>
53 : */
6406 tgl 54 UIC 0 : ascii = (const unsigned char *) " A L LS \"SSTZ-ZZ a,l'ls ,sstz\"zzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTBraaaalccceeeeiiddnnoooo/ruuuuyt.";
8282 bruce 55 UBC 0 : range = RANGE_160;
8282 bruce 56 EUB : }
6684 tgl 57 UIC 0 : else if (enc == PG_LATIN9)
6684 tgl 58 EUB : {
59 : /*
60 : * ISO-8859-15 <range: 160 -- 255>
61 : */
6406 tgl 62 UIC 0 : ascii = (const unsigned char *) " cL YS sCa -R Zu .z EeY?AAAAAAACEEEEIIII NOOOOOxOUUUUYTBaaaaaaaceeeeiiii nooooo/ouuuuyty";
6684 tgl 63 UBC 0 : range = RANGE_160;
6684 tgl 64 EUB : }
7885 ishii 65 UIC 0 : else if (enc == PG_WIN1250)
8282 bruce 66 EUB : {
67 : /*
68 : * Window CP1250 <range: 128 -- 255>
69 : */
6406 tgl 70 UIC 0 : ascii = (const unsigned char *) " ' \" %S<STZZ `'\"\".-- s>stzz L A \"CS -RZ ,l'u .,as L\"lzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTBraaaalccceeeeiiddnnoooo/ruuuuyt ";
8282 bruce 71 UBC 0 : range = RANGE_128;
8282 bruce 72 EUB : }
73 : else
74 : {
/* no lookup table exists for this encoding: report it by name */
7196 tgl 75 UIC 0 : ereport(ERROR,
7196 tgl 76 EUB : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
77 : errmsg("encoding conversion from %s to ASCII not supported",
78 : pg_encoding_to_char(enc))));
79 : return; /* keep compiler quiet */
80 : }
81 :
82 : /*
83 : * Encode
 *
 * Three cases per byte: values below 128 are copied unchanged, values
 * from 128 up to (but not including) "range" have no table entry and
 * become a blank, everything else is looked up in the table.
84 : */
7312 tgl 85 UIC 0 : for (x = src; x < src_end; x++)
8282 bruce 86 EUB : {
8053 bruce 87 UIC 0 : if (*x < 128)
7209 tgl 88 UBC 0 : *dest++ = *x;
8282 bruce 89 0 : else if (*x < range)
7209 tgl 90 0 : *dest++ = ' '; /* bogus 128 to 'range' */
8282 bruce 91 EUB : else
7209 tgl 92 UIC 0 : *dest++ = ascii[*x - range];
8053 bruce 93 EUB : }
94 : }
95 :
96 : /* ----------
97 : * encode text
98 : *
99 : * The text datum is overwritten in-place, therefore this coding method
100 : * cannot support conversions that change the string length!
101 : * ----------
102 : */
103 : static text *
8282 bruce 104 UIC 0 : encode_to_ascii(text *data, int enc)
8282 bruce 105 EUB : {
2118 tgl 106 UIC 0 : pg_to_ascii((unsigned char *) VARDATA(data), /* src */
2118 tgl 107 UBC 0 : (unsigned char *) (data) + VARSIZE(data), /* src end */
108 0 : (unsigned char *) VARDATA(data), /* dest */
8053 bruce 109 EUB : enc); /* encoding */
110 :
8282 bruce 111 UIC 0 : return data;
8282 bruce 112 EUB : }
113 :
114 : /* ----------
115 : * convert to ASCII - enc is set as 'name' arg.
116 : * ----------
117 : */
118 : Datum
8282 bruce 119 UIC 0 : to_ascii_encname(PG_FUNCTION_ARGS)
8282 bruce 120 EUB : {
7188 bruce 121 UIC 0 : text *data = PG_GETARG_TEXT_P_COPY(0);
5657 tgl 122 UBC 0 : char *encname = NameStr(*PG_GETARG_NAME(1));
123 0 : int enc = pg_char_to_encoding(encname);
5657 tgl 124 EUB :
5657 tgl 125 UIC 0 : if (enc < 0)
5657 tgl 126 UBC 0 : ereport(ERROR,
5657 tgl 127 EUB : (errcode(ERRCODE_UNDEFINED_OBJECT),
128 : errmsg("%s is not a valid encoding name", encname)));
129 :
7209 tgl 130 UIC 0 : PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
8282 bruce 131 EUB : }
132 :
133 : /* ----------
134 : * convert to ASCII - enc is set as int4
135 : * ----------
136 : */
137 : Datum
8282 bruce 138 UIC 0 : to_ascii_enc(PG_FUNCTION_ARGS)
8282 bruce 139 EUB : {
7188 bruce 140 UIC 0 : text *data = PG_GETARG_TEXT_P_COPY(0);
7188 bruce 141 UBC 0 : int enc = PG_GETARG_INT32(1);
7209 tgl 142 EUB :
5657 tgl 143 UIC 0 : if (!PG_VALID_ENCODING(enc))
5657 tgl 144 UBC 0 : ereport(ERROR,
5657 tgl 145 EUB : (errcode(ERRCODE_UNDEFINED_OBJECT),
146 : errmsg("%d is not a valid encoding code", enc)));
147 :
7209 tgl 148 UIC 0 : PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
8282 bruce 149 EUB : }
150 :
151 : /* ----------
152 : * convert to ASCII - current enc is DatabaseEncoding
153 : * ----------
154 : */
155 : Datum
8282 bruce 156 UIC 0 : to_ascii_default(PG_FUNCTION_ARGS)
8282 bruce 157 EUB : {
7188 bruce 158 UIC 0 : text *data = PG_GETARG_TEXT_P_COPY(0);
7188 bruce 159 UBC 0 : int enc = GetDatabaseEncoding();
7209 tgl 160 EUB :
7209 tgl 161 UIC 0 : PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
8282 bruce 162 EUB : }
163 :
/* ----------
 * Copy a string in an arbitrary backend-safe encoding, converting it to a
 * clean ASCII string: printable ASCII (32 .. 126) and the white-space
 * characters '\n', '\r' and '\t' are kept, every other byte -- non-ASCII
 * bytes as well as control characters, DEL (127) included -- is replaced
 * with '?'.  Otherwise the behavior is identical to strlcpy(), except that
 * we don't bother with a return value.
 *
 * dest		output buffer
 * src		nul-terminated input string
 * destsiz	size of dest in bytes; at most destsiz - 1 bytes are copied and
 *			a terminating nul is always stored, unless destsiz is 0, in
 *			which case nothing is written at all
 *
 * This must not trigger ereport(ERROR), as it is called in postmaster.
 * ----------
 */
void
ascii_safe_strlcpy(char *dest, const char *src, size_t destsiz)
{
	if (destsiz == 0)			/* corner case: no room for trailing nul */
		return;

	/* pre-decrement reserves one byte of dest for the terminator */
	while (--destsiz > 0)
	{
		/* use unsigned char here to avoid compiler warning */
		unsigned char ch = *src++;

		if (ch == '\0')
			break;

		/*
		 * Keep printable ASCII characters.  The upper bound is 126: 127 is
		 * DEL, which is a control character and not printable.
		 */
		if (32 <= ch && ch <= 126)
			*dest = ch;
		/* White-space is also OK */
		else if (ch == '\n' || ch == '\r' || ch == '\t')
			*dest = ch;
		/* Everything else is replaced with '?' */
		else
			*dest = '?';
		dest++;
	}

	*dest = '\0';
}
|