Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * copyfromparse.c
4 : * Parse CSV/text/binary format for COPY FROM.
5 : *
6 : * This file contains routines to parse the text, CSV and binary input
7 : * formats. The main entry point is NextCopyFrom(), which parses the
8 : * next input line and returns it as Datums.
9 : *
10 : * In text/CSV mode, the parsing happens in multiple stages:
11 : *
12 : * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13 : * 1. 2. 3. 4.
14 : *
15 : * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 : * places it into 'raw_buf'.
17 : *
18 : * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 : * the data in 'raw_buf' from client to server encoding, placing the
20 : * converted result in 'input_buf'.
21 : *
22 : * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 : * It is responsible for finding the next newline marker, taking quote and
24 : * escape characters into account according to the COPY options. The line
25 : * is copied into 'line_buf', with quotes and escape characters still
26 : * intact.
27 : *
28 : * 4. CopyReadAttributesText/CSV() function takes the input line from
29 : * 'line_buf', and splits it into fields, unescaping the data as required.
30 : * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 : * pointers to each field.
32 : *
33 : * If encoding conversion is not required, a shortcut is taken in step 2 to
34 : * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 : * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 : * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 : * the data is valid in the current encoding.
38 : *
39 : * In binary mode, the pipeline is much simpler. Input is loaded into
40 : * 'raw_buf', and encoding conversion is done in the datatype-specific
41 : * receive functions, if required. 'input_buf' and 'line_buf' are not used,
42 : * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
43 : * data when it's passed to the receive function.
44 : *
45 : * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 : * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 : * and 'attribute_buf' are expanded on demand, to hold the longest line
48 : * encountered so far.
49 : *
50 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
51 : * Portions Copyright (c) 1994, Regents of the University of California
52 : *
53 : *
54 : * IDENTIFICATION
55 : * src/backend/commands/copyfromparse.c
56 : *
57 : *-------------------------------------------------------------------------
58 : */
59 : #include "postgres.h"
60 :
61 : #include <ctype.h>
62 : #include <unistd.h>
63 : #include <sys/stat.h>
64 :
65 : #include "commands/copy.h"
66 : #include "commands/copyfrom_internal.h"
67 : #include "commands/progress.h"
68 : #include "executor/executor.h"
69 : #include "libpq/libpq.h"
70 : #include "libpq/pqformat.h"
71 : #include "mb/pg_wchar.h"
72 : #include "miscadmin.h"
73 : #include "pgstat.h"
74 : #include "port/pg_bswap.h"
75 : #include "utils/builtins.h"
76 : #include "utils/memutils.h"
77 : #include "utils/rel.h"
78 :
79 : #define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
80 : #define OCTVALUE(c) ((c) - '0')
81 :
82 : /*
83 : * These macros centralize code used to process line_buf and input_buf buffers.
84 : * They are macros because they often do continue/break control and to avoid
85 : * function call overhead in tight COPY loops.
86 : *
87 : * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
88 : * prevent the continue/break processing from working. We end the "if (1)"
89 : * with "else ((void) 0)" to ensure the "if" does not unintentionally match
90 : * any "else" in the calling code, and to avoid any compiler warnings about
91 : * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
92 : */
93 :
94 : /*
95 : * This keeps the character read at the top of the loop in the buffer
96 : * even if there is more than one read-ahead.
 *
 * If looking 'extralen' bytes past input_buf_ptr would reach or pass
 * copy_buf_len while hit_eof is still false, the macro rewinds
 * input_buf_ptr to prev_raw_ptr, sets need_data, and 'continue's the
 * enclosing loop.  Otherwise it does nothing.
 *
 * The expansion site must provide the variables input_buf_ptr,
 * prev_raw_ptr, copy_buf_len, hit_eof and need_data, and must be inside a
 * loop for the 'continue' to bind to.
97 : */
98 : #define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
99 : if (1) \
100 : { \
101 : if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
102 : { \
103 : input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
104 : need_data = true; \
105 : continue; \
106 : } \
107 : } else ((void) 0)
108 :
109 : /* This consumes the remainder of the buffer and breaks */
/*
 * The macro acts only when looking 'extralen' bytes past input_buf_ptr
 * would reach or pass copy_buf_len and hit_eof is true.  In that case, a
 * nonzero 'extralen' moves input_buf_ptr to copy_buf_len; then 'result' is
 * set to true and the enclosing loop (or switch) is left with 'break'.
 *
 * The expansion site must provide the variables input_buf_ptr,
 * copy_buf_len, hit_eof and result.
 */
110 : #define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
111 : if (1) \
112 : { \
113 : if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
114 : { \
115 : if (extralen) \
116 : input_buf_ptr = copy_buf_len; /* consume the partial character */ \
117 : /* backslash just before EOF, treat as data char */ \
118 : result = true; \
119 : break; \
120 : } \
121 : } else ((void) 0)
122 :
123 : /*
124 : * Transfer any approved data to line_buf; must do this to be sure
125 : * there is some room in input_buf.
 *
 * "Approved" data is the byte range of cstate->input_buf from
 * cstate->input_buf_index up to (not including) input_buf_ptr.  If that
 * range is non-empty it is appended to cstate->line_buf and
 * cstate->input_buf_index is advanced to input_buf_ptr.
 *
 * The expansion site must provide the variables cstate and input_buf_ptr.
126 : */
127 : #define REFILL_LINEBUF \
128 : if (1) \
129 : { \
130 : if (input_buf_ptr > cstate->input_buf_index) \
131 : { \
132 : appendBinaryStringInfo(&cstate->line_buf, \
133 : cstate->input_buf + cstate->input_buf_index, \
134 : input_buf_ptr - cstate->input_buf_index); \
135 : cstate->input_buf_index = input_buf_ptr; \
136 : } \
137 : } else ((void) 0)
138 :
139 : /* Undo any read-ahead and jump out of the block. */
/*
 * Resets input_buf_ptr to one past prev_raw_ptr and jumps to the label
 * 'not_end_of_copy'.  The expansion site must provide input_buf_ptr,
 * prev_raw_ptr and that label.
 */
140 : #define NO_END_OF_COPY_GOTO \
141 : if (1) \
142 : { \
143 : input_buf_ptr = prev_raw_ptr + 1; \
144 : goto not_end_of_copy; \
145 : } else ((void) 0)
146 :
/*
 * Signature that ReceiveCopyBinaryHeader() compares against the first 11
 * bytes of binary COPY input.  The array is sized for exactly the 11
 * characters spelled out in the literal, so the implicit terminating NUL of
 * the string literal is not stored.
 */
147 : /* NOTE: there's a copy of this in copyto.c */
148 : static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
149 :
150 :
151 : /* non-export function prototypes */
152 : static bool CopyReadLine(CopyFromState cstate);
153 : static bool CopyReadLineText(CopyFromState cstate);
154 : static int CopyReadAttributesText(CopyFromState cstate);
155 : static int CopyReadAttributesCSV(CopyFromState cstate);
156 : static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
157 : Oid typioparam, int32 typmod,
158 : bool *isnull);
159 :
160 :
161 : /* Low-level communications functions */
162 : static int CopyGetData(CopyFromState cstate, void *databuf,
163 : int minread, int maxread);
164 : static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
165 : static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
166 : static void CopyLoadInputBuf(CopyFromState cstate);
167 : static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
168 :
169 : void
867 heikki.linnakangas 170 CBC 411 : ReceiveCopyBegin(CopyFromState cstate)
171 : {
172 : StringInfoData buf;
766 173 411 : int natts = list_length(cstate->attnumlist);
174 411 : int16 format = (cstate->opts.binary ? 1 : 0);
175 : int i;
176 :
177 411 : pq_beginmessage(&buf, 'G');
178 411 : pq_sendbyte(&buf, format); /* overall format */
179 411 : pq_sendint16(&buf, natts);
180 1368 : for (i = 0; i < natts; i++)
181 957 : pq_sendint16(&buf, format); /* per-column formats */
182 411 : pq_endmessage(&buf);
183 411 : cstate->copy_src = COPY_FRONTEND;
184 411 : cstate->fe_msgbuf = makeStringInfo();
185 : /* We *must* flush here to ensure FE knows it can send. */
867 186 411 : pq_flush();
187 411 : }
188 :
189 : void
190 7 : ReceiveCopyBinaryHeader(CopyFromState cstate)
191 : {
192 : char readSig[11];
193 : int32 tmp;
194 :
195 : /* Signature */
196 7 : if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
197 7 : memcmp(readSig, BinarySignature, 11) != 0)
867 heikki.linnakangas 198 UBC 0 : ereport(ERROR,
199 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
200 : errmsg("COPY file signature not recognized")));
201 : /* Flags field */
867 heikki.linnakangas 202 CBC 7 : if (!CopyGetInt32(cstate, &tmp))
867 heikki.linnakangas 203 UBC 0 : ereport(ERROR,
204 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
205 : errmsg("invalid COPY file header (missing flags)")));
867 heikki.linnakangas 206 CBC 7 : if ((tmp & (1 << 16)) != 0)
867 heikki.linnakangas 207 UBC 0 : ereport(ERROR,
208 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
209 : errmsg("invalid COPY file header (WITH OIDS)")));
867 heikki.linnakangas 210 CBC 7 : tmp &= ~(1 << 16);
211 7 : if ((tmp >> 16) != 0)
867 heikki.linnakangas 212 UBC 0 : ereport(ERROR,
213 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
214 : errmsg("unrecognized critical flags in COPY file header")));
215 : /* Header extension length */
867 heikki.linnakangas 216 CBC 7 : if (!CopyGetInt32(cstate, &tmp) ||
217 7 : tmp < 0)
867 heikki.linnakangas 218 UBC 0 : ereport(ERROR,
219 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
220 : errmsg("invalid COPY file header (missing length)")));
221 : /* Skip extension header, if present */
867 heikki.linnakangas 222 CBC 7 : while (tmp-- > 0)
223 : {
867 heikki.linnakangas 224 UBC 0 : if (CopyReadBinaryData(cstate, readSig, 1) != 1)
225 0 : ereport(ERROR,
226 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
227 : errmsg("invalid COPY file header (wrong length)")));
228 : }
867 heikki.linnakangas 229 CBC 7 : }
230 :
231 : /*
232 : * CopyGetData reads data from the source (file or frontend)
233 : *
234 : * We attempt to read at least minread, and at most maxread, bytes from
235 : * the source. The actual number of bytes read is returned; if this is
236 : * less than minread, EOF was detected.
237 : *
238 : * Note: when copying from the frontend, we expect a proper EOF mark per
239 : * protocol; if the frontend simply drops the connection, we raise error.
240 : * It seems unwise to allow the COPY IN to complete normally in that case.
241 : *
242 : * NB: no data conversion is applied here.
243 : */
244 : static int
245 215790 : CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
246 : {
247 215790 : int bytesread = 0;
248 :
249 215790 : switch (cstate->copy_src)
250 : {
251 1003 : case COPY_FILE:
252 1003 : bytesread = fread(databuf, 1, maxread, cstate->copy_file);
253 1003 : if (ferror(cstate->copy_file))
867 heikki.linnakangas 254 UBC 0 : ereport(ERROR,
255 : (errcode_for_file_access(),
256 : errmsg("could not read from COPY file: %m")));
867 heikki.linnakangas 257 CBC 1003 : if (bytesread == 0)
738 258 446 : cstate->raw_reached_eof = true;
867 259 1003 : break;
766 260 200922 : case COPY_FRONTEND:
738 261 401470 : while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
262 : {
263 : int avail;
264 :
867 265 401430 : while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
266 : {
267 : /* Try to receive another message */
268 : int mtype;
269 : int maxmsglen;
270 :
271 200882 : readmessage:
272 200882 : HOLD_CANCEL_INTERRUPTS();
273 200882 : pq_startmsgread();
274 200882 : mtype = pq_getbyte();
275 200882 : if (mtype == EOF)
867 heikki.linnakangas 276 UBC 0 : ereport(ERROR,
277 : (errcode(ERRCODE_CONNECTION_FAILURE),
278 : errmsg("unexpected EOF on client connection with an open transaction")));
279 : /* Validate message type and set packet size limit */
280 : switch (mtype)
281 : {
711 tgl 282 CBC 200548 : case 'd': /* CopyData */
283 200548 : maxmsglen = PQ_LARGE_MESSAGE_LIMIT;
284 200548 : break;
285 334 : case 'c': /* CopyDone */
286 : case 'f': /* CopyFail */
287 : case 'H': /* Flush */
288 : case 'S': /* Sync */
289 334 : maxmsglen = PQ_SMALL_MESSAGE_LIMIT;
290 334 : break;
711 tgl 291 UBC 0 : default:
292 0 : ereport(ERROR,
293 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
294 : errmsg("unexpected message type 0x%02X during COPY from stdin",
295 : mtype)));
296 : maxmsglen = 0; /* keep compiler quiet */
297 : break;
298 : }
299 : /* Now collect the message body */
711 tgl 300 CBC 200882 : if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
867 heikki.linnakangas 301 UBC 0 : ereport(ERROR,
302 : (errcode(ERRCODE_CONNECTION_FAILURE),
303 : errmsg("unexpected EOF on client connection with an open transaction")));
867 heikki.linnakangas 304 CBC 200882 : RESUME_CANCEL_INTERRUPTS();
305 : /* ... and process it */
306 : switch (mtype)
307 : {
308 200548 : case 'd': /* CopyData */
309 200548 : break;
310 334 : case 'c': /* CopyDone */
311 : /* COPY IN correctly terminated by frontend */
738 312 334 : cstate->raw_reached_eof = true;
867 313 334 : return bytesread;
867 heikki.linnakangas 314 UBC 0 : case 'f': /* CopyFail */
315 0 : ereport(ERROR,
316 : (errcode(ERRCODE_QUERY_CANCELED),
317 : errmsg("COPY from stdin failed: %s",
318 : pq_getmsgstring(cstate->fe_msgbuf))));
319 : break;
320 0 : case 'H': /* Flush */
321 : case 'S': /* Sync */
322 :
323 : /*
324 : * Ignore Flush/Sync for the convenience of client
325 : * libraries (such as libpq) that may send those
326 : * without noticing that the command they just
327 : * sent was COPY.
328 : */
329 0 : goto readmessage;
330 0 : default:
711 tgl 331 0 : Assert(false); /* NOT REACHED */
332 : }
333 : }
867 heikki.linnakangas 334 CBC 200548 : avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
335 200548 : if (avail > maxread)
867 heikki.linnakangas 336 UBC 0 : avail = maxread;
867 heikki.linnakangas 337 CBC 200548 : pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
338 200548 : databuf = (void *) ((char *) databuf + avail);
339 200548 : maxread -= avail;
340 200548 : bytesread += avail;
341 : }
342 200588 : break;
343 13865 : case COPY_CALLBACK:
344 13865 : bytesread = cstate->data_source_cb(databuf, minread, maxread);
345 13865 : break;
346 : }
347 :
348 215456 : return bytesread;
349 : }
350 :
351 :
352 : /*
353 : * These functions do apply some data conversion
354 : */
355 :
356 : /*
357 : * CopyGetInt32 reads an int32 that appears in network byte order
358 : *
359 : * Returns true if OK, false if EOF
360 : */
361 : static inline bool
362 93 : CopyGetInt32(CopyFromState cstate, int32 *val)
363 : {
364 : uint32 buf;
365 :
366 93 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
367 : {
867 heikki.linnakangas 368 UBC 0 : *val = 0; /* suppress compiler warning */
369 0 : return false;
370 : }
867 heikki.linnakangas 371 CBC 93 : *val = (int32) pg_ntoh32(buf);
372 93 : return true;
373 : }
374 :
375 : /*
376 : * CopyGetInt16 reads an int16 that appears in network byte order
377 : */
378 : static inline bool
379 21 : CopyGetInt16(CopyFromState cstate, int16 *val)
380 : {
381 : uint16 buf;
382 :
383 21 : if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
384 : {
867 heikki.linnakangas 385 UBC 0 : *val = 0; /* suppress compiler warning */
386 0 : return false;
387 : }
867 heikki.linnakangas 388 CBC 21 : *val = (int16) pg_ntoh16(buf);
389 21 : return true;
390 : }
391 :
392 :
393 : /*
394 : * Perform encoding conversion on data in 'raw_buf', writing the converted
395 : * data into 'input_buf'.
396 : *
397 : * On entry, there must be some data to convert in 'raw_buf'.
398 : */
399 : static void
738 400 430876 : CopyConvertBuf(CopyFromState cstate)
401 : {
402 : /*
403 : * If the file and server encoding are the same, no encoding conversion is
404 : * required. However, we still need to verify that the input is valid for
405 : * the encoding.
406 : */
407 430876 : if (!cstate->need_transcoding)
408 : {
409 : /*
410 : * When conversion is not required, input_buf and raw_buf are the
411 : * same. raw_buf_len is the total number of bytes in the buffer, and
412 : * input_buf_len tracks how many of those bytes have already been
413 : * verified.
414 : */
415 430876 : int preverifiedlen = cstate->input_buf_len;
416 430876 : int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
417 : int nverified;
418 :
419 430876 : if (unverifiedlen == 0)
420 : {
421 : /*
422 : * If no more raw data is coming, report the EOF to the caller.
423 : */
424 216071 : if (cstate->raw_reached_eof)
425 633 : cstate->input_reached_eof = true;
426 216071 : return;
427 : }
428 :
429 : /*
430 : * Verify the new data, including any residual unverified bytes from
431 : * previous round.
432 : */
433 214805 : nverified = pg_encoding_verifymbstr(cstate->file_encoding,
434 214805 : cstate->raw_buf + preverifiedlen,
435 : unverifiedlen);
436 214805 : if (nverified == 0)
437 : {
438 : /*
439 : * Could not verify anything.
440 : *
441 : * If there is no more raw input data coming, it means that there
442 : * was an incomplete multi-byte sequence at the end. Also, if
443 : * there's "enough" input left, we should be able to verify at
444 : * least one character, and a failure to do so means that we've
445 : * hit an invalid byte sequence.
446 : */
315 heikki.linnakangas 447 UBC 0 : if (cstate->raw_reached_eof || unverifiedlen >= pg_encoding_max_length(cstate->file_encoding))
738 448 0 : cstate->input_reached_error = true;
449 0 : return;
450 : }
738 heikki.linnakangas 451 CBC 214805 : cstate->input_buf_len += nverified;
452 : }
453 : else
454 : {
455 : /*
456 : * Encoding conversion is needed.
457 : */
458 : int nbytes;
459 : unsigned char *src;
460 : int srclen;
461 : unsigned char *dst;
462 : int dstlen;
463 : int convertedlen;
464 :
738 heikki.linnakangas 465 UBC 0 : if (RAW_BUF_BYTES(cstate) == 0)
466 : {
467 : /*
468 : * If no more raw data is coming, report the EOF to the caller.
469 : */
470 0 : if (cstate->raw_reached_eof)
471 0 : cstate->input_reached_eof = true;
472 0 : return;
473 : }
474 :
475 : /*
476 : * First, copy down any unprocessed data.
477 : */
478 0 : nbytes = INPUT_BUF_BYTES(cstate);
479 0 : if (nbytes > 0 && cstate->input_buf_index > 0)
480 0 : memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
481 : nbytes);
482 0 : cstate->input_buf_index = 0;
483 0 : cstate->input_buf_len = nbytes;
484 0 : cstate->input_buf[nbytes] = '\0';
485 :
486 0 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
487 0 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
488 0 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
489 0 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
490 :
491 : /*
492 : * Do the conversion. This might stop short, if there is an invalid
493 : * byte sequence in the input. We'll convert as much as we can in
494 : * that case.
495 : *
496 : * Note: Even if we hit an invalid byte sequence, we don't report the
497 : * error until all the valid bytes have been consumed. The input
498 : * might contain an end-of-input marker (\.), and we don't want to
499 : * report an error if the invalid byte sequence is after the
500 : * end-of-input marker. We might unnecessarily convert some data
501 : * after the end-of-input marker as long as it's valid for the
502 : * encoding, but that's harmless.
503 : */
504 0 : convertedlen = pg_do_encoding_conversion_buf(cstate->conversion_proc,
505 : cstate->file_encoding,
506 : GetDatabaseEncoding(),
507 : src, srclen,
508 : dst, dstlen,
509 : true);
510 0 : if (convertedlen == 0)
511 : {
512 : /*
513 : * Could not convert anything. If there is no more raw input data
514 : * coming, it means that there was an incomplete multi-byte
515 : * sequence at the end. Also, if there is plenty of input left,
516 : * we should be able to convert at least one character, so a
517 : * failure to do so must mean that we've hit a byte sequence
518 : * that's invalid.
519 : */
520 0 : if (cstate->raw_reached_eof || srclen >= MAX_CONVERSION_INPUT_LENGTH)
521 0 : cstate->input_reached_error = true;
522 0 : return;
523 : }
524 0 : cstate->raw_buf_index += convertedlen;
525 0 : cstate->input_buf_len += strlen((char *) dst);
526 : }
527 : }
528 :
529 : /*
530 : * Report an encoding or conversion error.
531 : */
532 : static void
533 0 : CopyConversionError(CopyFromState cstate)
534 : {
535 0 : Assert(cstate->raw_buf_len > 0);
536 0 : Assert(cstate->input_reached_error);
537 :
538 0 : if (!cstate->need_transcoding)
539 : {
540 : /*
541 : * Everything up to input_buf_len was successfully verified, and
542 : * input_buf_len points to the invalid or incomplete character.
543 : */
544 0 : report_invalid_encoding(cstate->file_encoding,
545 0 : cstate->raw_buf + cstate->input_buf_len,
546 0 : cstate->raw_buf_len - cstate->input_buf_len);
547 : }
548 : else
549 : {
550 : /*
551 : * raw_buf_index points to the invalid or untranslatable character. We
552 : * let the conversion routine report the error, because it can provide
553 : * a more specific error message than we could here. An earlier call
554 : * to the conversion routine in CopyConvertBuf() detected that there
555 : * is an error, now we call the conversion routine again with
556 : * noError=false, to have it throw the error.
557 : */
558 : unsigned char *src;
559 : int srclen;
560 : unsigned char *dst;
561 : int dstlen;
562 :
563 0 : src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
564 0 : srclen = cstate->raw_buf_len - cstate->raw_buf_index;
565 0 : dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
566 0 : dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
567 :
568 0 : (void) pg_do_encoding_conversion_buf(cstate->conversion_proc,
569 : cstate->file_encoding,
570 : GetDatabaseEncoding(),
571 : src, srclen,
572 : dst, dstlen,
573 : false);
574 :
575 : /*
576 : * The conversion routine should have reported an error, so this
577 : * should not be reached.
578 : */
579 0 : elog(ERROR, "encoding conversion failed without error");
580 : }
581 : }
582 :
583 : /*
584 : * Load more data from data source to raw_buf.
585 : *
586 : * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
587 : * beginning of the buffer, and we load new data after that.
588 : */
589 : static void
867 heikki.linnakangas 590 CBC 215456 : CopyLoadRawBuf(CopyFromState cstate)
591 : {
592 : int nbytes;
593 : int inbytes;
594 :
595 : /*
596 : * In text mode, if encoding conversion is not required, raw_buf and
597 : * input_buf point to the same buffer. Their len/index better agree, too.
598 : */
738 599 215456 : if (cstate->raw_buf == cstate->input_buf)
600 : {
601 215438 : Assert(!cstate->need_transcoding);
602 215438 : Assert(cstate->raw_buf_index == cstate->input_buf_index);
603 215438 : Assert(cstate->input_buf_len <= cstate->raw_buf_len);
604 : }
605 :
606 : /*
607 : * Copy down the unprocessed data if any.
608 : */
609 215456 : nbytes = RAW_BUF_BYTES(cstate);
610 215456 : if (nbytes > 0 && cstate->raw_buf_index > 0)
867 heikki.linnakangas 611 UBC 0 : memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
612 : nbytes);
738 heikki.linnakangas 613 CBC 215456 : cstate->raw_buf_len -= cstate->raw_buf_index;
614 215456 : cstate->raw_buf_index = 0;
615 :
616 : /*
617 : * If raw_buf and input_buf are in fact the same buffer, adjust the
618 : * input_buf variables, too.
619 : */
620 215456 : if (cstate->raw_buf == cstate->input_buf)
621 : {
622 215438 : cstate->input_buf_len -= cstate->input_buf_index;
623 215438 : cstate->input_buf_index = 0;
624 : }
625 :
626 : /* Load more data */
627 215456 : inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
628 215456 : 1, RAW_BUF_SIZE - cstate->raw_buf_len);
867 629 215456 : nbytes += inbytes;
630 215456 : cstate->raw_buf[nbytes] = '\0';
631 215456 : cstate->raw_buf_len = nbytes;
632 :
794 633 215456 : cstate->bytes_processed += inbytes;
823 tomas.vondra 634 215456 : pgstat_progress_update_param(PROGRESS_COPY_BYTES_PROCESSED, cstate->bytes_processed);
635 :
738 heikki.linnakangas 636 215456 : if (inbytes == 0)
637 639 : cstate->raw_reached_eof = true;
638 215456 : }
639 :
640 : /*
641 : * CopyLoadInputBuf loads some more data into input_buf
642 : *
643 : * On return, at least one more input character is loaded into
644 : * input_buf, or input_reached_eof is set.
645 : *
646 : * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
647 : * of the buffer and then we load more data after that.
648 : */
649 : static void
650 215438 : CopyLoadInputBuf(CopyFromState cstate)
651 : {
652 215438 : int nbytes = INPUT_BUF_BYTES(cstate);
653 :
654 : /*
655 : * The caller has updated input_buf_index to indicate how much of the
656 : * input has been consumed and isn't needed anymore. If input_buf is the
657 : * same physical area as raw_buf, update raw_buf_index accordingly.
658 : */
659 215438 : if (cstate->raw_buf == cstate->input_buf)
660 : {
661 215438 : Assert(!cstate->need_transcoding);
662 215438 : Assert(cstate->input_buf_index >= cstate->raw_buf_index);
663 215438 : cstate->raw_buf_index = cstate->input_buf_index;
664 : }
665 :
666 : for (;;)
667 : {
668 : /* If we now have some unconverted data, try to convert it */
669 430876 : CopyConvertBuf(cstate);
670 :
671 : /* If we now have some more input bytes ready, return them */
672 430876 : if (INPUT_BUF_BYTES(cstate) > nbytes)
673 214805 : return;
674 :
675 : /*
676 : * If we reached an invalid byte sequence, or we're at an incomplete
677 : * multi-byte character but there is no more raw input data, report
678 : * conversion error.
679 : */
680 216071 : if (cstate->input_reached_error)
738 heikki.linnakangas 681 UBC 0 : CopyConversionError(cstate);
682 :
683 : /* no more input, and everything has been converted */
738 heikki.linnakangas 684 CBC 216071 : if (cstate->input_reached_eof)
685 633 : break;
686 :
687 : /* Try to load more raw data */
688 215438 : Assert(!cstate->raw_reached_eof);
689 215438 : CopyLoadRawBuf(cstate);
690 : }
691 : }
692 :
693 : /*
694 : * CopyReadBinaryData
695 : *
696 : * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
697 : * and writes them to 'dest'. Returns the number of bytes read (which
698 : * would be less than 'nbytes' only if we reach EOF).
699 : */
700 : static int
867 701 191 : CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
702 : {
703 191 : int copied_bytes = 0;
704 :
705 191 : if (RAW_BUF_BYTES(cstate) >= nbytes)
706 : {
707 : /* Enough bytes are present in the buffer. */
708 173 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
709 173 : cstate->raw_buf_index += nbytes;
710 173 : copied_bytes = nbytes;
711 : }
712 : else
713 : {
714 : /*
715 : * Not enough bytes in the buffer, so must read from the file. Need
716 : * to loop since 'nbytes' could be larger than the buffer size.
717 : */
718 : do
719 : {
720 : int copy_bytes;
721 :
722 : /* Load more data if buffer is empty. */
723 18 : if (RAW_BUF_BYTES(cstate) == 0)
724 : {
738 725 18 : CopyLoadRawBuf(cstate);
726 18 : if (cstate->raw_reached_eof)
867 727 6 : break; /* EOF */
728 : }
729 :
730 : /* Transfer some bytes. */
731 12 : copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
732 12 : memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
733 12 : cstate->raw_buf_index += copy_bytes;
734 12 : dest += copy_bytes;
735 12 : copied_bytes += copy_bytes;
736 12 : } while (copied_bytes < nbytes);
737 : }
738 :
739 191 : return copied_bytes;
740 : }
741 :
742 : /*
743 : * Read raw fields in the next line for COPY FROM in text or csv mode.
744 : * Return false if no more lines.
745 : *
746 : * An internal temporary buffer is returned via 'fields'. It is valid until
747 : * the next call of the function. Since the function returns all raw fields
748 : * in the input file, 'nfields' could be different from the number of columns
749 : * in the relation.
750 : *
751 : * NOTE: force_not_null option are not applied to the returned fields.
752 : */
753 : bool
754 894910 : NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
755 : {
756 : int fldct;
757 : bool done;
758 :
759 : /* only available for text or csv input */
760 894910 : Assert(!cstate->opts.binary);
761 :
762 : /* on input check that the header line is correct if needed */
763 894910 : if (cstate->cur_lineno == 0 && cstate->opts.header_line)
764 : {
765 : ListCell *cur;
766 : TupleDesc tupDesc;
767 :
375 peter 768 55 : tupDesc = RelationGetDescr(cstate->rel);
769 :
867 heikki.linnakangas 770 55 : cstate->cur_lineno++;
375 peter 771 55 : done = CopyReadLine(cstate);
772 :
773 55 : if (cstate->opts.header_line == COPY_HEADER_MATCH)
774 : {
775 : int fldnum;
776 :
777 38 : if (cstate->opts.csv_mode)
778 5 : fldct = CopyReadAttributesCSV(cstate);
779 : else
780 33 : fldct = CopyReadAttributesText(cstate);
781 :
782 38 : if (fldct != list_length(cstate->attnumlist))
783 12 : ereport(ERROR,
784 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
785 : errmsg("wrong number of fields in header line: got %d, expected %d",
786 : fldct, list_length(cstate->attnumlist))));
787 :
788 26 : fldnum = 0;
789 79 : foreach(cur, cstate->attnumlist)
790 : {
791 63 : int attnum = lfirst_int(cur);
792 : char *colName;
793 63 : Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
794 :
290 michael 795 63 : Assert(fldnum < cstate->max_fields);
796 :
797 63 : colName = cstate->raw_fields[fldnum++];
375 peter 798 63 : if (colName == NULL)
799 3 : ereport(ERROR,
800 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
801 : errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
802 : fldnum, cstate->opts.null_print, NameStr(attr->attname))));
803 :
332 tgl 804 60 : if (namestrcmp(&attr->attname, colName) != 0)
805 : {
375 peter 806 7 : ereport(ERROR,
807 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
808 : errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
809 : fldnum, colName, NameStr(attr->attname))));
810 : }
811 : }
812 : }
813 :
814 33 : if (done)
375 peter 815 UBC 0 : return false;
816 : }
817 :
867 heikki.linnakangas 818 CBC 894888 : cstate->cur_lineno++;
819 :
820 : /* Actually read the line into memory here */
821 894888 : done = CopyReadLine(cstate);
822 :
823 : /*
824 : * EOF at start of line means we're done. If we see EOF after some
825 : * characters, we act as though it was newline followed by EOF, ie,
826 : * process the line and then exit loop on next iteration.
827 : */
828 894888 : if (done && cstate->line_buf.len == 0)
829 927 : return false;
830 :
831 : /* Parse the line into de-escaped field values */
832 893961 : if (cstate->opts.csv_mode)
833 188 : fldct = CopyReadAttributesCSV(cstate);
834 : else
835 893773 : fldct = CopyReadAttributesText(cstate);
836 :
837 893955 : *fields = cstate->raw_fields;
838 893955 : *nfields = fldct;
839 893955 : return true;
840 : }
841 :
842 : /*
843 : * Read next tuple from file for COPY FROM. Return false if no more tuples.
844 : *
845 : * 'econtext' is used to evaluate default expression for each column that is
846 : * either not read from the file or is using the DEFAULT option of COPY FROM.
847 : * It can be NULL when no default values are used, i.e. when all columns are
848 : * read from the file, and DEFAULT option is unset.
849 : *
850 : * 'values' and 'nulls' arrays must be the same length as columns of the
851 : * relation passed to BeginCopyFrom. This function fills the arrays.
852 : */
853 : bool
867 heikki.linnakangas 854 GIC 894931 : NextCopyFrom(CopyFromState cstate, ExprContext *econtext,
867 heikki.linnakangas 855 ECB : Datum *values, bool *nulls)
856 : {
857 : TupleDesc tupDesc;
858 : AttrNumber num_phys_attrs,
859 : attr_count,
867 heikki.linnakangas 860 GIC 894931 : num_defaults = cstate->num_defaults;
867 heikki.linnakangas 861 CBC 894931 : FmgrInfo *in_functions = cstate->in_functions;
862 894931 : Oid *typioparams = cstate->typioparams;
867 heikki.linnakangas 863 ECB : int i;
867 heikki.linnakangas 864 GIC 894931 : int *defmap = cstate->defmap;
867 heikki.linnakangas 865 CBC 894931 : ExprState **defexprs = cstate->defexprs;
867 heikki.linnakangas 866 ECB :
867 heikki.linnakangas 867 GIC 894931 : tupDesc = RelationGetDescr(cstate->rel);
867 heikki.linnakangas 868 CBC 894931 : num_phys_attrs = tupDesc->natts;
869 894931 : attr_count = list_length(cstate->attnumlist);
867 heikki.linnakangas 870 ECB :
871 : /* Initialize all values for row to NULL */
867 heikki.linnakangas 872 GIC 4580403 : MemSet(values, 0, num_phys_attrs * sizeof(Datum));
867 heikki.linnakangas 873 CBC 894931 : MemSet(nulls, true, num_phys_attrs * sizeof(bool));
27 andrew 874 GNC 894931 : cstate->defaults = (bool *) palloc0(num_phys_attrs * sizeof(bool));
867 heikki.linnakangas 875 ECB :
867 heikki.linnakangas 876 CBC 894931 : if (!cstate->opts.binary)
877 : {
867 heikki.linnakangas 878 ECB : char **field_strings;
879 : ListCell *cur;
880 : int fldct;
881 : int fieldno;
882 : char *string;
883 :
884 : /* read raw fields in the next line */
867 heikki.linnakangas 885 GIC 894910 : if (!NextCopyFromRawFields(cstate, &field_strings, &fldct))
886 927 : return false;
867 heikki.linnakangas 887 ECB :
888 : /* check for overflowing fields */
867 heikki.linnakangas 889 GIC 893955 : if (attr_count > 0 && fldct > attr_count)
890 6 : ereport(ERROR,
867 heikki.linnakangas 891 ECB : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
892 : errmsg("extra data after last expected column")));
893 :
867 heikki.linnakangas 894 GIC 893949 : fieldno = 0;
895 :
867 heikki.linnakangas 896 ECB : /* Loop to read the user attributes on the line. */
867 heikki.linnakangas 897 GIC 4346464 : foreach(cur, cstate->attnumlist)
898 : {
867 heikki.linnakangas 899 CBC 3452534 : int attnum = lfirst_int(cur);
867 heikki.linnakangas 900 GIC 3452534 : int m = attnum - 1;
867 heikki.linnakangas 901 CBC 3452534 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
867 heikki.linnakangas 902 ECB :
867 heikki.linnakangas 903 CBC 3452534 : if (fieldno >= fldct)
867 heikki.linnakangas 904 GIC 6 : ereport(ERROR,
867 heikki.linnakangas 905 ECB : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
906 : errmsg("missing data for column \"%s\"",
907 : NameStr(att->attname))));
867 heikki.linnakangas 908 GIC 3452528 : string = field_strings[fieldno++];
909 :
867 heikki.linnakangas 910 CBC 3452528 : if (cstate->convert_select_flags &&
867 heikki.linnakangas 911 GIC 10 : !cstate->convert_select_flags[m])
867 heikki.linnakangas 912 ECB : {
913 : /* ignore input field, leaving column as NULL */
867 heikki.linnakangas 914 GIC 5 : continue;
915 : }
867 heikki.linnakangas 916 ECB :
867 heikki.linnakangas 917 GIC 3452523 : if (cstate->opts.csv_mode)
918 : {
867 heikki.linnakangas 919 CBC 399 : if (string == NULL &&
867 heikki.linnakangas 920 GIC 16 : cstate->opts.force_notnull_flags[m])
867 heikki.linnakangas 921 ECB : {
922 : /*
923 : * FORCE_NOT_NULL option is set and column is NULL -
924 : * convert it to the NULL string.
925 : */
867 heikki.linnakangas 926 GIC 8 : string = cstate->opts.null_print;
927 : }
867 heikki.linnakangas 928 CBC 391 : else if (string != NULL && cstate->opts.force_null_flags[m]
867 heikki.linnakangas 929 GIC 10 : && strcmp(string, cstate->opts.null_print) == 0)
867 heikki.linnakangas 930 ECB : {
931 : /*
932 : * FORCE_NULL option is set and column matches the NULL
933 : * string. It must have been quoted, or otherwise the
934 : * string would already have been set to NULL. Convert it
935 : * to NULL as specified.
936 : */
867 heikki.linnakangas 937 GIC 7 : string = NULL;
938 : }
867 heikki.linnakangas 939 ECB : }
940 :
867 heikki.linnakangas 941 GIC 3452523 : cstate->cur_attname = NameStr(att->attname);
942 3452523 : cstate->cur_attval = string;
943 :
867 heikki.linnakangas 944 CBC 3452523 : if (string != NULL)
867 heikki.linnakangas 945 GIC 3450135 : nulls[m] = false;
946 :
27 andrew 947 GNC 3452523 : if (cstate->defaults[m])
948 : {
949 : /*
950 : * The caller must supply econtext and have switched into the
951 : * per-tuple memory context in it.
952 : */
953 30 : Assert(econtext != NULL);
954 30 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
955 :
956 30 : values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
957 : }
958 : else
959 3452480 : values[m] = InputFunctionCall(&in_functions[m],
960 : string,
961 3452493 : typioparams[m],
962 : att->atttypmod);
963 :
867 heikki.linnakangas 964 CBC 3452510 : cstate->cur_attname = NULL;
867 heikki.linnakangas 965 GIC 3452510 : cstate->cur_attval = NULL;
966 : }
967 :
968 893930 : Assert(fieldno == attr_count);
969 : }
867 heikki.linnakangas 970 ECB : else
971 : {
972 : /* binary */
973 : int16 fld_count;
974 : ListCell *cur;
975 :
867 heikki.linnakangas 976 CBC 21 : cstate->cur_lineno++;
977 :
978 21 : if (!CopyGetInt16(cstate, &fld_count))
979 : {
980 : /* EOF detected (end of file, or protocol-level EOF) */
981 6 : return false;
867 heikki.linnakangas 982 ECB : }
983 :
867 heikki.linnakangas 984 GIC 21 : if (fld_count == -1)
867 heikki.linnakangas 985 ECB : {
986 : /*
987 : * Received EOF marker. Wait for the protocol-level EOF, and
988 : * complain if it doesn't come immediately. In COPY FROM STDIN,
989 : * this ensures that we correctly handle CopyFail, if client
990 : * chooses to send that now. When copying from file, we could
991 : * ignore the rest of the file like in text mode, but we choose to
992 : * be consistent with the COPY FROM STDIN case.
993 : */
994 : char dummy;
995 :
766 heikki.linnakangas 996 GIC 6 : if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
867 heikki.linnakangas 997 UIC 0 : ereport(ERROR,
867 heikki.linnakangas 998 ECB : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
999 : errmsg("received copy data after EOF marker")));
867 heikki.linnakangas 1000 GIC 6 : return false;
867 heikki.linnakangas 1001 ECB : }
1002 :
867 heikki.linnakangas 1003 GIC 15 : if (fld_count != attr_count)
867 heikki.linnakangas 1004 UIC 0 : ereport(ERROR,
1005 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1006 : errmsg("row field count is %d, expected %d",
1007 : (int) fld_count, attr_count)));
1008 :
867 heikki.linnakangas 1009 GIC 93 : foreach(cur, cstate->attnumlist)
1010 : {
1011 79 : int attnum = lfirst_int(cur);
1012 79 : int m = attnum - 1;
867 heikki.linnakangas 1013 CBC 79 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
867 heikki.linnakangas 1014 EUB :
867 heikki.linnakangas 1015 GIC 79 : cstate->cur_attname = NameStr(att->attname);
1016 157 : values[m] = CopyReadBinaryAttribute(cstate,
867 heikki.linnakangas 1017 CBC 79 : &in_functions[m],
867 heikki.linnakangas 1018 GIC 79 : typioparams[m],
1019 : att->atttypmod,
867 heikki.linnakangas 1020 ECB : &nulls[m]);
867 heikki.linnakangas 1021 GBC 78 : cstate->cur_attname = NULL;
1022 : }
1023 : }
1024 :
1025 : /*
867 heikki.linnakangas 1026 ECB : * Now compute and insert any defaults available for the columns not
1027 : * provided by the input data. Anything not processed here or above will
1028 : * remain NULL.
1029 : */
867 heikki.linnakangas 1030 CBC 894209 : for (i = 0; i < num_defaults; i++)
1031 : {
867 heikki.linnakangas 1032 ECB : /*
1033 : * The caller must supply econtext and have switched into the
1034 : * per-tuple memory context in it.
1035 : */
867 heikki.linnakangas 1036 GIC 265 : Assert(econtext != NULL);
1037 265 : Assert(CurrentMemoryContext == econtext->ecxt_per_tuple_memory);
867 heikki.linnakangas 1038 ECB :
27 andrew 1039 GNC 265 : values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
867 heikki.linnakangas 1040 GIC 265 : &nulls[defmap[i]]);
1041 : }
1042 :
27 andrew 1043 GNC 893944 : pfree(cstate->defaults);
1044 :
867 heikki.linnakangas 1045 GIC 893944 : return true;
1046 : }
1047 :
1048 : /*
738 heikki.linnakangas 1049 ECB : * Read the next input line and stash it in line_buf.
1050 : *
1051 : * Result is true if read was terminated by EOF, false if terminated
1052 : * by newline. The terminating newline or EOF marker is not included
1053 : * in the final value of line_buf.
1054 : */
867 1055 : static bool
867 heikki.linnakangas 1056 CBC 894943 : CopyReadLine(CopyFromState cstate)
1057 : {
867 heikki.linnakangas 1058 ECB : bool result;
1059 :
867 heikki.linnakangas 1060 GIC 894943 : resetStringInfo(&cstate->line_buf);
738 1061 894943 : cstate->line_buf_valid = false;
867 heikki.linnakangas 1062 ECB :
1063 : /* Parse data and transfer into line_buf */
867 heikki.linnakangas 1064 CBC 894943 : result = CopyReadLineText(cstate);
1065 :
867 heikki.linnakangas 1066 GIC 894943 : if (result)
1067 : {
1068 : /*
1069 : * Reached EOF. In protocol version 3, we should ignore anything
1070 : * after \. up to the protocol end of copy data. (XXX maybe better
1071 : * not to treat \. as special?)
1072 : */
766 1073 927 : if (cstate->copy_src == COPY_FRONTEND)
1074 : {
738 heikki.linnakangas 1075 ECB : int inbytes;
1076 :
1077 : do
1078 : {
738 heikki.linnakangas 1079 CBC 334 : inbytes = CopyGetData(cstate, cstate->input_buf,
738 heikki.linnakangas 1080 ECB : 1, INPUT_BUF_SIZE);
738 heikki.linnakangas 1081 GIC 334 : } while (inbytes > 0);
1082 334 : cstate->input_buf_index = 0;
738 heikki.linnakangas 1083 CBC 334 : cstate->input_buf_len = 0;
738 heikki.linnakangas 1084 GIC 334 : cstate->raw_buf_index = 0;
738 heikki.linnakangas 1085 CBC 334 : cstate->raw_buf_len = 0;
1086 : }
1087 : }
1088 : else
1089 : {
1090 : /*
1091 : * If we didn't hit EOF, then we must have transferred the EOL marker
867 heikki.linnakangas 1092 ECB : * to line_buf along with the data. Get rid of it.
1093 : */
867 heikki.linnakangas 1094 GIC 894016 : switch (cstate->eol_type)
1095 : {
1096 894016 : case EOL_NL:
1097 894016 : Assert(cstate->line_buf.len >= 1);
867 heikki.linnakangas 1098 CBC 894016 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
867 heikki.linnakangas 1099 GIC 894016 : cstate->line_buf.len--;
867 heikki.linnakangas 1100 CBC 894016 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1101 894016 : break;
867 heikki.linnakangas 1102 LBC 0 : case EOL_CR:
1103 0 : Assert(cstate->line_buf.len >= 1);
1104 0 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
867 heikki.linnakangas 1105 UIC 0 : cstate->line_buf.len--;
1106 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
1107 0 : break;
1108 0 : case EOL_CRNL:
1109 0 : Assert(cstate->line_buf.len >= 2);
1110 0 : Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1111 0 : Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1112 0 : cstate->line_buf.len -= 2;
867 heikki.linnakangas 1113 LBC 0 : cstate->line_buf.data[cstate->line_buf.len] = '\0';
867 heikki.linnakangas 1114 UIC 0 : break;
867 heikki.linnakangas 1115 LBC 0 : case EOL_UNKNOWN:
867 heikki.linnakangas 1116 ECB : /* shouldn't get here */
867 heikki.linnakangas 1117 LBC 0 : Assert(false);
867 heikki.linnakangas 1118 ECB : break;
1119 : }
1120 : }
867 heikki.linnakangas 1121 EUB :
1122 : /* Now it's safe to use the buffer in error messages */
738 heikki.linnakangas 1123 GBC 894943 : cstate->line_buf_valid = true;
867 heikki.linnakangas 1124 EUB :
867 heikki.linnakangas 1125 GBC 894943 : return result;
867 heikki.linnakangas 1126 EUB : }
1127 :
1128 : /*
1129 : * CopyReadLineText - inner loop of CopyReadLine for text mode
 *
 * Scans the data in input_buf for the end of the next line, in both text
 * and CSV formats, calling CopyLoadInputBuf() whenever the buffered data
 * runs out.
 *
 * Returns true if the scan stopped because no more input was available or
 * because an end-of-copy marker (backslash-period) was recognized, and
 * false if it stopped at a line terminator.  cstate->eol_type is set when a
 * terminator is found; a terminator that does not match the style already
 * recorded raises an ERROR.
1130 : */
1131 : static bool
867 heikki.linnakangas 1132 GBC 894943 : CopyReadLineText(CopyFromState cstate)
867 heikki.linnakangas 1133 EUB : {
738 1134 : char *copy_input_buf;
1135 : int input_buf_ptr;
867 1136 : int copy_buf_len;
867 heikki.linnakangas 1137 GIC 894943 : bool need_data = false;
1138 894943 : bool hit_eof = false;
1139 894943 : bool result = false;
1140 :
1141 : /* CSV variables */
867 heikki.linnakangas 1142 CBC 894943 : bool first_char_in_line = true;
867 heikki.linnakangas 1143 GIC 894943 : bool in_quote = false,
867 heikki.linnakangas 1144 CBC 894943 : last_was_esc = false;
867 heikki.linnakangas 1145 GIC 894943 : char quotec = '\0';
1146 894943 : char escapec = '\0';
1147 :
1148 894943 : if (cstate->opts.csv_mode)
1149 : {
1150 283 : quotec = cstate->opts.quote[0];
867 heikki.linnakangas 1151 CBC 283 : escapec = cstate->opts.escape[0];
1152 : /* ignore special escape processing if it's the same as quotec */
867 heikki.linnakangas 1153 GIC 283 : if (quotec == escapec)
1154 215 : escapec = '\0';
1155 : }
867 heikki.linnakangas 1156 ECB :
1157 : /*
1158 : * The objective of this loop is to transfer the entire next input line
1159 : * into line_buf. Hence, we only care for detecting newlines (\r and/or
1160 : * \n) and the end-of-copy marker (\.).
1161 : *
1162 : * In CSV mode, \r and \n inside a quoted field are just part of the data
1163 : * value and are put in line_buf. We keep just enough state to know if we
1164 : * are currently in a quoted field or not.
1165 : *
1166 : * These four characters, and the CSV escape and quote characters, are
1167 : * assumed the same in frontend and backend encodings.
1168 : *
738 1169 : * The input has already been converted to the database encoding. All
1170 : * supported server encodings have the property that all bytes in a
1171 : * multi-byte sequence have the high bit set, so a multibyte character
1172 : * cannot contain any newline or escape characters embedded in the
1173 : * multibyte sequence. Therefore, we can process the input byte-by-byte,
1174 : * regardless of the encoding.
1175 : *
1176 : * For speed, we try to move data from input_buf to line_buf in chunks
1177 : * rather than one character at a time. input_buf_ptr points to the next
1178 : * character to examine; any characters from input_buf_index to
1179 : * input_buf_ptr have been determined to be part of the line, but not yet
1180 : * transferred to line_buf.
1181 : *
1182 : * For a little extra speed within the loop, we copy input_buf and
1183 : * input_buf_len into local variables.
1184 : */
738 heikki.linnakangas 1185 GIC 894943 : copy_input_buf = cstate->input_buf;
1186 894943 : input_buf_ptr = cstate->input_buf_index;
1187 894943 : copy_buf_len = cstate->input_buf_len;
1188 :
1189 : for (;;)
867 1190 21722128 : {
1191 : int prev_raw_ptr;
1192 : char c;
1193 :
1194 : /*
1195 : * Load more data if needed.
1196 : *
1197 : * TODO: We could just force four bytes of read-ahead and avoid the
1198 : * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(). That was
1199 : * unsafe with the old v2 COPY protocol, but we don't support that
1200 : * anymore.
1201 : */
738 1202 22617071 : if (input_buf_ptr >= copy_buf_len || need_data)
1203 : {
867 heikki.linnakangas 1204 CBC 215438 : REFILL_LINEBUF;
867 heikki.linnakangas 1205 ECB :
738 heikki.linnakangas 1206 CBC 215438 : CopyLoadInputBuf(cstate);
1207 : /* update our local variables */
738 heikki.linnakangas 1208 GIC 215438 : hit_eof = cstate->input_reached_eof;
738 heikki.linnakangas 1209 CBC 215438 : input_buf_ptr = cstate->input_buf_index;
738 heikki.linnakangas 1210 GIC 215438 : copy_buf_len = cstate->input_buf_len;
1211 :
1212 : /*
1213 : * If we are completely out of data, break out of the loop,
1214 : * reporting EOF.
1215 : */
1216 215438 : if (INPUT_BUF_BYTES(cstate) <= 0)
1217 : {
867 1218 633 : result = true;
1219 633 : break;
1220 : }
/* data was loaded, so clear any pending request for more input */
867 heikki.linnakangas 1221 CBC 214805 : need_data = false;
1222 : }
867 heikki.linnakangas 1223 ECB :
1224 : /* OK to fetch a character */
/*
 * prev_raw_ptr remembers where this character starts; the end-of-copy
 * handling below uses it to keep the marker itself out of line_buf.
 */
738 heikki.linnakangas 1225 CBC 22616438 : prev_raw_ptr = input_buf_ptr;
738 heikki.linnakangas 1226 GIC 22616438 : c = copy_input_buf[input_buf_ptr++];
867 heikki.linnakangas 1227 ECB :
867 heikki.linnakangas 1228 CBC 22616438 : if (cstate->opts.csv_mode)
867 heikki.linnakangas 1229 ECB : {
1230 : /*
1231 : * If character is '\\' or '\r', we may need to look ahead below.
1232 : * Force fetch of the next character if we don't already have it.
1233 : * We need to do this before changing CSV state, in case one of
1234 : * these characters is also the quote or escape character.
1235 : */
867 heikki.linnakangas 1236 GIC 2357 : if (c == '\\' || c == '\r')
867 heikki.linnakangas 1237 ECB : {
867 heikki.linnakangas 1238 CBC 150 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1239 : }
867 heikki.linnakangas 1240 ECB :
1241 : /*
1242 : * Dealing with quotes and escapes here is mildly tricky. If the
1243 : * quote char is also the escape char, there's no problem - we
1244 : * just use the char as a toggle. If they are different, we need
1245 : * to ensure that we only take account of an escape inside a
1246 : * quoted field and immediately preceding a quote char, and not
1247 : * the second in an escape-escape sequence.
1248 : */
867 heikki.linnakangas 1249 GIC 2357 : if (in_quote && c == escapec)
1250 24 : last_was_esc = !last_was_esc;
1251 2357 : if (c == quotec && !last_was_esc)
1252 180 : in_quote = !in_quote;
1253 2357 : if (c != escapec)
1254 2330 : last_was_esc = false;
867 heikki.linnakangas 1255 ECB :
1256 : /*
1257 : * Updating the line count for embedded CR and/or LF chars is
1258 : * necessarily a little fragile - this test is probably about the
1259 : * best we can do. (XXX it's arguable whether we should do this
1260 : * at all --- is cur_lineno a physical or logical count?)
1261 : */
867 heikki.linnakangas 1262 GIC 2357 : if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
1263 18 : cstate->cur_lineno++;
1264 : }
1265 :
1266 : /* Process \r */
1267 22616438 : if (c == '\r' && (!cstate->opts.csv_mode || !in_quote))
867 heikki.linnakangas 1268 ECB : {
1269 : /* Check for \r\n on first line, _and_ handle \r\n. */
867 heikki.linnakangas 1270 LBC 0 : if (cstate->eol_type == EOL_UNKNOWN ||
1271 0 : cstate->eol_type == EOL_CRNL)
867 heikki.linnakangas 1272 ECB : {
1273 : /*
1274 : * If need more data, go back to loop top to load it.
1275 : *
1276 : * Note that if we are at EOF, c will wind up as '\0' because
1277 : * of the guaranteed pad of input_buf.
1278 : */
867 heikki.linnakangas 1279 UIC 0 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1280 :
867 heikki.linnakangas 1281 ECB : /* get next char */
738 heikki.linnakangas 1282 LBC 0 : c = copy_input_buf[input_buf_ptr];
1283 :
867 heikki.linnakangas 1284 UIC 0 : if (c == '\n')
1285 : {
738 heikki.linnakangas 1286 LBC 0 : input_buf_ptr++; /* eat newline */
867 heikki.linnakangas 1287 UIC 0 : cstate->eol_type = EOL_CRNL; /* in case not set yet */
1288 : }
867 heikki.linnakangas 1289 EUB : else
1290 : {
1291 : /* found \r, but no \n */
867 heikki.linnakangas 1292 UIC 0 : if (cstate->eol_type == EOL_CRNL)
1293 0 : ereport(ERROR,
1294 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1295 : !cstate->opts.csv_mode ?
1296 : errmsg("literal carriage return found in data") :
1297 : errmsg("unquoted carriage return found in data"),
867 heikki.linnakangas 1298 EUB : !cstate->opts.csv_mode ?
1299 : errhint("Use \"\\r\" to represent carriage return.") :
1300 : errhint("Use quoted CSV field to represent carriage return.")));
1301 :
1302 : /*
1303 : * if we got here, it is the first line and we didn't find
1304 : * \n, so don't consume the peeked character
1305 : */
867 heikki.linnakangas 1306 UBC 0 : cstate->eol_type = EOL_CR;
1307 : }
1308 : }
867 heikki.linnakangas 1309 UIC 0 : else if (cstate->eol_type == EOL_NL)
1310 0 : ereport(ERROR,
867 heikki.linnakangas 1311 EUB : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1312 : !cstate->opts.csv_mode ?
1313 : errmsg("literal carriage return found in data") :
1314 : errmsg("unquoted carriage return found in data"),
1315 : !cstate->opts.csv_mode ?
1316 : errhint("Use \"\\r\" to represent carriage return.") :
1317 : errhint("Use quoted CSV field to represent carriage return.")));
1318 : /* If reach here, we have found the line terminator */
867 heikki.linnakangas 1319 UIC 0 : break;
1320 : }
1321 :
1322 : /* Process \n */
867 heikki.linnakangas 1323 GIC 22616438 : if (c == '\n' && (!cstate->opts.csv_mode || !in_quote))
1324 : {
867 heikki.linnakangas 1325 GBC 894016 : if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
867 heikki.linnakangas 1326 UIC 0 : ereport(ERROR,
1327 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
867 heikki.linnakangas 1328 EUB : !cstate->opts.csv_mode ?
1329 : errmsg("literal newline found in data") :
1330 : errmsg("unquoted newline found in data"),
1331 : !cstate->opts.csv_mode ?
1332 : errhint("Use \"\\n\" to represent newline.") :
1333 : errhint("Use quoted CSV field to represent newline.")));
867 heikki.linnakangas 1334 GIC 894016 : cstate->eol_type = EOL_NL; /* in case not set yet */
1335 : /* If reach here, we have found the line terminator */
1336 894016 : break;
1337 : }
867 heikki.linnakangas 1338 EUB :
1339 : /*
1340 : * In CSV mode, we only recognize \. alone on a line. This is because
1341 : * \. is a valid CSV data value.
867 heikki.linnakangas 1342 ECB : */
867 heikki.linnakangas 1343 GIC 21722422 : if (c == '\\' && (!cstate->opts.csv_mode || first_char_in_line))
867 heikki.linnakangas 1344 ECB : {
867 heikki.linnakangas 1345 EUB : char c2;
1346 :
867 heikki.linnakangas 1347 GIC 4255 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1348 4255 : IF_NEED_REFILL_AND_EOF_BREAK(0);
1349 :
1350 : /* -----
1351 : * get next character
1352 : * Note: we do not change c so if it isn't \., we can fall
738 heikki.linnakangas 1353 ECB : * through and continue processing.
1354 : * -----
867 1355 : */
738 heikki.linnakangas 1356 GIC 4255 : c2 = copy_input_buf[input_buf_ptr];
1357 :
867 1358 4255 : if (c2 == '.')
1359 : {
738 1360 297 : input_buf_ptr++; /* consume the '.' */
1361 :
867 heikki.linnakangas 1362 ECB : /*
1363 : * Note: if we loop back for more data here, it does not
1364 : * matter that the CSV state change checks are re-executed; we
1365 : * will come back here with no important state changed.
1366 : */
867 heikki.linnakangas 1367 CBC 297 : if (cstate->eol_type == EOL_CRNL)
1368 : {
1369 : /* Get the next character */
867 heikki.linnakangas 1370 UIC 0 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
1371 : /* if hit_eof, c2 will become '\0' */
738 1372 0 : c2 = copy_input_buf[input_buf_ptr++];
1373 :
867 1374 0 : if (c2 == '\n')
867 heikki.linnakangas 1375 ECB : {
867 heikki.linnakangas 1376 UIC 0 : if (!cstate->opts.csv_mode)
867 heikki.linnakangas 1377 LBC 0 : ereport(ERROR,
1378 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
867 heikki.linnakangas 1379 ECB : errmsg("end-of-copy marker does not match previous newline style")));
1380 : else
867 heikki.linnakangas 1381 UIC 0 : NO_END_OF_COPY_GOTO;
1382 : }
1383 0 : else if (c2 != '\r')
1384 : {
1385 0 : if (!cstate->opts.csv_mode)
867 heikki.linnakangas 1386 LBC 0 : ereport(ERROR,
1387 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1388 : errmsg("end-of-copy marker corrupt")));
867 heikki.linnakangas 1389 EUB : else
867 heikki.linnakangas 1390 UIC 0 : NO_END_OF_COPY_GOTO;
867 heikki.linnakangas 1391 EUB : }
1392 : }
1393 :
1394 : /* Get the next character */
867 heikki.linnakangas 1395 GBC 297 : IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
867 heikki.linnakangas 1396 EUB : /* if hit_eof, c2 will become '\0' */
738 heikki.linnakangas 1397 GIC 297 : c2 = copy_input_buf[input_buf_ptr++];
1398 :
867 1399 297 : if (c2 != '\r' && c2 != '\n')
867 heikki.linnakangas 1400 EUB : {
867 heikki.linnakangas 1401 GIC 3 : if (!cstate->opts.csv_mode)
867 heikki.linnakangas 1402 UBC 0 : ereport(ERROR,
1403 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
867 heikki.linnakangas 1404 EUB : errmsg("end-of-copy marker corrupt")));
1405 : else
867 heikki.linnakangas 1406 GIC 3 : NO_END_OF_COPY_GOTO;
1407 : }
1408 :
/*
 * The character that ends the marker line must agree with the
 * newline style already recorded in eol_type.
 */
867 heikki.linnakangas 1409 GBC 294 : if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
867 heikki.linnakangas 1410 GIC 294 : (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
1411 294 : (cstate->eol_type == EOL_CR && c2 != '\r'))
1412 : {
867 heikki.linnakangas 1413 UIC 0 : ereport(ERROR,
867 heikki.linnakangas 1414 ECB : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1415 : errmsg("end-of-copy marker does not match previous newline style")));
1416 : }
1417 :
1418 : /*
1419 : * Transfer only the data before the \. into line_buf, then
1420 : * discard the data and the \. sequence.
867 heikki.linnakangas 1421 EUB : */
738 heikki.linnakangas 1422 GIC 294 : if (prev_raw_ptr > cstate->input_buf_index)
867 heikki.linnakangas 1423 UIC 0 : appendBinaryStringInfo(&cstate->line_buf,
738 1424 0 : cstate->input_buf + cstate->input_buf_index,
738 heikki.linnakangas 1425 LBC 0 : prev_raw_ptr - cstate->input_buf_index);
738 heikki.linnakangas 1426 GIC 294 : cstate->input_buf_index = input_buf_ptr;
867 1427 294 : result = true; /* report EOF */
867 heikki.linnakangas 1428 CBC 294 : break;
867 heikki.linnakangas 1429 ECB : }
867 heikki.linnakangas 1430 CBC 3958 : else if (!cstate->opts.csv_mode)
1431 : {
867 heikki.linnakangas 1432 EUB : /*
1433 : * If we are here, it means we found a backslash followed by
1434 : * something other than a period. In non-CSV mode, anything
1435 : * after a backslash is special, so we skip over that second
1436 : * character too. If we didn't do that \\. would be
1437 : * considered an eof-of copy, while in non-CSV mode it is a
1438 : * literal backslash followed by a period. In CSV mode,
1439 : * backslashes are not special, so we want to process the
1440 : * character after the backslash just like a normal character,
867 heikki.linnakangas 1441 ECB : * so we don't increment in those cases.
867 heikki.linnakangas 1442 EUB : */
738 heikki.linnakangas 1443 GBC 3955 : input_buf_ptr++;
793 heikki.linnakangas 1444 EUB : }
867 heikki.linnakangas 1445 ECB : }
1446 :
1447 : /*
1448 : * This label is for CSV cases where \. appears at the start of a
1449 : * line, but there is more text after it, meaning it was a data value.
1450 : * We are more strict for \. in CSV mode because \. could be a data
1451 : * value, while in non-CSV mode, \. cannot be a data value.
1452 : */
867 heikki.linnakangas 1453 GIC 21718170 : not_end_of_copy:
1454 21722128 : first_char_in_line = false;
1455 : } /* end of outer loop */
1456 :
1457 : /*
1458 : * Transfer any still-uncopied data to line_buf.
1459 : */
1460 894943 : REFILL_LINEBUF;
1461 :
867 heikki.linnakangas 1462 CBC 894943 : return result;
1463 : }
1464 :
/*
 * Convert one hexadecimal digit character to its numeric value.
 *
 * '0'..'9' map to 0..9; letters are folded to lower case and mapped
 * relative to 'a', so 'a'/'A' yield 10 and 'f'/'F' yield 15.  The argument
 * is not validated here.
 */
static int
GetDecimalFromHex(char hex)
{
	unsigned char digit = (unsigned char) hex;

	return isdigit(digit) ? (hex - '0') : (tolower(digit) - 'a' + 10);
}
1476 :
1477 : /*
1478 : * Parse the current line into separate attributes (fields),
867 heikki.linnakangas 1479 ECB : * performing de-escaping as needed.
1480 : *
1481 : * The input is in line_buf. We use attribute_buf to hold the result
1482 : * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1483 : * string, or NULL when the input matches the null marker string.
1484 : * This array is expanded as necessary.
1485 : *
1486 : * (Note that the caller cannot check for nulls since the returned
1487 : * string would be the post-de-escaping equivalent, which may look
867 heikki.linnakangas 1488 EUB : * the same as some valid data string.)
1489 : *
1490 : * delim is the column delimiter string (must be just one byte for now).
1491 : * null_print is the null marker string. Note that this is compared to
1492 : * the pre-de-escaped input string.
1493 : *
1494 : * The return value is the number of fields actually read.
1495 : */
1496 : static int
867 heikki.linnakangas 1497 GIC 893806 : CopyReadAttributesText(CopyFromState cstate)
1498 : {
1499 893806 : char delimc = cstate->opts.delim[0];
1500 : int fieldno;
1501 : char *output_ptr;
1502 : char *cur_ptr;
1503 : char *line_end_ptr;
1504 :
1505 : /*
1506 : * We need a special case for zero-column tables: check that the input
1507 : * line is empty, and return.
1508 : */
1509 893806 : if (cstate->max_fields <= 0)
1510 : {
1511 3 : if (cstate->line_buf.len != 0)
867 heikki.linnakangas 1512 UIC 0 : ereport(ERROR,
1513 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1514 : errmsg("extra data after last expected column")));
867 heikki.linnakangas 1515 GIC 3 : return 0;
867 heikki.linnakangas 1516 ECB : }
1517 :
867 heikki.linnakangas 1518 CBC 893803 : resetStringInfo(&cstate->attribute_buf);
1519 :
1520 : /*
1521 : * The de-escaped attributes will certainly not be longer than the input
1522 : * data line, so we can just force attribute_buf to be large enough and
1523 : * then transfer data without any checks for enough space. We need to do
1524 : * it this way because enlarging attribute_buf mid-stream would invalidate
1525 : * pointers already stored into cstate->raw_fields[].
1526 : */
867 heikki.linnakangas 1527 GIC 893803 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
867 heikki.linnakangas 1528 CBC 4 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
867 heikki.linnakangas 1529 GIC 893803 : output_ptr = cstate->attribute_buf.data;
867 heikki.linnakangas 1530 ECB :
867 heikki.linnakangas 1531 EUB : /* set pointer variables for loop */
867 heikki.linnakangas 1532 GIC 893803 : cur_ptr = cstate->line_buf.data;
1533 893803 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
867 heikki.linnakangas 1534 ECB :
1535 : /* Outer loop iterates over fields */
867 heikki.linnakangas 1536 GIC 893803 : fieldno = 0;
867 heikki.linnakangas 1537 ECB : for (;;)
867 heikki.linnakangas 1538 GIC 2558447 : {
1539 3452250 : bool found_delim = false;
1540 : char *start_ptr;
1541 : char *end_ptr;
1542 : int input_len;
1543 3452250 : bool saw_non_ascii = false;
1544 :
1545 : /* Make sure there is enough space for the next value */
867 heikki.linnakangas 1546 CBC 3452250 : if (fieldno >= cstate->max_fields)
867 heikki.linnakangas 1547 ECB : {
867 heikki.linnakangas 1548 CBC 15 : cstate->max_fields *= 2;
867 heikki.linnakangas 1549 GIC 15 : cstate->raw_fields =
1550 15 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
867 heikki.linnakangas 1551 ECB : }
1552 :
1553 : /* Remember start of field on both input and output sides */
867 heikki.linnakangas 1554 GIC 3452250 : start_ptr = cur_ptr;
867 heikki.linnakangas 1555 CBC 3452250 : cstate->raw_fields[fieldno] = output_ptr;
1556 :
867 heikki.linnakangas 1557 ECB : /*
1558 : * Scan data for field.
1559 : *
1560 : * Note that in this loop, we are scanning to locate the end of field
1561 : * and also speculatively performing de-escaping. Once we find the
1562 : * end-of-field, we can match the raw field contents against the null
1563 : * marker string. Only after that comparison fails do we know that
1564 : * de-escaping is actually the right thing to do; therefore we *must
1565 : * not* throw any syntax errors before we've done the null-marker
1566 : * check.
1567 : */
1568 : for (;;)
867 heikki.linnakangas 1569 CBC 19161333 : {
1570 : char c;
1571 :
867 heikki.linnakangas 1572 GIC 22613583 : end_ptr = cur_ptr;
867 heikki.linnakangas 1573 CBC 22613583 : if (cur_ptr >= line_end_ptr)
1574 893800 : break;
867 heikki.linnakangas 1575 GIC 21719783 : c = *cur_ptr++;
1576 21719783 : if (c == delimc)
1577 : {
1578 2558450 : found_delim = true;
1579 2558450 : break;
1580 : }
1581 19161333 : if (c == '\\')
1582 : {
1583 3955 : if (cur_ptr >= line_end_ptr)
867 heikki.linnakangas 1584 UIC 0 : break;
867 heikki.linnakangas 1585 GIC 3955 : c = *cur_ptr++;
1586 3955 : switch (c)
1587 : {
867 heikki.linnakangas 1588 CBC 6 : case '0':
1589 : case '1':
1590 : case '2':
867 heikki.linnakangas 1591 ECB : case '3':
1592 : case '4':
1593 : case '5':
1594 : case '6':
1595 : case '7':
1596 : {
1597 : /* handle \013 */
1598 : int val;
1599 :
867 heikki.linnakangas 1600 CBC 6 : val = OCTVALUE(c);
867 heikki.linnakangas 1601 GIC 6 : if (cur_ptr < line_end_ptr)
867 heikki.linnakangas 1602 ECB : {
867 heikki.linnakangas 1603 GBC 3 : c = *cur_ptr;
867 heikki.linnakangas 1604 CBC 3 : if (ISOCTAL(c))
867 heikki.linnakangas 1605 ECB : {
867 heikki.linnakangas 1606 UIC 0 : cur_ptr++;
867 heikki.linnakangas 1607 LBC 0 : val = (val << 3) + OCTVALUE(c);
867 heikki.linnakangas 1608 UIC 0 : if (cur_ptr < line_end_ptr)
1609 : {
1610 0 : c = *cur_ptr;
1611 0 : if (ISOCTAL(c))
1612 : {
1613 0 : cur_ptr++;
1614 0 : val = (val << 3) + OCTVALUE(c);
1615 : }
1616 : }
1617 : }
1618 : }
867 heikki.linnakangas 1619 CBC 6 : c = val & 0377;
1620 6 : if (c == '\0' || IS_HIGHBIT_SET(c))
867 heikki.linnakangas 1621 GIC 6 : saw_non_ascii = true;
867 heikki.linnakangas 1622 ECB : }
867 heikki.linnakangas 1623 CBC 6 : break;
867 heikki.linnakangas 1624 GIC 6 : case 'x':
867 heikki.linnakangas 1625 EUB : /* Handle \x3F */
867 heikki.linnakangas 1626 GBC 6 : if (cur_ptr < line_end_ptr)
867 heikki.linnakangas 1627 EUB : {
867 heikki.linnakangas 1628 GIC 3 : char hexchar = *cur_ptr;
867 heikki.linnakangas 1629 EUB :
867 heikki.linnakangas 1630 GBC 3 : if (isxdigit((unsigned char) hexchar))
1631 : {
867 heikki.linnakangas 1632 UBC 0 : int val = GetDecimalFromHex(hexchar);
867 heikki.linnakangas 1633 EUB :
867 heikki.linnakangas 1634 UIC 0 : cur_ptr++;
1635 0 : if (cur_ptr < line_end_ptr)
1636 : {
1637 0 : hexchar = *cur_ptr;
867 heikki.linnakangas 1638 LBC 0 : if (isxdigit((unsigned char) hexchar))
867 heikki.linnakangas 1639 ECB : {
867 heikki.linnakangas 1640 LBC 0 : cur_ptr++;
867 heikki.linnakangas 1641 UIC 0 : val = (val << 4) + GetDecimalFromHex(hexchar);
867 heikki.linnakangas 1642 ECB : }
1643 : }
867 heikki.linnakangas 1644 UIC 0 : c = val & 0xff;
867 heikki.linnakangas 1645 LBC 0 : if (c == '\0' || IS_HIGHBIT_SET(c))
867 heikki.linnakangas 1646 UIC 0 : saw_non_ascii = true;
867 heikki.linnakangas 1647 ECB : }
1648 : }
867 heikki.linnakangas 1649 CBC 6 : break;
867 heikki.linnakangas 1650 UIC 0 : case 'b':
867 heikki.linnakangas 1651 UBC 0 : c = '\b';
867 heikki.linnakangas 1652 UIC 0 : break;
867 heikki.linnakangas 1653 UBC 0 : case 'f':
1654 0 : c = '\f';
867 heikki.linnakangas 1655 UIC 0 : break;
867 heikki.linnakangas 1656 GBC 1525 : case 'n':
1657 1525 : c = '\n';
867 heikki.linnakangas 1658 GIC 1525 : break;
867 heikki.linnakangas 1659 UBC 0 : case 'r':
1660 0 : c = '\r';
867 heikki.linnakangas 1661 UIC 0 : break;
1662 0 : case 't':
867 heikki.linnakangas 1663 UBC 0 : c = '\t';
1664 0 : break;
1665 0 : case 'v':
867 heikki.linnakangas 1666 UIC 0 : c = '\v';
1667 0 : break;
867 heikki.linnakangas 1668 ECB :
867 heikki.linnakangas 1669 EUB : /*
1670 : * in all other cases, take the char after '\'
1671 : * literally
1672 : */
1673 : }
1674 : }
867 heikki.linnakangas 1675 ECB :
1676 : /* Add c to output string */
867 heikki.linnakangas 1677 CBC 19161333 : *output_ptr++ = c;
867 heikki.linnakangas 1678 EUB : }
1679 :
1680 : /* Check whether raw input matched null marker */
867 heikki.linnakangas 1681 GBC 3452250 : input_len = end_ptr - start_ptr;
1682 3452250 : if (input_len == cstate->opts.null_print_len &&
1683 246279 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1684 2376 : cstate->raw_fields[fieldno] = NULL;
1685 : /* Check whether raw input matched default marker */
25 andrew 1686 GNC 3449874 : else if (fieldno < list_length(cstate->attnumlist) &&
1687 3449856 : cstate->opts.default_print &&
27 1688 57 : input_len == cstate->opts.default_print_len &&
1689 15 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1690 12 : {
1691 : /* fieldno is 0-indexed and attnum is 1-indexed */
1692 15 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1693 :
1694 15 : if (cstate->defexprs[m] != NULL)
1695 : {
1696 : /* defaults contain entries for all physical attributes */
1697 12 : cstate->defaults[m] = true;
1698 : }
1699 : else
1700 : {
1701 3 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1702 3 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1703 :
1704 3 : ereport(ERROR,
1705 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1706 : errmsg("unexpected DEFAULT in COPY data"),
1707 : errdetail("Column \"%s\" has no DEFAULT value.",
1708 : NameStr(att->attname))));
1709 : }
1710 : }
867 heikki.linnakangas 1711 EUB : else
1712 : {
1713 : /*
1714 : * At this point we know the field is supposed to contain data.
1715 : *
1716 : * If we de-escaped any non-7-bit-ASCII chars, make sure the
1717 : * resulting string is valid data for the db encoding.
1718 : */
867 heikki.linnakangas 1719 GIC 3449859 : if (saw_non_ascii)
1720 : {
867 heikki.linnakangas 1721 UIC 0 : char *fld = cstate->raw_fields[fieldno];
867 heikki.linnakangas 1722 ECB :
867 heikki.linnakangas 1723 UIC 0 : pg_verifymbstr(fld, output_ptr - fld, false);
1724 : }
1725 : }
867 heikki.linnakangas 1726 ECB :
1727 : /* Terminate attribute value in output area */
867 heikki.linnakangas 1728 CBC 3452247 : *output_ptr++ = '\0';
867 heikki.linnakangas 1729 ECB :
867 heikki.linnakangas 1730 GIC 3452247 : fieldno++;
867 heikki.linnakangas 1731 ECB : /* Done if we hit EOL instead of a delim */
867 heikki.linnakangas 1732 CBC 3452247 : if (!found_delim)
1733 893800 : break;
867 heikki.linnakangas 1734 ECB : }
1735 :
1736 : /* Clean up state of attribute_buf */
867 heikki.linnakangas 1737 CBC 893800 : output_ptr--;
867 heikki.linnakangas 1738 GIC 893800 : Assert(*output_ptr == '\0');
867 heikki.linnakangas 1739 CBC 893800 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1740 :
867 heikki.linnakangas 1741 GIC 893800 : return fieldno;
867 heikki.linnakangas 1742 ECB : }
1743 :
1744 : /*
1745 : * Parse the current line into separate attributes (fields),
1746 : * performing de-escaping as needed. This has exactly the same API as
1747 : * CopyReadAttributesText, except we parse the fields according to
1748 : * "standard" (i.e. common) CSV usage.
1749 : */
1750 : static int
867 heikki.linnakangas 1751 GIC 193 : CopyReadAttributesCSV(CopyFromState cstate)
1752 : {
1753 193 : char delimc = cstate->opts.delim[0];
1754 193 : char quotec = cstate->opts.quote[0];
1755 193 : char escapec = cstate->opts.escape[0];
1756 : int fieldno;
1757 : char *output_ptr;
1758 : char *cur_ptr;
1759 : char *line_end_ptr;
1760 :
1761 : /*
1762 : * We need a special case for zero-column tables: check that the input
1763 : * line is empty, and return.
867 heikki.linnakangas 1764 ECB : */
867 heikki.linnakangas 1765 GIC 193 : if (cstate->max_fields <= 0)
867 heikki.linnakangas 1766 EUB : {
867 heikki.linnakangas 1767 UIC 0 : if (cstate->line_buf.len != 0)
867 heikki.linnakangas 1768 UBC 0 : ereport(ERROR,
1769 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1770 : errmsg("extra data after last expected column")));
867 heikki.linnakangas 1771 UIC 0 : return 0;
1772 : }
867 heikki.linnakangas 1773 ECB :
867 heikki.linnakangas 1774 GIC 193 : resetStringInfo(&cstate->attribute_buf);
867 heikki.linnakangas 1775 ECB :
1776 : /*
1777 : * The de-escaped attributes will certainly not be longer than the input
1778 : * data line, so we can just force attribute_buf to be large enough and
1779 : * then transfer data without any checks for enough space. We need to do
1780 : * it this way because enlarging attribute_buf mid-stream would invalidate
1781 : * pointers already stored into cstate->raw_fields[].
1782 : */
867 heikki.linnakangas 1783 CBC 193 : if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
867 heikki.linnakangas 1784 LBC 0 : enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
867 heikki.linnakangas 1785 GIC 193 : output_ptr = cstate->attribute_buf.data;
867 heikki.linnakangas 1786 ECB :
1787 : /* set pointer variables for loop */
867 heikki.linnakangas 1788 GIC 193 : cur_ptr = cstate->line_buf.data;
1789 193 : line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1790 :
1791 : /* Outer loop iterates over fields */
1792 193 : fieldno = 0;
1793 : for (;;)
1794 227 : {
1795 420 : bool found_delim = false;
867 heikki.linnakangas 1796 CBC 420 : bool saw_quote = false;
1797 : char *start_ptr;
867 heikki.linnakangas 1798 ECB : char *end_ptr;
1799 : int input_len;
1800 :
1801 : /* Make sure there is enough space for the next value */
867 heikki.linnakangas 1802 GIC 420 : if (fieldno >= cstate->max_fields)
1803 : {
867 heikki.linnakangas 1804 UIC 0 : cstate->max_fields *= 2;
1805 0 : cstate->raw_fields =
1806 0 : repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1807 : }
1808 :
1809 : /* Remember start of field on both input and output sides */
867 heikki.linnakangas 1810 CBC 420 : start_ptr = cur_ptr;
867 heikki.linnakangas 1811 GIC 420 : cstate->raw_fields[fieldno] = output_ptr;
867 heikki.linnakangas 1812 EUB :
1813 : /*
1814 : * Scan data for field,
1815 : *
1816 : * The loop starts in "not quote" mode and then toggles between that
1817 : * and "in quote" mode. The loop exits normally if it is in "not
1818 : * quote" mode and a delimiter or line end is seen.
867 heikki.linnakangas 1819 ECB : */
1820 : for (;;)
867 heikki.linnakangas 1821 GIC 79 : {
1822 : char c;
1823 :
1824 : /* Not in quote */
1825 : for (;;)
1826 : {
1827 1385 : end_ptr = cur_ptr;
867 heikki.linnakangas 1828 CBC 1385 : if (cur_ptr >= line_end_ptr)
867 heikki.linnakangas 1829 GBC 190 : goto endfield;
867 heikki.linnakangas 1830 CBC 1195 : c = *cur_ptr++;
1831 : /* unquoted field delimiter */
867 heikki.linnakangas 1832 GIC 1195 : if (c == delimc)
867 heikki.linnakangas 1833 ECB : {
867 heikki.linnakangas 1834 CBC 230 : found_delim = true;
867 heikki.linnakangas 1835 GIC 230 : goto endfield;
1836 : }
867 heikki.linnakangas 1837 ECB : /* start of quoted field (or part of field) */
867 heikki.linnakangas 1838 GIC 965 : if (c == quotec)
867 heikki.linnakangas 1839 ECB : {
867 heikki.linnakangas 1840 CBC 79 : saw_quote = true;
1841 79 : break;
1842 : }
1843 : /* Add c to output string */
867 heikki.linnakangas 1844 GIC 886 : *output_ptr++ = c;
1845 : }
1846 :
867 heikki.linnakangas 1847 ECB : /* In quote */
1848 : for (;;)
867 heikki.linnakangas 1849 EUB : {
867 heikki.linnakangas 1850 GBC 570 : end_ptr = cur_ptr;
1851 570 : if (cur_ptr >= line_end_ptr)
867 heikki.linnakangas 1852 UIC 0 : ereport(ERROR,
1853 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1854 : errmsg("unterminated CSV quoted field")));
867 heikki.linnakangas 1855 ECB :
867 heikki.linnakangas 1856 CBC 570 : c = *cur_ptr++;
1857 :
1858 : /* escape within a quoted field */
867 heikki.linnakangas 1859 GIC 570 : if (c == escapec)
1860 : {
1861 : /*
1862 : * peek at the next char if available, and escape it if it
1863 : * is an escape char or a quote char
1864 : */
1865 47 : if (cur_ptr < line_end_ptr)
867 heikki.linnakangas 1866 ECB : {
867 heikki.linnakangas 1867 GIC 33 : char nextc = *cur_ptr;
1868 :
1869 33 : if (nextc == escapec || nextc == quotec)
1870 : {
1871 12 : *output_ptr++ = nextc;
867 heikki.linnakangas 1872 CBC 12 : cur_ptr++;
1873 12 : continue;
867 heikki.linnakangas 1874 ECB : }
1875 : }
1876 : }
1877 :
1878 : /*
1879 : * end of quoted field. Must do this test after testing for
1880 : * escape in case quote char and escape char are the same
1881 : * (which is the common case).
1882 : */
867 heikki.linnakangas 1883 CBC 558 : if (c == quotec)
867 heikki.linnakangas 1884 GIC 79 : break;
867 heikki.linnakangas 1885 ECB :
1886 : /* Add c to output string */
867 heikki.linnakangas 1887 GIC 479 : *output_ptr++ = c;
1888 : }
867 heikki.linnakangas 1889 ECB : }
867 heikki.linnakangas 1890 GIC 420 : endfield:
1891 :
1892 : /* Terminate attribute value in output area */
1893 420 : *output_ptr++ = '\0';
1894 :
867 heikki.linnakangas 1895 ECB : /* Check whether raw input matched null marker */
867 heikki.linnakangas 1896 CBC 420 : input_len = end_ptr - start_ptr;
867 heikki.linnakangas 1897 GBC 420 : if (!saw_quote && input_len == cstate->opts.null_print_len &&
867 heikki.linnakangas 1898 GIC 16 : strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1899 16 : cstate->raw_fields[fieldno] = NULL;
1900 : /* Check whether raw input matched default marker */
25 andrew 1901 GNC 404 : else if (fieldno < list_length(cstate->attnumlist) &&
1902 404 : cstate->opts.default_print &&
27 1903 75 : input_len == cstate->opts.default_print_len &&
1904 21 : strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1905 : {
1906 : /* fieldno is 0-index and attnum is 1-index */
1907 21 : int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1908 :
1909 21 : if (cstate->defexprs[m] != NULL)
1910 : {
1911 : /* defaults contain entries for all physical attributes */
1912 18 : cstate->defaults[m] = true;
1913 : }
1914 : else
1915 : {
1916 3 : TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1917 3 : Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1918 :
1919 3 : ereport(ERROR,
1920 : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1921 : errmsg("unexpected DEFAULT in COPY data"),
1922 : errdetail("Column \"%s\" has no DEFAULT value.",
1923 : NameStr(att->attname))));
1924 : }
1925 : }
1926 :
867 heikki.linnakangas 1927 CBC 417 : fieldno++;
1928 : /* Done if we hit EOL instead of a delim */
867 heikki.linnakangas 1929 GIC 417 : if (!found_delim)
867 heikki.linnakangas 1930 CBC 190 : break;
1931 : }
1932 :
1933 : /* Clean up state of attribute_buf */
867 heikki.linnakangas 1934 GIC 190 : output_ptr--;
1935 190 : Assert(*output_ptr == '\0');
867 heikki.linnakangas 1936 CBC 190 : cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1937 :
1938 190 : return fieldno;
1939 : }
867 heikki.linnakangas 1940 ECB :
1941 :
1942 : /*
1943 : * Read a binary attribute
1944 : */
1945 : static Datum
867 heikki.linnakangas 1946 GIC 79 : CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
1947 : Oid typioparam, int32 typmod,
1948 : bool *isnull)
1949 : {
1950 : int32 fld_size;
1951 : Datum result;
1952 :
1953 79 : if (!CopyGetInt32(cstate, &fld_size))
867 heikki.linnakangas 1954 LBC 0 : ereport(ERROR,
867 heikki.linnakangas 1955 ECB : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1956 : errmsg("unexpected EOF in COPY data")));
867 heikki.linnakangas 1957 GIC 79 : if (fld_size == -1)
867 heikki.linnakangas 1958 ECB : {
867 heikki.linnakangas 1959 GIC 15 : *isnull = true;
1960 15 : return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
867 heikki.linnakangas 1961 ECB : }
867 heikki.linnakangas 1962 GIC 64 : if (fld_size < 0)
867 heikki.linnakangas 1963 UIC 0 : ereport(ERROR,
867 heikki.linnakangas 1964 ECB : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1965 : errmsg("invalid field size")));
1966 :
1967 : /* reset attribute_buf to empty, and load raw data in it */
867 heikki.linnakangas 1968 CBC 64 : resetStringInfo(&cstate->attribute_buf);
867 heikki.linnakangas 1969 ECB :
867 heikki.linnakangas 1970 CBC 64 : enlargeStringInfo(&cstate->attribute_buf, fld_size);
867 heikki.linnakangas 1971 GIC 64 : if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
867 heikki.linnakangas 1972 CBC 64 : fld_size) != fld_size)
867 heikki.linnakangas 1973 LBC 0 : ereport(ERROR,
867 heikki.linnakangas 1974 ECB : (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
1975 : errmsg("unexpected EOF in COPY data")));
1976 :
867 heikki.linnakangas 1977 GIC 64 : cstate->attribute_buf.len = fld_size;
867 heikki.linnakangas 1978 CBC 64 : cstate->attribute_buf.data[fld_size] = '\0';
1979 :
867 heikki.linnakangas 1980 ECB : /* Call the column type's binary input converter */
867 heikki.linnakangas 1981 GIC 64 : result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
1982 : typioparam, typmod);
867 heikki.linnakangas 1983 ECB :
1984 : /* Trouble if it didn't eat the whole buffer */
867 heikki.linnakangas 1985 GIC 64 : if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
1986 1 : ereport(ERROR,
867 heikki.linnakangas 1987 ECB : (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION),
1988 : errmsg("incorrect binary data format")));
1989 :
867 heikki.linnakangas 1990 CBC 63 : *isnull = false;
867 heikki.linnakangas 1991 GIC 63 : return result;
1992 : }
|