LCOV - differential code coverage report
Current view: top level - src/backend/utils/adt - varlena.c (source / functions) Coverage Total Hit UNC LBC UIC UBC GBC GIC GNC CBC EUB ECB DUB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 89.8 % 2196 1971 18 58 144 5 62 1240 112 557 152 1232 6 124
Current Date: 2023-04-08 15:15:32 Functions: 92.0 % 163 150 13 144 6 13 140 10
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * varlena.c
       4                 :  *    Functions for the variable-length built-in types.
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/utils/adt/varlena.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : #include "postgres.h"
      16                 : 
      17                 : #include <ctype.h>
      18                 : #include <limits.h>
      19                 : 
      20                 : #include "access/detoast.h"
      21                 : #include "access/toast_compression.h"
      22                 : #include "catalog/pg_collation.h"
      23                 : #include "catalog/pg_type.h"
      24                 : #include "common/hashfn.h"
      25                 : #include "common/int.h"
      26                 : #include "common/unicode_norm.h"
      27                 : #include "funcapi.h"
      28                 : #include "lib/hyperloglog.h"
      29                 : #include "libpq/pqformat.h"
      30                 : #include "miscadmin.h"
      31                 : #include "nodes/execnodes.h"
      32                 : #include "parser/scansup.h"
      33                 : #include "port/pg_bswap.h"
      34                 : #include "regex/regex.h"
      35                 : #include "utils/builtins.h"
      36                 : #include "utils/bytea.h"
      37                 : #include "utils/guc.h"
      38                 : #include "utils/lsyscache.h"
      39                 : #include "utils/memutils.h"
      40                 : #include "utils/pg_locale.h"
      41                 : #include "utils/sortsupport.h"
      42                 : #include "utils/varlena.h"
      43                 : 
      44                 : 
      45                 : /* GUC variable */
      46                 : int         bytea_output = BYTEA_OUTPUT_HEX;
      47                 : 
      48                 : typedef struct varlena VarString;
      49                 : 
      50                 : /*
      51                 :  * State for text_position_* functions.
      52                 :  */
      53                 : typedef struct
      54                 : {
      55                 :     bool        is_multibyte_char_in_char;  /* need to check char boundaries? */
      56                 : 
      57                 :     char       *str1;           /* haystack string */
      58                 :     char       *str2;           /* needle string */
      59                 :     int         len1;           /* string lengths in bytes */
      60                 :     int         len2;
      61                 : 
      62                 :     /* Skip table for Boyer-Moore-Horspool search algorithm: */
      63                 :     int         skiptablemask;  /* mask for ANDing with skiptable subscripts */
      64                 :     int         skiptable[256]; /* skip distance for given mismatched char */
      65                 : 
      66                 :     char       *last_match;     /* pointer to last match in 'str1' */
      67                 : 
      68                 :     /*
      69                 :      * Sometimes we need to convert the byte position of a match to a
      70                 :      * character position.  These store the last position that was converted,
      71                 :      * so that on the next call, we can continue from that point, rather than
      72                 :      * count characters from the very beginning.
      73                 :      */
      74                 :     char       *refpoint;       /* pointer within original haystack string */
      75                 :     int         refpos;         /* 0-based character offset of the same point */
      76                 : } TextPositionState;
      77                 : 
      78                 : typedef struct
      79                 : {
      80                 :     char       *buf1;           /* 1st string, or abbreviation original string
      81                 :                                  * buf */
      82                 :     char       *buf2;           /* 2nd string, or abbreviation strxfrm() buf */
      83                 :     int         buflen1;        /* Allocated length of buf1 */
      84                 :     int         buflen2;        /* Allocated length of buf2 */
      85                 :     int         last_len1;      /* Length of last buf1 string/strxfrm() input */
      86                 :     int         last_len2;      /* Length of last buf2 string/strxfrm() blob */
      87                 :     int         last_returned;  /* Last comparison result (cache) */
      88                 :     bool        cache_blob;     /* Does buf2 contain strxfrm() blob, etc? */
      89                 :     bool        collate_c;
      90                 :     Oid         typid;          /* Actual datatype (text/bpchar/bytea/name) */
      91                 :     hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
      92                 :     hyperLogLogState full_card; /* Full key cardinality state */
      93                 :     double      prop_card;      /* Required cardinality proportion */
      94                 :     pg_locale_t locale;
      95                 : } VarStringSortSupport;
      96                 : 
      97                 : /*
      98                 :  * Output data for split_text(): we output either to an array or a table.
      99                 :  * tupstore and tupdesc must be set up in advance to output to a table.
     100                 :  */
     101                 : typedef struct
     102                 : {
     103                 :     ArrayBuildState *astate;
     104                 :     Tuplestorestate *tupstore;
     105                 :     TupleDesc   tupdesc;
     106                 : } SplitTextOutputData;
     107                 : 
     108                 : /*
     109                 :  * This should be large enough that most strings will fit, but small enough
     110                 :  * that we feel comfortable putting it on the stack
     111                 :  */
     112                 : #define TEXTBUFLEN      1024
     113                 : 
     114                 : #define DatumGetVarStringP(X)       ((VarString *) PG_DETOAST_DATUM(X))
     115                 : #define DatumGetVarStringPP(X)      ((VarString *) PG_DETOAST_DATUM_PACKED(X))
     116                 : 
     117                 : static int  varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
     118                 : static int  bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
     119                 : static int  namefastcmp_c(Datum x, Datum y, SortSupport ssup);
     120                 : static int  varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
     121                 : static int  namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
     122                 : static int  varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
     123                 : static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
     124                 : static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
     125                 : static int32 text_length(Datum str);
     126                 : static text *text_catenate(text *t1, text *t2);
     127                 : static text *text_substring(Datum str,
     128                 :                             int32 start,
     129                 :                             int32 length,
     130                 :                             bool length_not_specified);
     131                 : static text *text_overlay(text *t1, text *t2, int sp, int sl);
     132                 : static int  text_position(text *t1, text *t2, Oid collid);
     133                 : static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
     134                 : static bool text_position_next(TextPositionState *state);
     135                 : static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
     136                 : static char *text_position_get_match_ptr(TextPositionState *state);
     137                 : static int  text_position_get_match_pos(TextPositionState *state);
     138                 : static void text_position_cleanup(TextPositionState *state);
     139                 : static void check_collation_set(Oid collid);
     140                 : static int  text_cmp(text *arg1, text *arg2, Oid collid);
     141                 : static bytea *bytea_catenate(bytea *t1, bytea *t2);
     142                 : static bytea *bytea_substring(Datum str,
     143                 :                               int S,
     144                 :                               int L,
     145                 :                               bool length_not_specified);
     146                 : static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
     147                 : static void appendStringInfoText(StringInfo str, const text *t);
     148                 : static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
     149                 : static void split_text_accum_result(SplitTextOutputData *tstate,
     150                 :                                     text *field_value,
     151                 :                                     text *null_string,
     152                 :                                     Oid collation);
     153                 : static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
     154                 :                                     const char *fldsep, const char *null_string);
     155                 : static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
     156                 : static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
     157                 :                                      int *value);
     158                 : static const char *text_format_parse_format(const char *start_ptr,
     159                 :                                             const char *end_ptr,
     160                 :                                             int *argpos, int *widthpos,
     161                 :                                             int *flags, int *width);
     162                 : static void text_format_string_conversion(StringInfo buf, char conversion,
     163                 :                                           FmgrInfo *typOutputInfo,
     164                 :                                           Datum value, bool isNull,
     165                 :                                           int flags, int width);
     166                 : static void text_format_append_string(StringInfo buf, const char *str,
     167                 :                                       int flags, int width);
     168                 : 
     169                 : 
     170                 : /*****************************************************************************
     171                 :  *   CONVERSION ROUTINES EXPORTED FOR USE BY C CODE                          *
     172                 :  *****************************************************************************/
     173                 : 
     174                 : /*
     175                 :  * cstring_to_text
     176 ECB             :  *
     177                 :  * Create a text value from a null-terminated C string.
     178                 :  *
     179                 :  * The new text value is freshly palloc'd with a full-size VARHDR.
     180                 :  */
     181                 : text *
     182 GIC    12430865 : cstring_to_text(const char *s)
     183                 : {
     184        12430865 :     return cstring_to_text_with_len(s, strlen(s));
     185                 : }
     186                 : 
     187                 : /*
     188 ECB             :  * cstring_to_text_with_len
     189                 :  *
     190                 :  * Same as cstring_to_text except the caller specifies the string length;
     191                 :  * the string need not be null-terminated.
     192                 :  */
     193                 : text *
     194 GIC    14970995 : cstring_to_text_with_len(const char *s, int len)
     195 ECB             : {
     196 GIC    14970995 :     text       *result = (text *) palloc(len + VARHDRSZ);
     197                 : 
     198        14970995 :     SET_VARSIZE(result, len + VARHDRSZ);
     199        14970995 :     memcpy(VARDATA(result), s, len);
     200                 : 
     201        14970995 :     return result;
     202                 : }
     203                 : 
     204                 : /*
     205                 :  * text_to_cstring
     206                 :  *
     207                 :  * Create a palloc'd, null-terminated C string from a text value.
     208                 :  *
     209 ECB             :  * We support being passed a compressed or toasted text value.
     210                 :  * This is a bit bogus since such values shouldn't really be referred to as
     211                 :  * "text *", but it seems useful for robustness.  If we didn't handle that
     212                 :  * case here, we'd need another routine that did, anyway.
     213                 :  */
     214                 : char *
     215 GIC     8999458 : text_to_cstring(const text *t)
     216 ECB             : {
     217                 :     /* must cast away the const, unfortunately */
     218 CBC     8999458 :     text       *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
     219 GIC     8999458 :     int         len = VARSIZE_ANY_EXHDR(tunpacked);
     220 ECB             :     char       *result;
     221                 : 
     222 GIC     8999458 :     result = (char *) palloc(len + 1);
     223 CBC     8999458 :     memcpy(result, VARDATA_ANY(tunpacked), len);
     224 GIC     8999458 :     result[len] = '\0';
     225                 : 
     226         8999458 :     if (tunpacked != t)
     227           64653 :         pfree(tunpacked);
     228                 : 
     229         8999458 :     return result;
     230                 : }
     231                 : 
     232                 : /*
     233                 :  * text_to_cstring_buffer
     234                 :  *
     235                 :  * Copy a text value into a caller-supplied buffer of size dst_len.
     236                 :  *
     237                 :  * The text string is truncated if necessary to fit.  The result is
     238                 :  * guaranteed null-terminated (unless dst_len == 0).
     239                 :  *
     240 ECB             :  * We support being passed a compressed or toasted text value.
     241                 :  * This is a bit bogus since such values shouldn't really be referred to as
     242                 :  * "text *", but it seems useful for robustness.  If we didn't handle that
     243                 :  * case here, we'd need another routine that did, anyway.
     244                 :  */
     245                 : void
     246 CBC         320 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
     247                 : {
     248 ECB             :     /* must cast away the const, unfortunately */
     249 CBC         320 :     text       *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
     250             320 :     size_t      src_len = VARSIZE_ANY_EXHDR(srcunpacked);
     251                 : 
     252 GBC         320 :     if (dst_len > 0)
     253 ECB             :     {
     254 CBC         320 :         dst_len--;
     255 GIC         320 :         if (dst_len >= src_len)
     256             320 :             dst_len = src_len;
     257 ECB             :         else                    /* ensure truncation is encoding-safe */
     258 UBC           0 :             dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
     259 CBC         320 :         memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
     260 GIC         320 :         dst[dst_len] = '\0';
     261                 :     }
     262                 : 
     263             320 :     if (srcunpacked != src)
     264 UIC           0 :         pfree(srcunpacked);
     265 GIC         320 : }
     266                 : 
     267                 : 
     268                 : /*****************************************************************************
     269                 :  *   USER I/O ROUTINES                                                       *
     270                 :  *****************************************************************************/
     271                 : 
     272                 : 
     273                 : #define VAL(CH)         ((CH) - '0')
     274                 : #define DIG(VAL)        ((VAL) + '0')
     275                 : 
     276                 : /*
     277                 :  *      byteain         - converts from printable representation of byte array
     278                 :  *
     279                 :  *      Non-printable characters must be passed as '\nnn' (octal) and are
     280                 :  *      converted to internal form.  '\' must be passed as '\\'.
     281                 :  *      Bad form is reported via ereturn(escontext, ...).
     282 ECB             :  *
     283                 :  *      BUGS:
     284                 :  *              The input is scanned twice.
     285                 :  *              The error checking of input is minimal.
     286                 :  */
     287                 : Datum
     288 GIC      131417 : byteain(PG_FUNCTION_ARGS)
     289                 : {
     290          131417 :     char       *inputText = PG_GETARG_CSTRING(0);
     291 GNC      131417 :     Node       *escontext = fcinfo->context;
     292                 :     char       *tp;
     293 ECB             :     char       *rp;
     294                 :     int         bc;
     295                 :     bytea      *result;
     296                 : 
     297                 :     /* Recognize hex input */
     298 CBC      131417 :     if (inputText[0] == '\\' && inputText[1] == 'x')
     299 ECB             :     {
     300 GIC         405 :         size_t      len = strlen(inputText);
     301 ECB             : 
     302 GIC         405 :         bc = (len - 2) / 2 + VARHDRSZ;  /* maximum possible length */
     303 CBC         405 :         result = palloc(bc);
     304 GNC         405 :         bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
     305                 :                              escontext);
     306 GIC         399 :         SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
     307                 : 
     308 CBC         399 :         PG_RETURN_BYTEA_P(result);
     309                 :     }
     310 ECB             : 
     311                 :     /* Else, it's the traditional escaped style */
     312 CBC     2176934 :     for (bc = 0, tp = inputText; *tp != '\0'; bc++)
     313 ECB             :     {
     314 CBC     2045928 :         if (tp[0] != '\\')
     315         2045424 :             tp++;
     316             504 :         else if ((tp[0] == '\\') &&
     317             504 :                  (tp[1] >= '0' && tp[1] <= '3') &&
     318             498 :                  (tp[2] >= '0' && tp[2] <= '7') &&
     319 GBC         498 :                  (tp[3] >= '0' && tp[3] <= '7'))
     320 GIC         498 :             tp += 4;
     321               6 :         else if ((tp[0] == '\\') &&
     322               6 :                  (tp[1] == '\\'))
     323 UIC           0 :             tp += 2;
     324                 :         else
     325 ECB             :         {
     326                 :             /*
     327                 :              * one backslash, not followed by another or ### valid octal
     328                 :              */
     329 GNC           6 :             ereturn(escontext, (Datum) 0,
     330                 :                     (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
     331 ECB             :                      errmsg("invalid input syntax for type %s", "bytea")));
     332                 :         }
     333                 :     }
     334                 : 
     335 GIC      131006 :     bc += VARHDRSZ;
     336 ECB             : 
     337 CBC      131006 :     result = (bytea *) palloc(bc);
     338          131006 :     SET_VARSIZE(result, bc);
     339                 : 
     340          131006 :     tp = inputText;
     341          131006 :     rp = VARDATA(result);
     342         2176913 :     while (*tp != '\0')
     343 ECB             :     {
     344 CBC     2045907 :         if (tp[0] != '\\')
     345         2045409 :             *rp++ = *tp++;
     346 GIC         498 :         else if ((tp[0] == '\\') &&
     347 CBC         498 :                  (tp[1] >= '0' && tp[1] <= '3') &&
     348             498 :                  (tp[2] >= '0' && tp[2] <= '7') &&
     349             498 :                  (tp[3] >= '0' && tp[3] <= '7'))
     350 ECB             :         {
     351 CBC         498 :             bc = VAL(tp[1]);
     352 GIC         498 :             bc <<= 3;
     353 CBC         498 :             bc += VAL(tp[2]);
     354 GIC         498 :             bc <<= 3;
     355 GBC         498 :             *rp++ = bc + VAL(tp[3]);
     356 EUB             : 
     357 GIC         498 :             tp += 4;
     358 EUB             :         }
     359 UBC           0 :         else if ((tp[0] == '\\') &&
     360 UIC           0 :                  (tp[1] == '\\'))
     361                 :         {
     362               0 :             *rp++ = '\\';
     363               0 :             tp += 2;
     364                 :         }
     365                 :         else
     366 EUB             :         {
     367                 :             /*
     368                 :              * We should never get here. The first pass should not allow it.
     369                 :              */
     370 UNC           0 :             ereturn(escontext, (Datum) 0,
     371                 :                     (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
     372 ECB             :                      errmsg("invalid input syntax for type %s", "bytea")));
     373                 :         }
     374                 :     }
     375                 : 
     376 GIC      131006 :     PG_RETURN_BYTEA_P(result);
     377                 : }
     378                 : 
     379                 : /*
     380                 :  *      byteaout        - converts to printable representation of byte array
     381                 :  *
     382 ECB             :  *      In the traditional escaped format, non-printable characters are
     383                 :  *      printed as '\nnn' (octal) and '\' as '\\'.
     384                 :  */
     385                 : Datum
     386 GIC        6677 : byteaout(PG_FUNCTION_ARGS)
     387                 : {
     388 CBC        6677 :     bytea      *vlena = PG_GETARG_BYTEA_PP(0);
     389                 :     char       *result;
     390                 :     char       *rp;
     391 ECB             : 
     392 CBC        6677 :     if (bytea_output == BYTEA_OUTPUT_HEX)
     393 ECB             :     {
     394                 :         /* Print hex format */
     395 GIC        6486 :         rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
     396 CBC        6486 :         *rp++ = '\\';
     397 GIC        6486 :         *rp++ = 'x';
     398            6486 :         rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
     399                 :     }
     400             191 :     else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
     401                 :     {
     402                 :         /* Print traditional escaped format */
     403 ECB             :         char       *vp;
     404                 :         uint64      len;
     405                 :         int         i;
     406                 : 
     407 CBC         191 :         len = 1;                /* empty string has 1 char */
     408 GBC         191 :         vp = VARDATA_ANY(vlena);
     409 CBC      108825 :         for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
     410 ECB             :         {
     411 GIC      108634 :             if (*vp == '\\')
     412 LBC           0 :                 len += 2;
     413 GIC      108634 :             else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
     414             246 :                 len += 4;
     415                 :             else
     416          108388 :                 len++;
     417                 :         }
     418                 : 
     419                 :         /*
     420 ECB             :          * In principle len can't overflow uint32 if the input fit in 1GB, but
     421 EUB             :          * for safety let's check rather than relying on palloc's internal
     422                 :          * check.
     423                 :          */
     424 CBC         191 :         if (len > MaxAllocSize)
     425 UIC           0 :             ereport(ERROR,
     426 ECB             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     427                 :                      errmsg_internal("result of bytea output conversion is too large")));
     428 GIC         191 :         rp = result = (char *) palloc(len);
     429 ECB             : 
     430 GIC         191 :         vp = VARDATA_ANY(vlena);
     431 GBC      108825 :         for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
     432 EUB             :         {
     433 GIC      108634 :             if (*vp == '\\')
     434 ECB             :             {
     435 LBC           0 :                 *rp++ = '\\';
     436 UIC           0 :                 *rp++ = '\\';
     437                 :             }
     438 CBC      108634 :             else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
     439             246 :             {
     440 ECB             :                 int         val;    /* holds unprintable chars */
     441                 : 
     442 CBC         246 :                 val = *vp;
     443             246 :                 rp[0] = '\\';
     444             246 :                 rp[3] = DIG(val & 07);
     445             246 :                 val >>= 3;
     446 GIC         246 :                 rp[2] = DIG(val & 07);
     447             246 :                 val >>= 3;
     448 CBC         246 :                 rp[1] = DIG(val & 03);
     449 GIC         246 :                 rp += 4;
     450                 :             }
     451                 :             else
     452          108388 :                 *rp++ = *vp;
     453 EUB             :         }
     454                 :     }
     455                 :     else
     456                 :     {
     457 LBC           0 :         elog(ERROR, "unrecognized bytea_output setting: %d",
     458 ECB             :              bytea_output);
     459                 :         rp = result = NULL;     /* keep compiler quiet */
     460                 :     }
     461 GIC        6677 :     *rp = '\0';
     462            6677 :     PG_RETURN_CSTRING(result);
     463                 : }
     464                 : 
     465 ECB             : /*
     466                 :  *      bytearecv           - converts external binary format to bytea
     467                 :  */
     468                 : Datum
     469 GIC         506 : bytearecv(PG_FUNCTION_ARGS)
     470                 : {
     471 CBC         506 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     472 ECB             :     bytea      *result;
     473                 :     int         nbytes;
     474                 : 
     475 CBC         506 :     nbytes = buf->len - buf->cursor;
     476 GIC         506 :     result = (bytea *) palloc(nbytes + VARHDRSZ);
     477             506 :     SET_VARSIZE(result, nbytes + VARHDRSZ);
     478             506 :     pq_copymsgbytes(buf, VARDATA(result), nbytes);
     479             506 :     PG_RETURN_BYTEA_P(result);
     480                 : }
     481                 : 
     482                 : /*
     483                 :  *      byteasend           - converts bytea to binary format
     484 ECB             :  *
     485                 :  * This is a special case: just copy the input...
     486                 :  */
     487                 : Datum
     488 CBC        2811 : byteasend(PG_FUNCTION_ARGS)
     489                 : {
     490 GIC        2811 :     bytea      *vlena = PG_GETARG_BYTEA_P_COPY(0);
     491                 : 
     492 CBC        2811 :     PG_RETURN_BYTEA_P(vlena);
     493                 : }
     494                 : 
     495                 : Datum
     496           46387 : bytea_string_agg_transfn(PG_FUNCTION_ARGS)
     497                 : {
     498                 :     StringInfo  state;
     499 ECB             : 
     500 GIC       46387 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
     501 ECB             : 
     502                 :     /* Append the value unless null, preceding it with the delimiter. */
     503 GIC       46387 :     if (!PG_ARGISNULL(1))
     504                 :     {
     505           38887 :         bytea      *value = PG_GETARG_BYTEA_PP(1);
     506 GNC       38887 :         bool        isfirst = false;
     507                 : 
     508                 :         /*
     509                 :          * You might think we can just throw away the first delimiter, however
     510                 :          * we must keep it as we may be a parallel worker doing partial
     511                 :          * aggregation building a state to send to the main process.  We need
     512                 :          * to keep the delimiter of every aggregation so that the combine
     513                 :          * function can properly join up the strings of two separately
     514                 :          * partially aggregated results.  The first delimiter is only stripped
     515                 :          * off in the final function.  To know how much to strip off the front
     516                 :          * of the string, we store the length of the first delimiter in the
     517                 :          * StringInfo's cursor field, which we don't otherwise need here.
     518                 :          */
     519 GIC       38887 :         if (state == NULL)
     520                 :         {
     521              93 :             state = makeStringAggState(fcinfo);
     522 GNC          93 :             isfirst = true;
     523                 :         }
     524                 : 
     525           38887 :         if (!PG_ARGISNULL(2))
     526                 :         {
     527 GIC       38881 :             bytea      *delim = PG_GETARG_BYTEA_PP(2);
     528                 : 
     529 GNC       38881 :             appendBinaryStringInfo(state, VARDATA_ANY(delim),
     530           38881 :                                    VARSIZE_ANY_EXHDR(delim));
     531           38881 :             if (isfirst)
     532              90 :                 state->cursor = VARSIZE_ANY_EXHDR(delim);
     533 ECB             :         }
     534                 : 
     535 GNC       38887 :         appendBinaryStringInfo(state, VARDATA_ANY(value),
     536           38887 :                                VARSIZE_ANY_EXHDR(value));
     537 ECB             :     }
     538                 : 
     539                 :     /*
     540                 :      * The transition type for string_agg() is declared to be "internal",
     541                 :      * which is a pass-by-value type the same size as a pointer.
     542                 :      */
     543 GNC       46387 :     if (state)
     544           46363 :         PG_RETURN_POINTER(state);
     545              24 :     PG_RETURN_NULL();
     546 ECB             : }
     547                 : 
     548                 : Datum
     549 CBC          76 : bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
     550                 : {
     551                 :     StringInfo  state;
     552 ECB             : 
     553                 :     /* cannot be called directly because of internal-type argument */
     554 GIC          76 :     Assert(AggCheckCallContext(fcinfo, NULL));
     555                 : 
     556              76 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
     557                 : 
     558              76 :     if (state != NULL)
     559                 :     {
     560                 :         /* As per comment in transfn, strip data before the cursor position */
     561 ECB             :         bytea      *result;
     562 GNC          73 :         int         strippedlen = state->len - state->cursor;
     563 ECB             : 
     564 GNC          73 :         result = (bytea *) palloc(strippedlen + VARHDRSZ);
     565              73 :         SET_VARSIZE(result, strippedlen + VARHDRSZ);
     566              73 :         memcpy(VARDATA(result), &state->data[state->cursor], strippedlen);
     567 GIC          73 :         PG_RETURN_BYTEA_P(result);
     568 ECB             :     }
     569                 :     else
     570 GIC           3 :         PG_RETURN_NULL();
     571                 : }
     572                 : 
     573 ECB             : /*
     574                 :  *      textin          - converts "..." to internal representation
     575                 :  */
     576                 : Datum
     577 CBC     8942398 : textin(PG_FUNCTION_ARGS)
     578                 : {
     579 GIC     8942398 :     char       *inputText = PG_GETARG_CSTRING(0);
     580                 : 
     581 CBC     8942398 :     PG_RETURN_TEXT_P(cstring_to_text(inputText));
     582                 : }
     583 ECB             : 
     584                 : /*
     585                 :  *      textout         - converts internal representation to "..."
     586                 :  */
     587                 : Datum
     588 GIC     4169602 : textout(PG_FUNCTION_ARGS)
     589 ECB             : {
     590 GIC     4169602 :     Datum       txt = PG_GETARG_DATUM(0);
     591                 : 
     592         4169602 :     PG_RETURN_CSTRING(TextDatumGetCString(txt));
     593                 : }
     594                 : 
     595                 : /*
     596 ECB             :  *      textrecv            - converts external binary format to text
     597                 :  */
     598                 : Datum
     599 GIC       53360 : textrecv(PG_FUNCTION_ARGS)
     600 ECB             : {
     601 GIC       53360 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     602                 :     text       *result;
     603                 :     char       *str;
     604                 :     int         nbytes;
     605                 : 
     606           53360 :     str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
     607 ECB             : 
     608 GIC       53360 :     result = cstring_to_text_with_len(str, nbytes);
     609 CBC       53360 :     pfree(str);
     610 GIC       53360 :     PG_RETURN_TEXT_P(result);
     611 ECB             : }
     612                 : 
     613                 : /*
     614                 :  *      textsend            - converts text to binary format
     615                 :  */
     616                 : Datum
     617 GIC       34115 : textsend(PG_FUNCTION_ARGS)
     618 ECB             : {
     619 GIC       34115 :     text       *t = PG_GETARG_TEXT_PP(0);
     620 ECB             :     StringInfoData buf;
     621                 : 
     622 GIC       34115 :     pq_begintypsend(&buf);
     623           34115 :     pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
     624           34115 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     625 ECB             : }
     626                 : 
     627                 : 
     628                 : /*
     629                 :  *      unknownin           - converts "..." to internal representation
     630                 :  */
     631                 : Datum
     632 UIC           0 : unknownin(PG_FUNCTION_ARGS)
     633                 : {
     634               0 :     char       *str = PG_GETARG_CSTRING(0);
     635                 : 
     636 ECB             :     /* representation is same as cstring */
     637 UIC           0 :     PG_RETURN_CSTRING(pstrdup(str));
     638 ECB             : }
     639                 : 
     640                 : /*
     641                 :  *      unknownout          - converts internal representation to "..."
     642                 :  */
     643                 : Datum
     644 GIC         340 : unknownout(PG_FUNCTION_ARGS)
     645                 : {
     646                 :     /* representation is same as cstring */
     647             340 :     char       *str = PG_GETARG_CSTRING(0);
     648                 : 
     649             340 :     PG_RETURN_CSTRING(pstrdup(str));
     650                 : }
     651 EUB             : 
     652                 : /*
     653                 :  *      unknownrecv         - converts external binary format to unknown
     654                 :  */
     655                 : Datum
     656 UBC           0 : unknownrecv(PG_FUNCTION_ARGS)
     657                 : {
     658 UIC           0 :     StringInfo  buf = (StringInfo) PG_GETARG_POINTER(0);
     659                 :     char       *str;
     660                 :     int         nbytes;
     661                 : 
     662               0 :     str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
     663 ECB             :     /* representation is same as cstring */
     664 UIC           0 :     PG_RETURN_CSTRING(str);
     665                 : }
     666 ECB             : 
     667                 : /*
     668                 :  *      unknownsend         - converts unknown to binary format
     669                 :  */
     670                 : Datum
     671 UIC           0 : unknownsend(PG_FUNCTION_ARGS)
     672                 : {
     673                 :     /* representation is same as cstring */
     674               0 :     char       *str = PG_GETARG_CSTRING(0);
     675 EUB             :     StringInfoData buf;
     676                 : 
     677 UBC           0 :     pq_begintypsend(&buf);
     678 UIC           0 :     pq_sendtext(&buf, str, strlen(str));
     679               0 :     PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
     680                 : }
     681 EUB             : 
     682                 : 
     683                 : /* ========== PUBLIC ROUTINES ========== */
     684                 : 
     685                 : /*
     686                 :  * textlen -
     687                 :  *    returns the logical length of a text*
     688                 :  *     (which is less than the VARSIZE of the text*)
     689                 :  */
     690                 : Datum
     691 GIC      215278 : textlen(PG_FUNCTION_ARGS)
     692                 : {
     693 GBC      215278 :     Datum       str = PG_GETARG_DATUM(0);
     694                 : 
     695                 :     /* try to avoid decompressing argument */
     696          215278 :     PG_RETURN_INT32(text_length(str));
     697 EUB             : }
     698                 : 
     699                 : /*
     700                 :  * text_length -
     701                 :  *  Does the real work for textlen()
     702                 :  *
     703                 :  *  This is broken out so it can be called directly by other string processing
     704                 :  *  functions.  Note that the argument is passed as a Datum, to indicate that
     705                 :  *  it may still be in compressed form.  We can avoid decompressing it at all
     706                 :  *  in some cases.
     707                 :  */
     708                 : static int32
     709 GIC      215284 : text_length(Datum str)
     710 ECB             : {
     711                 :     /* fastpath when max encoding length is one */
     712 CBC      215284 :     if (pg_database_encoding_max_length() == 1)
     713 GIC          10 :         PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
     714                 :     else
     715 ECB             :     {
     716 GIC      215274 :         text       *t = DatumGetTextPP(str);
     717                 : 
     718          215274 :         PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
     719                 :                                              VARSIZE_ANY_EXHDR(t)));
     720                 :     }
     721                 : }
     722                 : 
     723                 : /*
     724                 :  * textoctetlen -
     725                 :  *    returns the physical length of a text*
     726                 :  *     (which is less than the VARSIZE of the text*)
     727                 :  */
     728 ECB             : Datum
     729 GIC          35 : textoctetlen(PG_FUNCTION_ARGS)
     730                 : {
     731 CBC          35 :     Datum       str = PG_GETARG_DATUM(0);
     732 ECB             : 
     733                 :     /* We need not detoast the input at all */
     734 GIC          35 :     PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
     735 ECB             : }
     736                 : 
     737                 : /*
     738                 :  * textcat -
     739                 :  *    takes two text* and returns a text* that is the concatenation of
     740                 :  *    the two.
     741                 :  *
     742                 :  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
     743                 :  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
     744                 :  * Allocate space for output in all cases.
     745                 :  * XXX - thomas 1997-07-10
     746                 :  */
     747                 : Datum
     748 CBC     1335495 : textcat(PG_FUNCTION_ARGS)
     749                 : {
     750         1335495 :     text       *t1 = PG_GETARG_TEXT_PP(0);
     751 GIC     1335495 :     text       *t2 = PG_GETARG_TEXT_PP(1);
     752                 : 
     753 CBC     1335495 :     PG_RETURN_TEXT_P(text_catenate(t1, t2));
     754                 : }
     755                 : 
     756                 : /*
     757                 :  * text_catenate
     758                 :  *  Guts of textcat(), broken out so it can be used by other functions
     759                 :  *
     760                 :  * Arguments can be in short-header form, but not compressed or out-of-line
     761                 :  */
     762                 : static text *
     763 GIC     1335535 : text_catenate(text *t1, text *t2)
     764                 : {
     765                 :     text       *result;
     766                 :     int         len1,
     767 ECB             :                 len2,
     768                 :                 len;
     769                 :     char       *ptr;
     770                 : 
     771 GIC     1335535 :     len1 = VARSIZE_ANY_EXHDR(t1);
     772 CBC     1335535 :     len2 = VARSIZE_ANY_EXHDR(t2);
     773                 : 
     774                 :     /* paranoia ... probably should throw error instead? */
     775 GIC     1335535 :     if (len1 < 0)
     776 UIC           0 :         len1 = 0;
     777 GIC     1335535 :     if (len2 < 0)
     778 UIC           0 :         len2 = 0;
     779                 : 
     780 GIC     1335535 :     len = len1 + len2 + VARHDRSZ;
     781         1335535 :     result = (text *) palloc(len);
     782 ECB             : 
     783                 :     /* Set size of result string... */
     784 GIC     1335535 :     SET_VARSIZE(result, len);
     785                 : 
     786                 :     /* Fill data field of result string... */
     787         1335535 :     ptr = VARDATA(result);
     788         1335535 :     if (len1 > 0)
     789         1333907 :         memcpy(ptr, VARDATA_ANY(t1), len1);
     790 CBC     1335535 :     if (len2 > 0)
     791         1335430 :         memcpy(ptr + len1, VARDATA_ANY(t2), len2);
     792                 : 
     793 GIC     1335535 :     return result;
     794 ECB             : }
     795 EUB             : 
     796 ECB             : /*
     797 EUB             :  * charlen_to_bytelen()
     798                 :  *  Compute the number of bytes occupied by n characters starting at *p
     799 ECB             :  *
     800                 :  * It is caller's responsibility that there actually are n characters;
     801                 :  * the string need not be null-terminated.
     802                 :  */
     803                 : static int
     804 GIC        5457 : charlen_to_bytelen(const char *p, int n)
     805                 : {
     806 CBC        5457 :     if (pg_database_encoding_max_length() == 1)
     807 ECB             :     {
     808                 :         /* Optimization for single-byte encodings */
     809 LBC           0 :         return n;
     810 ECB             :     }
     811                 :     else
     812                 :     {
     813                 :         const char *s;
     814                 : 
     815 GIC     2960165 :         for (s = p; n > 0; n--)
     816         2954708 :             s += pg_mblen(s);
     817                 : 
     818            5457 :         return s - p;
     819                 :     }
     820                 : }
     821                 : 
     822                 : /*
     823 ECB             :  * text_substr()
     824                 :  * Return a substring starting at the specified position.
     825                 :  * - thomas 1997-12-31
     826                 :  *
     827                 :  * Input:
     828 EUB             :  *  - string
     829                 :  *  - starting position (is one-based)
     830                 :  *  - string length
     831                 :  *
     832                 :  * If the starting position is zero or less, then return from the start of the string
     833                 :  *  adjusting the length to be consistent with the "negative start" per SQL.
     834 ECB             :  * If the length is less than zero, return the remaining string.
     835                 :  *
     836                 :  * Added multibyte support.
     837                 :  * - Tatsuo Ishii 1998-4-21
     838                 :  * Changed behavior if starting position is less than one to conform to SQL behavior.
     839                 :  * Formerly returned the entire string; now returns a portion.
     840                 :  * - Thomas Lockhart 1998-12-10
     841                 :  * Now uses faster TOAST-slicing interface
     842                 :  * - John Gray 2002-02-22
     843                 :  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
     844                 :  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
     845                 :  * error; if E < 1, return '', not entire string). Fixed MB related bug when
     846                 :  * S > LC and < LC + 4 sometimes garbage characters are returned.
     847                 :  * - Joe Conway 2002-08-10
     848                 :  */
     849                 : Datum
     850 GIC      374965 : text_substr(PG_FUNCTION_ARGS)
     851                 : {
     852          374965 :     PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
     853                 :                                     PG_GETARG_INT32(1),
     854                 :                                     PG_GETARG_INT32(2),
     855                 :                                     false));
     856                 : }
     857                 : 
     858                 : /*
     859                 :  * text_substr_no_len -
     860                 :  *    Wrapper to avoid opr_sanity failure due to
     861                 :  *    one function accepting a different number of args.
     862                 :  */
     863                 : Datum
     864              24 : text_substr_no_len(PG_FUNCTION_ARGS)
     865                 : {
     866              24 :     PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
     867                 :                                     PG_GETARG_INT32(1),
     868                 :                                     -1, true));
     869 ECB             : }
     870                 : 
     871                 : /*
     872                 :  * text_substring -
     873                 :  *  Does the real work for text_substr() and text_substr_no_len()
     874                 :  *
     875                 :  *  This is broken out so it can be called directly by other string processing
     876                 :  *  functions.  Note that the argument is passed as a Datum, to indicate that
     877                 :  *  it may still be in compressed/toasted form.  We can avoid detoasting all
     878                 :  *  of it in some cases.
     879                 :  *
     880                 :  *  The result is always a freshly palloc'd datum.
     881                 :  */
     882                 : static text *
     883 CBC      394913 : text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
     884                 : {
     885          394913 :     int32       eml = pg_database_encoding_max_length();
     886 GIC      394913 :     int32       S = start;      /* start position */
     887                 :     int32       S1;             /* adjusted start position */
     888                 :     int32       L1;             /* adjusted substring length */
     889                 :     int32       E;              /* end position */
     890                 : 
     891                 :     /*
     892                 :      * SQL99 says S can be zero or negative, but we still must fetch from the
     893                 :      * start of the string.
     894                 :      */
     895          394913 :     S1 = Max(S, 1);
     896                 : 
     897                 :     /* life is easy if the encoding max length is 1 */
     898          394913 :     if (eml == 1)
     899                 :     {
     900               6 :         if (length_not_specified)   /* special case - get length to end of
     901                 :                                      * string */
     902 LBC           0 :             L1 = -1;
     903 GIC           6 :         else if (length < 0)
     904 ECB             :         {
     905                 :             /* SQL99 says to throw an error for E < S, i.e., negative length */
     906 UIC           0 :             ereport(ERROR,
     907                 :                     (errcode(ERRCODE_SUBSTRING_ERROR),
     908                 :                      errmsg("negative substring length not allowed")));
     909                 :             L1 = -1;            /* silence stupider compilers */
     910                 :         }
     911 GIC           6 :         else if (pg_add_s32_overflow(S, length, &E))
     912                 :         {
     913                 :             /*
     914 ECB             :              * L could be large enough for S + L to overflow, in which case
     915                 :              * the substring must run to end of string.
     916                 :              */
     917 LBC           0 :             L1 = -1;
     918                 :         }
     919 ECB             :         else
     920                 :         {
     921 EUB             :             /*
     922 ECB             :              * A zero or negative value for the end position can happen if the
     923                 :              * start was negative or one. SQL99 says to return a zero-length
     924                 :              * string.
     925 EUB             :              */
     926 GIC           6 :             if (E < 1)
     927 UIC           0 :                 return cstring_to_text("");
     928                 : 
     929 GIC           6 :             L1 = E - S1;
     930 ECB             :         }
     931                 : 
     932                 :         /*
     933                 :          * If the start position is past the end of the string, SQL99 says to
     934                 :          * return a zero-length string -- DatumGetTextPSlice() will do that
     935                 :          * for us.  We need only convert S1 to zero-based starting position.
     936 EUB             :          */
     937 GIC           6 :         return DatumGetTextPSlice(str, S1 - 1, L1);
     938                 :     }
     939          394907 :     else if (eml > 1)
     940                 :     {
     941                 :         /*
     942                 :          * When encoding max length is > 1, we can't get LC without
     943                 :          * detoasting, so we'll grab a conservatively large slice now and go
     944                 :          * back later to do the right thing
     945 ECB             :          */
     946 EUB             :         int32       slice_start;
     947                 :         int32       slice_size;
     948 ECB             :         int32       slice_strlen;
     949                 :         text       *slice;
     950                 :         int32       E1;
     951                 :         int32       i;
     952                 :         char       *p;
     953                 :         char       *s;
     954                 :         text       *ret;
     955                 : 
     956                 :         /*
     957                 :          * We need to start at position zero because there is no way to know
     958                 :          * in advance which byte offset corresponds to the supplied start
     959                 :          * position.
     960                 :          */
     961 GIC      394907 :         slice_start = 0;
     962                 : 
     963          394907 :         if (length_not_specified)   /* special case - get length to end of
     964                 :                                      * string */
     965              44 :             slice_size = L1 = -1;
     966          394863 :         else if (length < 0)
     967                 :         {
     968                 :             /* SQL99 says to throw an error for E < S, i.e., negative length */
     969               6 :             ereport(ERROR,
     970                 :                     (errcode(ERRCODE_SUBSTRING_ERROR),
     971                 :                      errmsg("negative substring length not allowed")));
     972                 :             slice_size = L1 = -1;   /* silence stupider compilers */
     973                 :         }
     974          394857 :         else if (pg_add_s32_overflow(S, length, &E))
     975                 :         {
     976                 :             /*
     977                 :              * L could be large enough for S + L to overflow, in which case
     978                 :              * the substring must run to end of string.
     979                 :              */
     980 CBC           3 :             slice_size = L1 = -1;
     981                 :         }
     982 ECB             :         else
     983                 :         {
     984                 :             /*
     985                 :              * A zero or negative value for the end position can happen if the
     986                 :              * start was negative or one. SQL99 says to return a zero-length
     987                 :              * string.
     988                 :              */
     989 GIC      394854 :             if (E < 1)
     990 UIC           0 :                 return cstring_to_text("");
     991                 : 
     992                 :             /*
     993 ECB             :              * if E is past the end of the string, the tuple toaster will
     994                 :              * truncate the length for us
     995                 :              */
     996 GIC      394854 :             L1 = E - S1;
     997                 : 
     998                 :             /*
     999 ECB             :              * Total slice size in bytes can't be any longer than the start
    1000                 :              * position plus substring length times the encoding max length.
    1001                 :              * If that overflows, we can just use -1.
    1002                 :              */
    1003 GIC      394854 :             if (pg_mul_s32_overflow(E, eml, &slice_size))
    1004               3 :                 slice_size = -1;
    1005                 :         }
    1006                 : 
    1007                 :         /*
    1008 ECB             :          * If we're working with an untoasted source, no need to do an extra
    1009 EUB             :          * copying step.
    1010                 :          */
    1011 GIC      394901 :         if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
    1012          394874 :             VARATT_IS_EXTERNAL(DatumGetPointer(str)))
    1013             162 :             slice = DatumGetTextPSlice(str, slice_start, slice_size);
    1014                 :         else
    1015 CBC      394739 :             slice = (text *) DatumGetPointer(str);
    1016                 : 
    1017                 :         /* see if we got back an empty string */
    1018 GIC      394901 :         if (VARSIZE_ANY_EXHDR(slice) == 0)
    1019                 :         {
    1020 UIC           0 :             if (slice != (text *) DatumGetPointer(str))
    1021               0 :                 pfree(slice);
    1022 LBC           0 :             return cstring_to_text("");
    1023 ECB             :         }
    1024                 : 
    1025                 :         /* Now we can get the actual length of the slice in MB characters */
    1026 GIC      394901 :         slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
    1027          394901 :                                             VARSIZE_ANY_EXHDR(slice));
    1028                 : 
    1029                 :         /*
    1030 ECB             :          * Check that the start position wasn't > slice_strlen. If so, SQL99
    1031                 :          * says to return a zero-length string.
    1032                 :          */
    1033 GIC      394901 :         if (S1 > slice_strlen)
    1034 ECB             :         {
    1035 GIC          11 :             if (slice != (text *) DatumGetPointer(str))
    1036 UIC           0 :                 pfree(slice);
    1037 CBC          11 :             return cstring_to_text("");
    1038                 :         }
    1039 EUB             : 
    1040                 :         /*
    1041                 :          * Adjust L1 and E1 now that we know the slice string length. Again
    1042                 :          * remember that S1 is one based, and slice_start is zero based.
    1043                 :          */
    1044 GIC      394890 :         if (L1 > -1)
    1045 CBC      394854 :             E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
    1046 ECB             :         else
    1047 GIC          36 :             E1 = slice_start + 1 + slice_strlen;
    1048                 : 
    1049                 :         /*
    1050                 :          * Find the start position in the slice; remember S1 is not zero based
    1051                 :          */
    1052 CBC      394890 :         p = VARDATA_ANY(slice);
    1053 GIC     2805764 :         for (i = 0; i < S1 - 1; i++)
    1054 CBC     2410874 :             p += pg_mblen(p);
    1055 EUB             : 
    1056 ECB             :         /* hang onto a pointer to our start position */
    1057 GIC      394890 :         s = p;
    1058                 : 
    1059                 :         /*
    1060                 :          * Count the actual bytes used by the substring of the requested
    1061                 :          * length.
    1062                 :          */
    1063 CBC     4855113 :         for (i = S1; i < E1; i++)
    1064         4460223 :             p += pg_mblen(p);
    1065                 : 
    1066          394890 :         ret = (text *) palloc(VARHDRSZ + (p - s));
    1067 GIC      394890 :         SET_VARSIZE(ret, VARHDRSZ + (p - s));
    1068          394890 :         memcpy(VARDATA(ret), s, (p - s));
    1069                 : 
    1070          394890 :         if (slice != (text *) DatumGetPointer(str))
    1071 CBC         162 :             pfree(slice);
    1072 ECB             : 
    1073 CBC      394890 :         return ret;
    1074                 :     }
    1075                 :     else
    1076 LBC           0 :         elog(ERROR, "invalid backend encoding: encoding max length < 1");
    1077                 : 
    1078                 :     /* not reached: suppress compiler warning */
    1079                 :     return NULL;
    1080                 : }
    1081                 : 
    1082 ECB             : /*
    1083                 :  * textoverlay
    1084                 :  *  Replace specified substring of first string with second
    1085                 :  *
    1086                 :  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
    1087                 :  * This code is a direct implementation of what the standard says.
    1088                 :  */
    1089                 : Datum
    1090 CBC          14 : textoverlay(PG_FUNCTION_ARGS)
    1091                 : {
    1092              14 :     text       *t1 = PG_GETARG_TEXT_PP(0);
    1093 GIC          14 :     text       *t2 = PG_GETARG_TEXT_PP(1);
    1094              14 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
    1095 GBC          14 :     int         sl = PG_GETARG_INT32(3);    /* substring length */
    1096                 : 
    1097 GIC          14 :     PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
    1098                 : }
    1099                 : 
    1100                 : Datum
    1101               6 : textoverlay_no_len(PG_FUNCTION_ARGS)
    1102                 : {
    1103               6 :     text       *t1 = PG_GETARG_TEXT_PP(0);
    1104               6 :     text       *t2 = PG_GETARG_TEXT_PP(1);
    1105               6 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
    1106                 :     int         sl;
    1107                 : 
    1108               6 :     sl = text_length(PointerGetDatum(t2));  /* defaults to length(t2) */
    1109 CBC           6 :     PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
    1110                 : }
    1111 ECB             : 
    1112                 : static text *
    1113 CBC          20 : text_overlay(text *t1, text *t2, int sp, int sl)
    1114 ECB             : {
    1115                 :     text       *result;
    1116                 :     text       *s1;
    1117                 :     text       *s2;
    1118                 :     int         sp_pl_sl;
    1119                 : 
    1120                 :     /*
    1121                 :      * Check for possible integer-overflow cases.  For negative sp, throw a
    1122                 :      * "substring length" error because that's what should be expected
    1123                 :      * according to the spec's definition of OVERLAY().
    1124                 :      */
    1125 GIC          20 :     if (sp <= 0)
    1126 UIC           0 :         ereport(ERROR,
    1127 ECB             :                 (errcode(ERRCODE_SUBSTRING_ERROR),
    1128                 :                  errmsg("negative substring length not allowed")));
    1129 GIC          20 :     if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
    1130 UIC           0 :         ereport(ERROR,
    1131                 :                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    1132 ECB             :                  errmsg("integer out of range")));
    1133                 : 
    1134 GIC          20 :     s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
    1135              20 :     s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
    1136              20 :     result = text_catenate(s1, t2);
    1137              20 :     result = text_catenate(result, s2);
    1138                 : 
    1139              20 :     return result;
    1140                 : }
    1141                 : 
    1142                 : /*
    1143                 :  * textpos -
    1144 ECB             :  *    Return the position of the specified substring.
    1145 EUB             :  *    Implements the SQL POSITION() function.
    1146                 :  *    Ref: A Guide To The SQL Standard, Date & Darwen, 1997
    1147                 :  * - thomas 1997-07-27
    1148 ECB             :  */
    1149 EUB             : Datum
    1150 GIC          53 : textpos(PG_FUNCTION_ARGS)
    1151                 : {
    1152              53 :     text       *str = PG_GETARG_TEXT_PP(0);
    1153 CBC          53 :     text       *search_str = PG_GETARG_TEXT_PP(1);
    1154 ECB             : 
    1155 CBC          53 :     PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
    1156 ECB             : }
    1157                 : 
    1158                 : /*
    1159                 :  * text_position -
    1160                 :  *  Does the real work for textpos()
    1161                 :  *
    1162                 :  * Inputs:
    1163                 :  *      t1 - string to be searched
    1164                 :  *      t2 - pattern to match within t1
    1165                 :  * Result:
    1166                 :  *      Character index of the first matched char, starting from 1,
    1167                 :  *      or 0 if no match.
    1168                 :  *
    1169                 :  *  This is broken out so it can be called directly by other string processing
    1170                 :  *  functions.
    1171                 :  */
    1172                 : static int
    1173 GIC          53 : text_position(text *t1, text *t2, Oid collid)
    1174 ECB             : {
    1175                 :     TextPositionState state;
    1176                 :     int         result;
    1177                 : 
    1178                 :     /* Empty needle always matches at position 1 */
    1179 GIC          53 :     if (VARSIZE_ANY_EXHDR(t2) < 1)
    1180               6 :         return 1;
    1181                 : 
    1182                 :     /* Otherwise, can't match if haystack is shorter than needle */
    1183              47 :     if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
    1184              11 :         return 0;
    1185                 : 
    1186              36 :     text_position_setup(t1, t2, collid, &state);
    1187              36 :     if (!text_position_next(&state))
    1188              12 :         result = 0;
    1189                 :     else
    1190              24 :         result = text_position_get_match_pos(&state);
    1191              36 :     text_position_cleanup(&state);
    1192 CBC          36 :     return result;
    1193                 : }
    1194                 : 
    1195                 : 
    1196                 : /*
    1197                 :  * text_position_setup, text_position_next, text_position_cleanup -
    1198 ECB             :  *  Component steps of text_position()
    1199                 :  *
    1200                 :  * These are broken out so that a string can be efficiently searched for
    1201                 :  * multiple occurrences of the same pattern.  text_position_next may be
    1202                 :  * called multiple times, and it advances to the next match on each call.
    1203                 :  * text_position_get_match_ptr() and text_position_get_match_pos() return
    1204                 :  * a pointer or 1-based character position of the last match, respectively.
    1205                 :  *
    1206                 :  * The "state" variable is normally just a local variable in the caller.
    1207                 :  *
    1208                 :  * NOTE: text_position_next skips over the matched portion.  For example,
    1209                 :  * searching for "xx" in "xxx" returns only one match, not two.
    1210                 :  */
    1211                 : 
    1212                 : static void
    1213 GIC        1405 : text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
    1214                 : {
    1215            1405 :     int         len1 = VARSIZE_ANY_EXHDR(t1);
    1216            1405 :     int         len2 = VARSIZE_ANY_EXHDR(t2);
    1217            1405 :     pg_locale_t mylocale = 0;
    1218                 : 
    1219            1405 :     check_collation_set(collid);
    1220                 : 
    1221            1405 :     if (!lc_collate_is_c(collid))
    1222             140 :         mylocale = pg_newlocale_from_collation(collid);
    1223                 : 
    1224 GNC        1405 :     if (!pg_locale_deterministic(mylocale))
    1225 GIC           6 :         ereport(ERROR,
    1226                 :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1227                 :                  errmsg("nondeterministic collations are not supported for substring searches")));
    1228                 : 
    1229            1399 :     Assert(len1 > 0);
    1230            1399 :     Assert(len2 > 0);
    1231                 : 
    1232 ECB             :     /*
    1233                 :      * Even with a multi-byte encoding, we perform the search using the raw
    1234                 :      * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
    1235                 :      * because in UTF-8 the byte sequence of one character cannot contain
    1236                 :      * another character.  For other multi-byte encodings, we do the search
    1237                 :      * initially as a simple byte search, ignoring multibyte issues, but
    1238                 :      * verify afterwards that the match we found is at a character boundary,
    1239                 :      * and continue the search if it was a false match.
    1240                 :      */
    1241 CBC        1399 :     if (pg_database_encoding_max_length() == 1)
    1242 GIC          18 :         state->is_multibyte_char_in_char = false;
    1243 CBC        1381 :     else if (GetDatabaseEncoding() == PG_UTF8)
    1244            1381 :         state->is_multibyte_char_in_char = false;
    1245                 :     else
    1246 UIC           0 :         state->is_multibyte_char_in_char = true;
    1247                 : 
    1248 CBC        1399 :     state->str1 = VARDATA_ANY(t1);
    1249            1399 :     state->str2 = VARDATA_ANY(t2);
    1250 GIC        1399 :     state->len1 = len1;
    1251            1399 :     state->len2 = len2;
    1252            1399 :     state->last_match = NULL;
    1253            1399 :     state->refpoint = state->str1;
    1254            1399 :     state->refpos = 0;
    1255                 : 
    1256                 :     /*
    1257                 :      * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
    1258                 :      * notes we use the terminology that the "haystack" is the string to be
    1259                 :      * searched (t1) and the "needle" is the pattern being sought (t2).
    1260 ECB             :      *
    1261                 :      * If the needle is empty or bigger than the haystack then there is no
    1262                 :      * point in wasting cycles initializing the table.  We also choose not to
    1263                 :      * use B-M-H for needles of length 1, since the skip table can't possibly
    1264                 :      * save anything in that case.
    1265 EUB             :      */
    1266 GIC        1399 :     if (len1 >= len2 && len2 > 1)
    1267 ECB             :     {
    1268 CBC        1295 :         int         searchlength = len1 - len2;
    1269 ECB             :         int         skiptablemask;
    1270                 :         int         last;
    1271                 :         int         i;
    1272 CBC        1295 :         const char *str2 = state->str2;
    1273 ECB             : 
    1274                 :         /*
    1275                 :          * First we must determine how much of the skip table to use.  The
    1276                 :          * declaration of TextPositionState allows up to 256 elements, but for
    1277                 :          * short search problems we don't really want to have to initialize so
    1278                 :          * many elements --- it would take too long in comparison to the
    1279                 :          * actual search time.  So we choose a useful skip table size based on
    1280                 :          * the haystack length minus the needle length.  The closer the needle
    1281                 :          * length is to the haystack length the less useful skipping becomes.
    1282                 :          *
    1283                 :          * Note: since we use bit-masking to select table elements, the skip
    1284                 :          * table size MUST be a power of 2, and so the mask must be 2^N-1.
    1285                 :          */
    1286 GIC        1295 :         if (searchlength < 16)
    1287 CBC          27 :             skiptablemask = 3;
    1288 GIC        1268 :         else if (searchlength < 64)
    1289               8 :             skiptablemask = 7;
    1290            1260 :         else if (searchlength < 128)
    1291 CBC           7 :             skiptablemask = 15;
    1292 GIC        1253 :         else if (searchlength < 512)
    1293              95 :             skiptablemask = 31;
    1294            1158 :         else if (searchlength < 2048)
    1295            1096 :             skiptablemask = 63;
    1296              62 :         else if (searchlength < 4096)
    1297              28 :             skiptablemask = 127;
    1298                 :         else
    1299              34 :             skiptablemask = 255;
    1300            1295 :         state->skiptablemask = skiptablemask;
    1301                 : 
    1302                 :         /*
    1303                 :          * Initialize the skip table.  We set all elements to the needle
    1304                 :          * length, since this is the correct skip distance for any character
    1305 ECB             :          * not found in the needle.
    1306                 :          */
    1307 CBC       87051 :         for (i = 0; i <= skiptablemask; i++)
    1308           85756 :             state->skiptable[i] = len2;
    1309 ECB             : 
    1310                 :         /*
    1311                 :          * Now examine the needle.  For each character except the last one,
    1312                 :          * set the corresponding table element to the appropriate skip
    1313                 :          * distance.  Note that when two characters share the same skip table
    1314                 :          * entry, the one later in the needle must determine the skip
    1315                 :          * distance.
    1316                 :          */
    1317 GIC        1295 :         last = len2 - 1;
    1318 ECB             : 
    1319 CBC       16263 :         for (i = 0; i < last; i++)
    1320 GIC       14968 :             state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
    1321                 :     }
    1322            1399 : }
    1323                 : 
    1324                 : /*
    1325                 :  * Advance to the next match, starting from the end of the previous match
    1326 ECB             :  * (or the beginning of the string, on first call).  Returns true if a match
    1327                 :  * is found.
    1328                 :  *
    1329                 :  * Note that this refuses to match an empty-string needle.  Most callers
    1330                 :  * will have handled that case specially and we'll never see it here.
    1331                 :  */
    1332                 : static bool
    1333 GIC        4789 : text_position_next(TextPositionState *state)
    1334                 : {
    1335            4789 :     int         needle_len = state->len2;
    1336 ECB             :     char       *start_ptr;
    1337                 :     char       *matchptr;
    1338                 : 
    1339 CBC        4789 :     if (needle_len <= 0)
    1340 UIC           0 :         return false;           /* result for empty pattern */
    1341 ECB             : 
    1342                 :     /* Start from the point right after the previous match. */
    1343 GIC        4789 :     if (state->last_match)
    1344            3384 :         start_ptr = state->last_match + needle_len;
    1345                 :     else
    1346            1405 :         start_ptr = state->str1;
    1347                 : 
    1348            4789 : retry:
    1349            4789 :     matchptr = text_position_next_internal(start_ptr, state);
    1350                 : 
    1351            4789 :     if (!matchptr)
    1352 CBC        1369 :         return false;
    1353                 : 
    1354 ECB             :     /*
    1355                 :      * Found a match for the byte sequence.  If this is a multibyte encoding,
    1356                 :      * where one character's byte sequence can appear inside a longer
    1357                 :      * multi-byte character, we need to verify that the match was at a
    1358                 :      * character boundary, not in the middle of a multi-byte character.
    1359 EUB             :      */
    1360 GIC        3420 :     if (state->is_multibyte_char_in_char)
    1361                 :     {
    1362 ECB             :         /* Walk one character at a time, until we reach the match. */
    1363                 : 
    1364                 :         /* the search should never move backwards. */
    1365 LBC           0 :         Assert(state->refpoint <= matchptr);
    1366                 : 
    1367               0 :         while (state->refpoint < matchptr)
    1368 ECB             :         {
    1369                 :             /* step to next character. */
    1370 LBC           0 :             state->refpoint += pg_mblen(state->refpoint);
    1371               0 :             state->refpos++;
    1372                 : 
    1373                 :             /*
    1374                 :              * If we stepped over the match's start position, then it was a
    1375                 :              * false positive, where the byte sequence appeared in the middle
    1376                 :              * of a multi-byte character.  Skip it, and continue the search at
    1377                 :              * the next character boundary.
    1378                 :              */
    1379               0 :             if (state->refpoint > matchptr)
    1380                 :             {
    1381 UIC           0 :                 start_ptr = state->refpoint;
    1382               0 :                 goto retry;
    1383                 :             }
    1384 EUB             :         }
    1385                 :     }
    1386                 : 
    1387 GIC        3420 :     state->last_match = matchptr;
    1388            3420 :     return true;
    1389 EUB             : }
    1390                 : 
    1391                 : /*
    1392                 :  * Subroutine of text_position_next().  This searches for the raw byte
    1393                 :  * sequence, ignoring any multi-byte encoding issues.  Returns the first
    1394                 :  * match starting at 'start_ptr', or NULL if no match is found.
    1395                 :  */
    1396                 : static char *
    1397 GIC        4789 : text_position_next_internal(char *start_ptr, TextPositionState *state)
    1398 EUB             : {
    1399 GIC        4789 :     int         haystack_len = state->len1;
    1400 GBC        4789 :     int         needle_len = state->len2;
    1401            4789 :     int         skiptablemask = state->skiptablemask;
    1402 GIC        4789 :     const char *haystack = state->str1;
    1403            4789 :     const char *needle = state->str2;
    1404            4789 :     const char *haystack_end = &haystack[haystack_len];
    1405                 :     const char *hptr;
    1406 ECB             : 
    1407 CBC        4789 :     Assert(start_ptr >= haystack && start_ptr <= haystack_end);
    1408                 : 
    1409 GIC        4789 :     if (needle_len == 1)
    1410                 :     {
    1411                 :         /* No point in using B-M-H for a one-character needle */
    1412             377 :         char        nchar = *needle;
    1413                 : 
    1414             377 :         hptr = start_ptr;
    1415            2877 :         while (hptr < haystack_end)
    1416 ECB             :         {
    1417 GIC        2794 :             if (*hptr == nchar)
    1418 CBC         294 :                 return (char *) hptr;
    1419            2500 :             hptr++;
    1420 ECB             :         }
    1421                 :     }
    1422                 :     else
    1423                 :     {
    1424 GIC        4412 :         const char *needle_last = &needle[needle_len - 1];
    1425                 : 
    1426 ECB             :         /* Start at startpos plus the length of the needle */
    1427 GIC        4412 :         hptr = start_ptr + needle_len - 1;
    1428 CBC      116857 :         while (hptr < haystack_end)
    1429                 :         {
    1430                 :             /* Match the needle scanning *backward* */
    1431 ECB             :             const char *nptr;
    1432                 :             const char *p;
    1433                 : 
    1434 CBC      115571 :             nptr = needle_last;
    1435 GIC      115571 :             p = hptr;
    1436 CBC      160855 :             while (*nptr == *p)
    1437 ECB             :             {
    1438                 :                 /* Matched it all?  If so, return 1-based position */
    1439 GIC       48410 :                 if (nptr == needle)
    1440            3126 :                     return (char *) p;
    1441           45284 :                 nptr--, p--;
    1442                 :             }
    1443 ECB             : 
    1444                 :             /*
    1445                 :              * No match, so use the haystack char at hptr to decide how far to
    1446                 :              * advance.  If the needle had any occurrence of that character
    1447                 :              * (or more precisely, one sharing the same skiptable entry)
    1448                 :              * before its last character, then we advance far enough to align
    1449                 :              * the last such needle character with that haystack position.
    1450                 :              * Otherwise we can advance by the whole needle length.
    1451                 :              */
    1452 GIC      112445 :             hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
    1453 ECB             :         }
    1454                 :     }
    1455                 : 
    1456 GIC        1369 :     return 0;                   /* not found */
    1457                 : }
    1458 ECB             : 
    1459                 : /*
    1460                 :  * Return a pointer to the current match.
    1461                 :  *
    1462                 :  * The returned pointer points into the original haystack string.
    1463                 :  */
    1464                 : static char *
    1465 GIC        3381 : text_position_get_match_ptr(TextPositionState *state)
    1466                 : {
    1467            3381 :     return state->last_match;
    1468                 : }
    1469                 : 
    1470                 : /*
    1471 ECB             :  * Return the offset of the current match.
    1472                 :  *
    1473                 :  * The offset is in characters, 1-based.
    1474                 :  */
    1475                 : static int
    1476 GIC          24 : text_position_get_match_pos(TextPositionState *state)
    1477                 : {
    1478                 :     /* Convert the byte position to char position. */
    1479              48 :     state->refpos += pg_mbstrlen_with_len(state->refpoint,
    1480              24 :                                           state->last_match - state->refpoint);
    1481              24 :     state->refpoint = state->last_match;
    1482              24 :     return state->refpos + 1;
    1483                 : }
    1484 ECB             : 
    1485                 : /*
    1486                 :  * Reset search state to the initial state installed by text_position_setup.
    1487                 :  *
    1488                 :  * The next call to text_position_next will search from the beginning
    1489                 :  * of the string.
    1490                 :  */
    1491                 : static void
    1492 GIC           6 : text_position_reset(TextPositionState *state)
    1493                 : {
    1494               6 :     state->last_match = NULL;
    1495 CBC           6 :     state->refpoint = state->str1;
    1496 GIC           6 :     state->refpos = 0;
    1497               6 : }
    1498 ECB             : 
    1499                 : static void
    1500 CBC        1399 : text_position_cleanup(TextPositionState *state)
    1501 ECB             : {
    1502                 :     /* no cleanup needed */
    1503 GIC        1399 : }
    1504                 : 
    1505                 : 
    1506                 : static void
    1507         8518884 : check_collation_set(Oid collid)
    1508                 : {
    1509         8518884 :     if (!OidIsValid(collid))
    1510                 :     {
    1511 ECB             :         /*
    1512                 :          * This typically means that the parser could not resolve a conflict
    1513                 :          * of implicit collations, so report it that way.
    1514                 :          */
    1515 CBC          24 :         ereport(ERROR,
    1516 ECB             :                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
    1517                 :                  errmsg("could not determine which collation to use for string comparison"),
    1518                 :                  errhint("Use the COLLATE clause to set the collation explicitly.")));
    1519                 :     }
    1520 GIC     8518860 : }
    1521                 : 
    1522 ECB             : /* varstr_cmp()
    1523                 :  * Comparison function for text strings with given lengths.
    1524                 :  * Includes locale support, but must copy strings to temporary memory
    1525                 :  *  to allow null-termination for inputs to strcoll().
    1526                 :  * Returns an integer less than, equal to, or greater than zero, indicating
    1527                 :  * whether arg1 is less than, equal to, or greater than arg2.
    1528                 :  *
    1529                 :  * Note: many functions that depend on this are marked leakproof; therefore,
    1530                 :  * avoid reporting the actual contents of the input when throwing errors.
    1531                 :  * All errors herein should be things that can't happen except on corrupt
    1532                 :  * data, anyway; otherwise we will have trouble with indexing strings that
    1533                 :  * would cause them.
    1534                 :  */
    1535                 : int
    1536 GIC     4679892 : varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
    1537                 : {
    1538                 :     int         result;
    1539 ECB             : 
    1540 GIC     4679892 :     check_collation_set(collid);
    1541                 : 
    1542                 :     /*
    1543                 :      * Unfortunately, there is no strncoll(), so in the non-C locale case we
    1544                 :      * have to do some memory copying.  This turns out to be significantly
    1545                 :      * slower, so we optimize the case where LC_COLLATE is C.  We also try to
    1546                 :      * optimize relatively-short strings by avoiding palloc/pfree overhead.
    1547                 :      */
    1548         4679877 :     if (lc_collate_is_c(collid))
    1549                 :     {
    1550         2394653 :         result = memcmp(arg1, arg2, Min(len1, len2));
    1551         2394653 :         if ((result == 0) && (len1 != len2))
    1552           68787 :             result = (len1 < len2) ? -1 : 1;
    1553                 :     }
    1554                 :     else
    1555 ECB             :     {
    1556                 :         pg_locale_t mylocale;
    1557                 : 
    1558 GIC     2285224 :         mylocale = pg_newlocale_from_collation(collid);
    1559                 : 
    1560                 :         /*
    1561                 :          * memcmp() can't tell us which of two unequal strings sorts first,
    1562                 :          * but it's a cheap way to tell if they're equal.  Testing shows that
    1563 ECB             :          * memcmp() followed by strcoll() is only trivially slower than
    1564                 :          * strcoll() by itself, so we don't lose much if this doesn't work out
    1565                 :          * very often, and if it does - for example, because there are many
    1566                 :          * equal strings in the input - then we win big by avoiding expensive
    1567                 :          * collation-aware comparisons.
    1568                 :          */
    1569 GIC     2285224 :         if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
    1570          949522 :             return 0;
    1571                 : 
    1572 GNC     1335702 :         result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
    1573                 : 
    1574                 :         /* Break tie if necessary. */
    1575         1335702 :         if (result == 0 && pg_locale_deterministic(mylocale))
    1576                 :         {
    1577 UNC           0 :             result = memcmp(arg1, arg2, Min(len1, len2));
    1578               0 :             if ((result == 0) && (len1 != len2))
    1579               0 :                 result = (len1 < len2) ? -1 : 1;
    1580                 :         }
    1581                 :     }
    1582                 : 
    1583 CBC     3730355 :     return result;
    1584                 : }
    1585                 : 
    1586                 : /* text_cmp()
    1587 ECB             :  * Internal comparison function for text strings.
    1588                 :  * Returns -1, 0 or 1
    1589                 :  */
    1590                 : static int
    1591 GIC     3848460 : text_cmp(text *arg1, text *arg2, Oid collid)
    1592                 : {
    1593 ECB             :     char       *a1p,
    1594                 :                *a2p;
    1595                 :     int         len1,
    1596                 :                 len2;
    1597                 : 
    1598 CBC     3848460 :     a1p = VARDATA_ANY(arg1);
    1599 GIC     3848460 :     a2p = VARDATA_ANY(arg2);
    1600                 : 
    1601         3848460 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    1602 CBC     3848460 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    1603                 : 
    1604         3848460 :     return varstr_cmp(a1p, len1, a2p, len2, collid);
    1605 ECB             : }
    1606                 : 
    1607                 : /*
    1608                 :  * Comparison functions for text strings.
    1609                 :  *
    1610                 :  * Note: btree indexes need these routines not to leak memory; therefore,
    1611                 :  * be careful to free working copies of toasted datums.  Most places don't
    1612                 :  * need to be so careful.
    1613                 :  */
    1614                 : 
    1615                 : Datum
    1616 GIC     3610619 : texteq(PG_FUNCTION_ARGS)
    1617 ECB             : {
    1618 GIC     3610619 :     Oid         collid = PG_GET_COLLATION();
    1619 CBC     3610619 :     bool        locale_is_c = false;
    1620         3610619 :     pg_locale_t mylocale = 0;
    1621                 :     bool        result;
    1622                 : 
    1623         3610619 :     check_collation_set(collid);
    1624                 : 
    1625         3610619 :     if (lc_collate_is_c(collid))
    1626          273132 :         locale_is_c = true;
    1627                 :     else
    1628         3337487 :         mylocale = pg_newlocale_from_collation(collid);
    1629                 : 
    1630 GNC     3610619 :     if (locale_is_c || pg_locale_deterministic(mylocale))
    1631 GIC     3610377 :     {
    1632 CBC     3610377 :         Datum       arg1 = PG_GETARG_DATUM(0);
    1633 GIC     3610377 :         Datum       arg2 = PG_GETARG_DATUM(1);
    1634 ECB             :         Size        len1,
    1635                 :                     len2;
    1636                 : 
    1637                 :         /*
    1638                 :          * Since we only care about equality or not-equality, we can avoid all
    1639                 :          * the expense of strcoll() here, and just do bitwise comparison.  In
    1640                 :          * fact, we don't even have to do a bitwise comparison if we can show
    1641                 :          * the lengths of the strings are unequal; which might save us from
    1642                 :          * having to detoast one or both values.
    1643                 :          */
    1644 GIC     3610377 :         len1 = toast_raw_datum_size(arg1);
    1645         3610377 :         len2 = toast_raw_datum_size(arg2);
    1646         3610377 :         if (len1 != len2)
    1647 CBC     1435697 :             result = false;
    1648                 :         else
    1649 ECB             :         {
    1650 CBC     2174680 :             text       *targ1 = DatumGetTextPP(arg1);
    1651         2174680 :             text       *targ2 = DatumGetTextPP(arg2);
    1652 ECB             : 
    1653 GIC     2174680 :             result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1654                 :                              len1 - VARHDRSZ) == 0);
    1655                 : 
    1656         2174680 :             PG_FREE_IF_COPY(targ1, 0);
    1657 CBC     2174680 :             PG_FREE_IF_COPY(targ2, 1);
    1658                 :         }
    1659 ECB             :     }
    1660                 :     else
    1661                 :     {
    1662 CBC         242 :         text       *arg1 = PG_GETARG_TEXT_PP(0);
    1663 GBC         242 :         text       *arg2 = PG_GETARG_TEXT_PP(1);
    1664                 : 
    1665 GIC         242 :         result = (text_cmp(arg1, arg2, collid) == 0);
    1666                 : 
    1667 CBC         242 :         PG_FREE_IF_COPY(arg1, 0);
    1668             242 :         PG_FREE_IF_COPY(arg2, 1);
    1669 ECB             :     }
    1670 EUB             : 
    1671 GIC     3610619 :     PG_RETURN_BOOL(result);
    1672                 : }
    1673 ECB             : 
    1674                 : Datum
    1675 GIC        9726 : textne(PG_FUNCTION_ARGS)
    1676 ECB             : {
    1677 CBC        9726 :     Oid         collid = PG_GET_COLLATION();
    1678 GIC        9726 :     bool        locale_is_c = false;
    1679 CBC        9726 :     pg_locale_t mylocale = 0;
    1680 ECB             :     bool        result;
    1681                 : 
    1682 GIC        9726 :     check_collation_set(collid);
    1683 ECB             : 
    1684 GIC        9726 :     if (lc_collate_is_c(collid))
    1685               9 :         locale_is_c = true;
    1686                 :     else
    1687 CBC        9717 :         mylocale = pg_newlocale_from_collation(collid);
    1688                 : 
    1689 GNC        9726 :     if (locale_is_c || pg_locale_deterministic(mylocale))
    1690 CBC        9714 :     {
    1691 GIC        9714 :         Datum       arg1 = PG_GETARG_DATUM(0);
    1692            9714 :         Datum       arg2 = PG_GETARG_DATUM(1);
    1693 ECB             :         Size        len1,
    1694                 :                     len2;
    1695                 : 
    1696                 :         /* See comment in texteq() */
    1697 GIC        9714 :         len1 = toast_raw_datum_size(arg1);
    1698 CBC        9714 :         len2 = toast_raw_datum_size(arg2);
    1699 GIC        9714 :         if (len1 != len2)
    1700             917 :             result = true;
    1701                 :         else
    1702 ECB             :         {
    1703 GIC        8797 :             text       *targ1 = DatumGetTextPP(arg1);
    1704 CBC        8797 :             text       *targ2 = DatumGetTextPP(arg2);
    1705 ECB             : 
    1706 GIC        8797 :             result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1707                 :                              len1 - VARHDRSZ) != 0);
    1708 ECB             : 
    1709 GIC        8797 :             PG_FREE_IF_COPY(targ1, 0);
    1710            8797 :             PG_FREE_IF_COPY(targ2, 1);
    1711 ECB             :         }
    1712                 :     }
    1713                 :     else
    1714                 :     {
    1715 CBC          12 :         text       *arg1 = PG_GETARG_TEXT_PP(0);
    1716 GIC          12 :         text       *arg2 = PG_GETARG_TEXT_PP(1);
    1717                 : 
    1718              12 :         result = (text_cmp(arg1, arg2, collid) != 0);
    1719                 : 
    1720              12 :         PG_FREE_IF_COPY(arg1, 0);
    1721              12 :         PG_FREE_IF_COPY(arg2, 1);
    1722                 :     }
    1723                 : 
    1724            9726 :     PG_RETURN_BOOL(result);
    1725                 : }
    1726                 : 
    1727                 : Datum
    1728           62299 : text_lt(PG_FUNCTION_ARGS)
    1729 ECB             : {
    1730 GIC       62299 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1731 CBC       62299 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1732 ECB             :     bool        result;
    1733                 : 
    1734 CBC       62299 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
    1735                 : 
    1736           62284 :     PG_FREE_IF_COPY(arg1, 0);
    1737 GIC       62284 :     PG_FREE_IF_COPY(arg2, 1);
    1738                 : 
    1739           62284 :     PG_RETURN_BOOL(result);
    1740                 : }
    1741                 : 
    1742                 : Datum
    1743          162176 : text_le(PG_FUNCTION_ARGS)
    1744                 : {
    1745          162176 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1746          162176 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1747                 :     bool        result;
    1748                 : 
    1749          162176 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
    1750                 : 
    1751 CBC      162176 :     PG_FREE_IF_COPY(arg1, 0);
    1752 GIC      162176 :     PG_FREE_IF_COPY(arg2, 1);
    1753 ECB             : 
    1754 CBC      162176 :     PG_RETURN_BOOL(result);
    1755 ECB             : }
    1756                 : 
    1757                 : Datum
    1758 GIC       56829 : text_gt(PG_FUNCTION_ARGS)
    1759 ECB             : {
    1760 GIC       56829 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1761           56829 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1762 ECB             :     bool        result;
    1763                 : 
    1764 CBC       56829 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
    1765                 : 
    1766 GIC       56829 :     PG_FREE_IF_COPY(arg1, 0);
    1767           56829 :     PG_FREE_IF_COPY(arg2, 1);
    1768                 : 
    1769           56829 :     PG_RETURN_BOOL(result);
    1770                 : }
    1771                 : 
    1772                 : Datum
    1773 CBC       92446 : text_ge(PG_FUNCTION_ARGS)
    1774                 : {
    1775 GIC       92446 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1776           92446 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1777                 :     bool        result;
    1778 ECB             : 
    1779 GIC       92446 :     result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
    1780 EUB             : 
    1781 GIC       92446 :     PG_FREE_IF_COPY(arg1, 0);
    1782 GBC       92446 :     PG_FREE_IF_COPY(arg2, 1);
    1783                 : 
    1784 GIC       92446 :     PG_RETURN_BOOL(result);
    1785 ECB             : }
    1786                 : 
    1787                 : Datum
    1788 GIC       18957 : text_starts_with(PG_FUNCTION_ARGS)
    1789                 : {
    1790           18957 :     Datum       arg1 = PG_GETARG_DATUM(0);
    1791           18957 :     Datum       arg2 = PG_GETARG_DATUM(1);
    1792           18957 :     Oid         collid = PG_GET_COLLATION();
    1793           18957 :     pg_locale_t mylocale = 0;
    1794                 :     bool        result;
    1795                 :     Size        len1,
    1796                 :                 len2;
    1797                 : 
    1798           18957 :     check_collation_set(collid);
    1799                 : 
    1800           18957 :     if (!lc_collate_is_c(collid))
    1801           18957 :         mylocale = pg_newlocale_from_collation(collid);
    1802                 : 
    1803 GNC       18957 :     if (!pg_locale_deterministic(mylocale))
    1804 LBC           0 :         ereport(ERROR,
    1805                 :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1806                 :                  errmsg("nondeterministic collations are not supported for substring searches")));
    1807                 : 
    1808 GIC       18957 :     len1 = toast_raw_datum_size(arg1);
    1809           18957 :     len2 = toast_raw_datum_size(arg2);
    1810           18957 :     if (len2 > len1)
    1811 UIC           0 :         result = false;
    1812                 :     else
    1813 ECB             :     {
    1814 GIC       18957 :         text       *targ1 = text_substring(arg1, 1, len2, false);
    1815 CBC       18957 :         text       *targ2 = DatumGetTextPP(arg2);
    1816 ECB             : 
    1817 CBC       18957 :         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
    1818           18957 :                          VARSIZE_ANY_EXHDR(targ2)) == 0);
    1819 ECB             : 
    1820 GIC       18957 :         PG_FREE_IF_COPY(targ1, 0);
    1821 CBC       18957 :         PG_FREE_IF_COPY(targ2, 1);
    1822 ECB             :     }
    1823                 : 
    1824 CBC       18957 :     PG_RETURN_BOOL(result);
    1825 ECB             : }
    1826                 : 
    1827                 : Datum
    1828 GIC     3316584 : bttextcmp(PG_FUNCTION_ARGS)
    1829                 : {
    1830         3316584 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    1831         3316584 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    1832                 :     int32       result;
    1833                 : 
    1834         3316584 :     result = text_cmp(arg1, arg2, PG_GET_COLLATION());
    1835                 : 
    1836         3316584 :     PG_FREE_IF_COPY(arg1, 0);
    1837         3316584 :     PG_FREE_IF_COPY(arg2, 1);
    1838                 : 
    1839         3316584 :     PG_RETURN_INT32(result);
    1840                 : }
    1841 ECB             : 
    1842                 : Datum
    1843 CBC       44359 : bttextsortsupport(PG_FUNCTION_ARGS)
    1844 ECB             : {
    1845 GIC       44359 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    1846           44359 :     Oid         collid = ssup->ssup_collation;
    1847                 :     MemoryContext oldcontext;
    1848                 : 
    1849           44359 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
    1850                 : 
    1851 ECB             :     /* Use generic string SortSupport */
    1852 GIC       44359 :     varstr_sortsupport(ssup, TEXTOID, collid);
    1853 ECB             : 
    1854 CBC       44350 :     MemoryContextSwitchTo(oldcontext);
    1855 ECB             : 
    1856 CBC       44350 :     PG_RETURN_VOID();
    1857 ECB             : }
    1858                 : 
    1859                 : /*
    1860                 :  * Generic sortsupport interface for character type's operator classes.
    1861                 :  * Includes locale support, and support for BpChar semantics (i.e. removing
    1862                 :  * trailing spaces before comparison).
    1863                 :  *
    1864                 :  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
    1865                 :  * same representation.  Callers that always use the C collation (e.g.
    1866                 :  * non-collatable type callers like bytea) may have NUL bytes in their strings;
    1867                 :  * this will not work with any other collation, though.
    1868                 :  */
    1869                 : void
    1870 CBC       83052 : varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
    1871 ECB             : {
    1872 GIC       83052 :     bool        abbreviate = ssup->abbreviate;
    1873           83052 :     bool        collate_c = false;
    1874                 :     VarStringSortSupport *sss;
    1875           83052 :     pg_locale_t locale = 0;
    1876                 : 
    1877           83052 :     check_collation_set(collid);
    1878 ECB             : 
    1879                 :     /*
    1880                 :      * If possible, set ssup->comparator to a function which can be used to
    1881                 :      * directly compare two datums.  If we can do this, we'll avoid the
    1882                 :      * overhead of a trip through the fmgr layer for every comparison, which
    1883                 :      * can be substantial.
    1884                 :      *
    1885                 :      * Most typically, we'll set the comparator to varlenafastcmp_locale,
    1886                 :      * which uses strcoll() to perform comparisons.  We use that for the
    1887                 :      * BpChar case too, but type NAME uses namefastcmp_locale. However, if
    1888                 :      * LC_COLLATE = C, we can make things quite a bit faster with
    1889                 :      * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
    1890 EUB             :      * memcmp() rather than strcoll().
    1891 ECB             :      */
    1892 GBC       83043 :     if (lc_collate_is_c(collid))
    1893                 :     {
    1894 CBC       56902 :         if (typid == BPCHAROID)
    1895 GIC          11 :             ssup->comparator = bpcharfastcmp_c;
    1896           56891 :         else if (typid == NAMEOID)
    1897                 :         {
    1898           38227 :             ssup->comparator = namefastcmp_c;
    1899                 :             /* Not supporting abbreviation with type NAME, for now */
    1900           38227 :             abbreviate = false;
    1901                 :         }
    1902                 :         else
    1903           18664 :             ssup->comparator = varstrfastcmp_c;
    1904                 : 
    1905 CBC       56902 :         collate_c = true;
    1906                 :     }
    1907 ECB             :     else
    1908                 :     {
    1909                 :         /*
    1910                 :          * We need a collation-sensitive comparison.  To make things faster,
    1911                 :          * we'll figure out the collation based on the locale id and cache the
    1912                 :          * result.
    1913                 :          */
    1914 GIC       26141 :         locale = pg_newlocale_from_collation(collid);
    1915 ECB             : 
    1916                 :         /*
    1917                 :          * We use varlenafastcmp_locale except for type NAME.
    1918                 :          */
    1919 GIC       26141 :         if (typid == NAMEOID)
    1920                 :         {
    1921 UIC           0 :             ssup->comparator = namefastcmp_locale;
    1922                 :             /* Not supporting abbreviation with type NAME, for now */
    1923               0 :             abbreviate = false;
    1924 ECB             :         }
    1925                 :         else
    1926 CBC       26141 :             ssup->comparator = varlenafastcmp_locale;
    1927 ECB             :     }
    1928                 : 
    1929                 :     /*
    1930                 :      * Unfortunately, it seems that abbreviation for non-C collations is
    1931                 :      * broken on many common platforms; see pg_strxfrm_enabled().
    1932                 :      *
    1933                 :      * Even apart from the risk of broken locales, it's possible that there
    1934                 :      * are platforms where the use of abbreviated keys should be disabled at
    1935                 :      * compile time.  Having only 4 byte datums could make worst-case
    1936                 :      * performance drastically more likely, for example.  Moreover, macOS's
    1937                 :      * strxfrm() implementation is known to not effectively concentrate a
    1938                 :      * significant amount of entropy from the original string in earlier
    1939                 :      * transformed blobs.  It's possible that other supported platforms are
    1940                 :      * similarly encumbered.  So, if we ever get past disabling this
    1941                 :      * categorically, we may still want or need to disable it for particular
    1942                 :      * platforms.
    1943                 :      */
    1944 GNC       83043 :     if (!collate_c && !pg_strxfrm_enabled(locale))
    1945 CBC         105 :         abbreviate = false;
    1946                 : 
    1947 ECB             :     /*
    1948                 :      * If we're using abbreviated keys, or if we're using a locale-aware
    1949                 :      * comparison, we need to initialize a VarStringSortSupport object. Both
    1950                 :      * cases will make use of the temporary buffers we initialize here for
    1951                 :      * scratch space (and to detect requirement for BpChar semantics from
    1952                 :      * caller), and the abbreviation case requires additional state.
    1953                 :      */
    1954 GIC       83043 :     if (abbreviate || !collate_c)
    1955                 :     {
    1956           27973 :         sss = palloc(sizeof(VarStringSortSupport));
    1957           27973 :         sss->buf1 = palloc(TEXTBUFLEN);
    1958           27973 :         sss->buflen1 = TEXTBUFLEN;
    1959 GBC       27973 :         sss->buf2 = palloc(TEXTBUFLEN);
    1960 GIC       27973 :         sss->buflen2 = TEXTBUFLEN;
    1961 EUB             :         /* Start with invalid values */
    1962 GBC       27973 :         sss->last_len1 = -1;
    1963 GIC       27973 :         sss->last_len2 = -1;
    1964 EUB             :         /* Initialize */
    1965 GBC       27973 :         sss->last_returned = 0;
    1966 GIC       27973 :         sss->locale = locale;
    1967                 : 
    1968                 :         /*
    1969                 :          * To avoid somehow confusing a strxfrm() blob and an original string,
    1970                 :          * constantly keep track of the variety of data that buf1 and buf2
    1971                 :          * currently contain.
    1972                 :          *
    1973 ECB             :          * Comparisons may be interleaved with conversion calls.  Frequently,
    1974                 :          * conversions and comparisons are batched into two distinct phases,
    1975                 :          * but the correctness of caching cannot hinge upon this.  For
    1976                 :          * comparison caching, buffer state is only trusted if cache_blob is
    1977                 :          * found set to false, whereas strxfrm() caching only trusts the state
    1978                 :          * when cache_blob is found set to true.
    1979                 :          *
    1980                 :          * Arbitrarily initialize cache_blob to true.
    1981                 :          */
    1982 GIC       27973 :         sss->cache_blob = true;
    1983           27973 :         sss->collate_c = collate_c;
    1984           27973 :         sss->typid = typid;
    1985           27973 :         ssup->ssup_extra = sss;
    1986                 : 
    1987                 :         /*
    1988                 :          * If possible, plan to use the abbreviated keys optimization.  The
    1989                 :          * core code may switch back to authoritative comparator should
    1990                 :          * abbreviation be aborted.
    1991                 :          */
    1992           27973 :         if (abbreviate)
    1993                 :         {
    1994           24109 :             sss->prop_card = 0.20;
    1995 CBC       24109 :             initHyperLogLog(&sss->abbr_card, 10);
    1996 GIC       24109 :             initHyperLogLog(&sss->full_card, 10);
    1997           24109 :             ssup->abbrev_full_comparator = ssup->comparator;
    1998 CBC       24109 :             ssup->comparator = ssup_datum_unsigned_cmp;
    1999 GIC       24109 :             ssup->abbrev_converter = varstr_abbrev_convert;
    2000           24109 :             ssup->abbrev_abort = varstr_abbrev_abort;
    2001 ECB             :         }
    2002                 :     }
    2003 GIC       83043 : }
    2004                 : 
    2005 ECB             : /*
    2006                 :  * sortsupport comparison func (for C locale case)
    2007 EUB             :  */
    2008                 : static int
    2009 GIC    63929703 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
    2010 ECB             : {
    2011 GIC    63929703 :     VarString  *arg1 = DatumGetVarStringPP(x);
    2012 CBC    63929703 :     VarString  *arg2 = DatumGetVarStringPP(y);
    2013 ECB             :     char       *a1p,
    2014                 :                *a2p;
    2015                 :     int         len1,
    2016                 :                 len2,
    2017                 :                 result;
    2018                 : 
    2019 GIC    63929703 :     a1p = VARDATA_ANY(arg1);
    2020        63929703 :     a2p = VARDATA_ANY(arg2);
    2021                 : 
    2022        63929703 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    2023        63929703 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    2024                 : 
    2025        63929703 :     result = memcmp(a1p, a2p, Min(len1, len2));
    2026 CBC    63929703 :     if ((result == 0) && (len1 != len2))
    2027         1846743 :         result = (len1 < len2) ? -1 : 1;
    2028                 : 
    2029 ECB             :     /* We can't afford to leak memory here. */
    2030 CBC    63929703 :     if (PointerGetDatum(arg1) != x)
    2031 LBC           0 :         pfree(arg1);
    2032 CBC    63929703 :     if (PointerGetDatum(arg2) != y)
    2033 UIC           0 :         pfree(arg2);
    2034                 : 
    2035 GIC    63929703 :     return result;
    2036                 : }
    2037                 : 
    2038                 : /*
    2039                 :  * sortsupport comparison func (for BpChar C locale case)
    2040                 :  *
    2041 ECB             :  * BpChar outsources its sortsupport to this module.  Specialization for the
    2042                 :  * varstr_sortsupport BpChar case, modeled on
    2043                 :  * internal_bpchar_pattern_compare().
    2044                 :  */
    2045                 : static int
    2046 GIC           8 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
    2047 ECB             : {
    2048 GIC           8 :     BpChar     *arg1 = DatumGetBpCharPP(x);
    2049               8 :     BpChar     *arg2 = DatumGetBpCharPP(y);
    2050 ECB             :     char       *a1p,
    2051                 :                *a2p;
    2052                 :     int         len1,
    2053                 :                 len2,
    2054                 :                 result;
    2055                 : 
    2056 CBC           8 :     a1p = VARDATA_ANY(arg1);
    2057 GBC           8 :     a2p = VARDATA_ANY(arg2);
    2058                 : 
    2059 GIC           8 :     len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
    2060 CBC           8 :     len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
    2061 ECB             : 
    2062 CBC           8 :     result = memcmp(a1p, a2p, Min(len1, len2));
    2063 GIC           8 :     if ((result == 0) && (len1 != len2))
    2064 UIC           0 :         result = (len1 < len2) ? -1 : 1;
    2065                 : 
    2066                 :     /* We can't afford to leak memory here. */
    2067 GIC           8 :     if (PointerGetDatum(arg1) != x)
    2068 UIC           0 :         pfree(arg1);
    2069 GIC           8 :     if (PointerGetDatum(arg2) != y)
    2070 UIC           0 :         pfree(arg2);
    2071                 : 
    2072 GIC           8 :     return result;
    2073 ECB             : }
    2074                 : 
    2075                 : /*
    2076                 :  * sortsupport comparison func (for NAME C locale case)
    2077                 :  */
    2078                 : static int
    2079 GIC    63594147 : namefastcmp_c(Datum x, Datum y, SortSupport ssup)
    2080                 : {
    2081        63594147 :     Name        arg1 = DatumGetName(x);
    2082        63594147 :     Name        arg2 = DatumGetName(y);
    2083                 : 
    2084        63594147 :     return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
    2085                 : }
    2086 ECB             : 
    2087                 : /*
    2088                 :  * sortsupport comparison func (for locale case with all varlena types)
    2089                 :  */
    2090                 : static int
    2091 GIC    18640117 : varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
    2092 ECB             : {
    2093 CBC    18640117 :     VarString  *arg1 = DatumGetVarStringPP(x);
    2094 GIC    18640117 :     VarString  *arg2 = DatumGetVarStringPP(y);
    2095                 :     char       *a1p,
    2096                 :                *a2p;
    2097                 :     int         len1,
    2098                 :                 len2,
    2099                 :                 result;
    2100                 : 
    2101        18640117 :     a1p = VARDATA_ANY(arg1);
    2102        18640117 :     a2p = VARDATA_ANY(arg2);
    2103                 : 
    2104        18640117 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    2105        18640117 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    2106                 : 
    2107        18640117 :     result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
    2108                 : 
    2109                 :     /* We can't afford to leak memory here. */
    2110        18640117 :     if (PointerGetDatum(arg1) != x)
    2111               3 :         pfree(arg1);
    2112        18640117 :     if (PointerGetDatum(arg2) != y)
    2113               3 :         pfree(arg2);
    2114                 : 
    2115        18640117 :     return result;
    2116                 : }
    2117                 : 
    2118                 : /*
    2119                 :  * sortsupport comparison func (for locale case with NAME type)
    2120                 :  */
    2121                 : static int
    2122 LBC           0 : namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
    2123 ECB             : {
    2124 UIC           0 :     Name        arg1 = DatumGetName(x);
    2125               0 :     Name        arg2 = DatumGetName(y);
    2126                 : 
    2127               0 :     return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
    2128               0 :                                 NameStr(*arg2), strlen(NameStr(*arg2)),
    2129                 :                                 ssup);
    2130                 : }
    2131                 : 
    2132                 : /*
    2133                 :  * sortsupport comparison func for locale cases
    2134 ECB             :  */
    2135                 : static int
    2136 CBC    18640117 : varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
    2137 ECB             : {
    2138 GIC    18640117 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    2139                 :     int         result;
    2140                 :     bool        arg1_match;
    2141 ECB             : 
    2142                 :     /* Fast pre-check for equality, as discussed in varstr_cmp() */
    2143 GIC    18640117 :     if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
    2144 ECB             :     {
    2145                 :         /*
    2146                 :          * No change in buf1 or buf2 contents, so avoid changing last_len1 or
    2147                 :          * last_len2.  Existing contents of buffers might still be used by
    2148                 :          * next call.
    2149                 :          *
    2150                 :          * It's fine to allow the comparison of BpChar padding bytes here,
    2151                 :          * even though that implies that the memcmp() will usually be
    2152                 :          * performed for BpChar callers (though multibyte characters could
    2153                 :          * still prevent that from occurring).  The memcmp() is still very
    2154                 :          * cheap, and BpChar's funny semantics have us remove trailing spaces
    2155                 :          * (not limited to padding), so we need make no distinction between
    2156                 :          * padding space characters and "real" space characters.
    2157                 :          */
    2158 CBC     6879216 :         return 0;
    2159                 :     }
    2160 ECB             : 
    2161 GIC    11760901 :     if (sss->typid == BPCHAROID)
    2162 EUB             :     {
    2163                 :         /* Get true number of bytes, ignoring trailing spaces */
    2164 GBC       16919 :         len1 = bpchartruelen(a1p, len1);
    2165 GIC       16919 :         len2 = bpchartruelen(a2p, len2);
    2166                 :     }
    2167 ECB             : 
    2168 GIC    11760901 :     if (len1 >= sss->buflen1)
    2169 ECB             :     {
    2170 UIC           0 :         sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
    2171               0 :         sss->buf1 = repalloc(sss->buf1, sss->buflen1);
    2172                 :     }
    2173 GIC    11760901 :     if (len2 >= sss->buflen2)
    2174                 :     {
    2175               3 :         sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
    2176               3 :         sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    2177                 :     }
    2178                 : 
    2179                 :     /*
    2180                 :      * We're likely to be asked to compare the same strings repeatedly, and
    2181                 :      * memcmp() is so much cheaper than strcoll() that it pays to try to cache
    2182 EUB             :      * comparisons, even though in general there is no reason to think that
    2183                 :      * that will work out (every string datum may be unique).  Caching does
    2184                 :      * not slow things down measurably when it doesn't work out, and can speed
    2185                 :      * things up by rather a lot when it does.  In part, this is because the
    2186                 :      * memcmp() compares data from cachelines that are needed in L1 cache even
    2187                 :      * when the last comparison's result cannot be reused.
    2188                 :      */
    2189 GIC    11760901 :     arg1_match = true;
    2190        11760901 :     if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
    2191                 :     {
    2192 GBC    10502281 :         arg1_match = false;
    2193 GIC    10502281 :         memcpy(sss->buf1, a1p, len1);
    2194 GBC    10502281 :         sss->buf1[len1] = '\0';
    2195 GIC    10502281 :         sss->last_len1 = len1;
    2196                 :     }
    2197                 : 
    2198                 :     /*
    2199                 :      * If we're comparing the same two strings as last time, we can return the
    2200                 :      * same answer without calling strcoll() again.  This is more likely than
    2201                 :      * it seems (at least with moderate to low cardinality sets), because
    2202                 :      * quicksort compares the same pivot against many values.
    2203                 :      */
    2204        11760901 :     if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
    2205                 :     {
    2206         1944023 :         memcpy(sss->buf2, a2p, len2);
    2207 CBC     1944023 :         sss->buf2[len2] = '\0';
    2208 GIC     1944023 :         sss->last_len2 = len2;
    2209                 :     }
    2210         9816878 :     else if (arg1_match && !sss->cache_blob)
    2211                 :     {
    2212                 :         /* Use result cached following last actual strcoll() call */
    2213         1067944 :         return sss->last_returned;
    2214                 :     }
    2215                 : 
    2216 GNC    10692957 :     result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
    2217                 : 
    2218                 :     /* Break tie if necessary. */
    2219        10692957 :     if (result == 0 && pg_locale_deterministic(sss->locale))
    2220 LBC           0 :         result = strcmp(sss->buf1, sss->buf2);
    2221                 : 
    2222                 :     /* Cache result, perhaps saving an expensive strcoll() call next time */
    2223 GIC    10692957 :     sss->cache_blob = false;
    2224 CBC    10692957 :     sss->last_returned = result;
    2225 GIC    10692957 :     return result;
    2226                 : }
    2227 ECB             : 
    2228                 : /*
    2229                 :  * Conversion routine for sortsupport.  Converts original to abbreviated key
    2230                 :  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
    2231                 :  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
    2232                 :  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
    2233                 :  * locale is used, or in case of bytea, just memcpy() from original instead.
                         :  *
                         :  * "original" is the datum to abbreviate; "ssup" carries the
                         :  * VarStringSortSupport state in ssup_extra.  The return value is the
                         :  * packed prefix after DatumBigEndianToNative() has been applied.
                         :  *
                         :  * Side effects on the sort support state: buf1/buf2 may be enlarged with
                         :  * repalloc() and overwritten, last_len1/last_len2 are updated on the
                         :  * non-C path, and both HyperLogLog counters (full_card, abbr_card) are
                         :  * fed and cache_blob is set -- except when the blob cached by the
                         :  * previous call is reused, in which case control jumps straight to
                         :  * "done".  A detoasted copy of the input, if one was made, is freed
                         :  * before returning.
    2234                 :  */
    2235                 : static Datum
    2236 GIC      577029 : varstr_abbrev_convert(Datum original, SortSupport ssup)
    2237                 : {
    2238 GNC      577029 :     const size_t max_prefix_bytes = sizeof(Datum);
    2239 CBC      577029 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    2240 GBC      577029 :     VarString  *authoritative = DatumGetVarStringPP(original);
    2241 GIC      577029 :     char       *authoritative_data = VARDATA_ANY(authoritative);
    2242 ECB             : 
    2243 EUB             :     /* working state */
    2244                 :     Datum       res;
    2245                 :     char       *pres;
    2246                 :     int         len;
    2247                 :     uint32      hash;
    2248                 : 
    2249 GIC      577029 :     pres = (char *) &res;
    2250                 :     /* memset(), so any non-overwritten bytes are NUL */
    2251 GNC      577029 :     memset(pres, 0, max_prefix_bytes);
    2252 GIC      577029 :     len = VARSIZE_ANY_EXHDR(authoritative);
    2253 EUB             : 
    2254                 :     /* Get number of bytes, ignoring trailing spaces */
    2255 GBC      577029 :     if (sss->typid == BPCHAROID)
    2256 GIC        1296 :         len = bpchartruelen(authoritative_data, len);
    2257                 : 
    2258                 :     /*
    2259                 :      * If we're using the C collation, use memcpy(), rather than strxfrm(), to
    2260                 :      * abbreviate keys.  The full comparator for the C locale is always
    2261                 :      * memcmp().  It would be incorrect to allow bytea callers (callers that
    2262                 :      * always force the C collation -- bytea isn't a collatable type, but this
    2263                 :      * approach is convenient) to use strxfrm().  This is because bytea
    2264                 :      * strings may contain NUL bytes.  Besides, this should be faster, too.
    2265                 :      *
    2266                 :      * More generally, it's okay that bytea callers can have NUL bytes in
    2267                 :      * strings because abbreviated cmp need not make a distinction between
    2268                 :      * terminating NUL bytes, and NUL bytes representing actual NULs in the
    2269                 :      * authoritative representation.  Hopefully a comparison at or past one
    2270                 :      * abbreviated key's terminating NUL byte will resolve the comparison
    2271                 :      * without consulting the authoritative representation; specifically, some
    2272                 :      * later non-NUL byte in the longer string can resolve the comparison
    2273                 :      * against a subsequent terminating NUL in the shorter string.  There will
    2274                 :      * usually be what is effectively a "length-wise" resolution there and
    2275                 :      * then.
    2276                 :      *
    2277                 :      * If that doesn't work out -- if all bytes in the longer string
    2278 ECB             :      * positioned at or past the offset of the smaller string's (first)
    2279                 :      * terminating NUL are actually representative of NUL bytes in the
    2280                 :      * authoritative binary string (perhaps with some *terminating* NUL bytes
    2281                 :      * towards the end of the longer string iff it happens to still be small)
    2282                 :      * -- then an authoritative tie-breaker will happen, and do the right
    2283                 :      * thing: explicitly consider string length.
    2284                 :      */
    2285 GIC      577029 :     if (sss->collate_c)
    2286 GNC      257041 :         memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
    2287                 :     else
    2288                 :     {
    2289                 :         Size        bsize;
    2290                 : 
    2291                 :         /*
    2292                 :          * We're not using the C collation, so fall back on strxfrm or ICU
    2293                 :          * analogs.
    2294                 :          */
    2295                 : 
    2296                 :         /* By convention, we use buffer 1 to store and NUL-terminate */
    2297 GIC      319988 :         if (len >= sss->buflen1)
    2298                 :         {
    2299              12 :             sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
    2300              12 :             sss->buf1 = repalloc(sss->buf1, sss->buflen1);
    2301 ECB             :         }
    2302                 : 
    2303                 :         /* Might be able to reuse strxfrm() blob from last call */
    2304 CBC      319988 :         if (sss->last_len1 == len && sss->cache_blob &&
    2305 GIC      312042 :             memcmp(sss->buf1, authoritative_data, len) == 0)
    2306                 :         {
    2307 GNC      153194 :             memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
    2308                 :             /* No change affecting cardinality, so no hashing required */
    2309 GIC      153194 :             goto done;
    2310                 :         }
    2311                 : 
    2312          166794 :         memcpy(sss->buf1, authoritative_data, len);
    2313                 : 
    2314                 :         /*
    2315                 :          * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated
    2316                 :          * strings.
    2317                 :          */
    2318          166794 :         sss->buf1[len] = '\0';
    2319 CBC      166794 :         sss->last_len1 = len;
    2320 EUB             : 
    2321 GNC      166794 :         if (pg_strxfrm_prefix_enabled(sss->locale))
    2322                 :         {
    2323          166794 :             if (sss->buflen2 < max_prefix_bytes)
    2324 ECB             :             {
    2325 UNC           0 :                 sss->buflen2 = Max(max_prefix_bytes,
    2326                 :                                    Min(sss->buflen2 * 2, MaxAllocSize));
    2327               0 :                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    2328 ECB             :             }
    2329                 : 
    2330 GNC      166794 :             bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
    2331                 :                                       max_prefix_bytes, sss->locale);
    2332 CBC      166794 :             sss->last_len2 = bsize;
    2333                 :         }
    2334                 :         else
    2335                 :         {
    2336 ECB             :             /*
    2337                 :              * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
    2338                 :              * again.  The pg_strxfrm() function leaves the result buffer
    2339                 :              * content undefined if the result did not fit, so we need to
    2340                 :              * retry until everything fits, even though we only need the first
    2341                 :              * few bytes in the end.
    2342                 :              */
    2343                 :             for (;;)
    2344                 :             {
    2345 UNC           0 :                 bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
    2346                 :                                    sss->locale);
    2347                 : 
    2348               0 :                 sss->last_len2 = bsize;
    2349               0 :                 if (bsize < sss->buflen2)
    2350               0 :                     break;
    2351                 : 
    2352                 :                 /*
    2353                 :                  * Grow buffer and retry.
    2354                 :                  */
    2355               0 :                 sss->buflen2 = Max(bsize + 1,
    2356                 :                                    Min(sss->buflen2 * 2, MaxAllocSize));
    2357               0 :                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
    2358                 :             }
    2359 ECB             :         }
    2360                 : 
    2361                 :         /*
    2362                 :          * Every Datum byte is always compared.  This is safe because the
    2363                 :          * strxfrm() blob is itself NUL terminated, leaving no danger of
    2364                 :          * misinterpreting any NUL bytes not intended to be interpreted as
    2365                 :          * logically representing termination.
    2366                 :          *
    2367                 :          * (Actually, even if there were NUL bytes in the blob it would be
    2368                 :          * okay.  See remarks on bytea case above.)
    2369                 :          */
    2370 GNC      166794 :         memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
    2371 ECB             :     }
    2372                 : 
    2373                 :     /*
    2374                 :      * Maintain approximate cardinality of both abbreviated keys and original,
    2375                 :      * authoritative keys using HyperLogLog.  Used as cheap insurance against
    2376                 :      * the worst case, where we do many string transformations for no saving
    2377                 :      * in full strcoll()-based comparisons.  These statistics are used by
    2378                 :      * varstr_abbrev_abort().
    2379                 :      *
    2380                 :      * First, Hash key proper, or a significant fraction of it.  Mix in length
    2381                 :      * in order to compensate for cases where differences are past
    2382                 :      * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
    2383                 :      */
    2384 GIC      423835 :     hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
    2385                 :                                    Min(len, PG_CACHE_LINE_SIZE)));
    2386 ECB             : 
    2387 GIC      423835 :     if (len > PG_CACHE_LINE_SIZE)
    2388 CBC          23 :         hash ^= DatumGetUInt32(hash_uint32((uint32) len));
    2389                 : 
    2390 GIC      423835 :     addHyperLogLog(&sss->full_card, hash);
    2391                 : 
    2392 ECB             :     /* Hash abbreviated key; a 64-bit Datum is XOR-folded to 32 bits first */
    2393                 : #if SIZEOF_DATUM == 8
    2394                 :     {
    2395                 :         uint32      lohalf,
    2396                 :                     hihalf;
    2397                 : 
    2398 CBC      423835 :         lohalf = (uint32) res;
    2399 GIC      423835 :         hihalf = (uint32) (res >> 32);
    2400          423835 :         hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
    2401 ECB             :     }
    2402                 : #else                           /* SIZEOF_DATUM != 8 */
    2403                 :     hash = DatumGetUInt32(hash_uint32((uint32) res));
    2404                 : #endif
    2405 EUB             : 
    2406 GIC      423835 :     addHyperLogLog(&sss->abbr_card, hash);
    2407 ECB             : 
    2408                 :     /* Cache result, perhaps saving an expensive strxfrm() call next time */
    2409 GIC      423835 :     sss->cache_blob = true;
    2410          577029 : done:
    2411 ECB             : 
    2412                 :     /*
    2413                 :      * Byteswap on little-endian machines.
    2414                 :      *
    2415                 :      * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
    2416                 :      * 3-way comparator) works correctly on all platforms.  If we didn't do
    2417                 :      * this, the comparator would have to call memcmp() with a pair of
    2418                 :      * pointers to the first byte of each abbreviated key, which is slower.
    2419                 :      */
    2420 CBC      577029 :     res = DatumBigEndianToNative(res);
    2421 ECB             : 
    2422                 :     /* Don't leak memory here */
    2423 CBC      577029 :     if (PointerGetDatum(authoritative) != original)
    2424 GIC           6 :         pfree(authoritative);
    2425                 : 
    2426 CBC      577029 :     return res;
    2427                 : }
    2428 ECB             : 
    2429 EUB             : /*
    2430                 :  * Callback for estimating effectiveness of abbreviated key optimization, using
    2431                 :  * heuristic rules.  Returns value indicating if the abbreviation optimization
    2432 ECB             :  * should be aborted, based on its projected effectiveness.
                         :  *
                         :  * "memtupcount" is the tuple count reported by the caller; "ssup" supplies
                         :  * the VarStringSortSupport state (in ssup_extra) whose abbr_card and
                         :  * full_card HyperLogLog counters are read here.  Returns true to abort
                         :  * abbreviation and false to keep going; it always returns false while
                         :  * memtupcount is below 100.
                         :  *
                         :  * Side effect: whenever the cardinality check passes with memtupcount
                         :  * above 10,000, sss->prop_card is multiplied by 0.65, which lowers the
                         :  * threshold applied on the next call.
    2433                 :  */
    2434                 : static bool
    2435 GIC        1869 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
    2436 ECB             : {
    2437 GIC        1869 :     VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
    2438 ECB             :     double      abbrev_distinct,
    2439                 :                 key_distinct;
    2440                 : 
    2441 GIC        1869 :     Assert(ssup->abbreviate);
    2442 ECB             : 
    2443                 :     /* Have a little patience */
    2444 CBC        1869 :     if (memtupcount < 100)
    2445            1147 :         return false;
    2446                 : 
    2447 GIC         722 :     abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
    2448 CBC         722 :     key_distinct = estimateHyperLogLog(&sss->full_card);
    2449 ECB             : 
    2450                 :     /*
    2451                 :      * Clamp cardinality estimates to at least one distinct value.  While
    2452                 :      * NULLs are generally disregarded, if only NULL values were seen so far,
    2453                 :      * that might misrepresent costs if we failed to clamp.
    2454                 :      */
    2455 GIC         722 :     if (abbrev_distinct <= 1.0)
    2456 UIC           0 :         abbrev_distinct = 1.0;
    2457                 : 
    2458 GBC         722 :     if (key_distinct <= 1.0)
    2459 UIC           0 :         key_distinct = 1.0;
    2460 EUB             : 
    2461                 :     /*
    2462                 :      * In the worst case all abbreviated keys are identical, while at the same
    2463                 :      * time there are differences within full key strings not captured in
    2464                 :      * abbreviations.
    2465                 :      */
    2466                 : #ifdef TRACE_SORT
    2467 GIC         722 :     if (trace_sort)
    2468 EUB             :     {
    2469 UIC           0 :         double      norm_abbrev_card = abbrev_distinct / (double) memtupcount;
    2470 EUB             : 
    2471 UIC           0 :         elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
    2472                 :              "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
    2473                 :              memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
    2474                 :              sss->prop_card);
    2475                 :     }
    2476                 : #endif
    2477                 : 
    2478                 :     /*
    2479                 :      * If the number of distinct abbreviated keys approximately matches the
    2480 ECB             :      * number of distinct authoritative original keys, that's reason enough to
    2481                 :      * proceed.  We can win even with a very low cardinality set if most
    2482                 :      * tie-breakers only memcmp().  This is by far the most important
    2483                 :      * consideration.
    2484                 :      *
    2485                 :      * While comparisons that are resolved at the abbreviated key level are
    2486 EUB             :      * considerably cheaper than tie-breakers resolved with memcmp(), both of
    2487                 :      * those two outcomes are so much cheaper than a full strcoll() once
    2488                 :      * sorting is underway that it doesn't seem worth it to weigh abbreviated
    2489                 :      * cardinality against the overall size of the set in order to more
    2490                 :      * accurately model costs.  Assume that an abbreviated comparison, and an
    2491                 :      * abbreviated comparison with a cheap memcmp()-based authoritative
    2492                 :      * resolution are equivalent.
    2493                 :      */
    2494 GBC         722 :     if (abbrev_distinct > key_distinct * sss->prop_card)
    2495                 :     {
    2496                 :         /*
    2497                 :          * When we have exceeded 10,000 tuples, decay required cardinality
    2498 ECB             :          * aggressively for next call.
    2499                 :          *
    2500                 :          * This is useful because the number of comparisons required on
    2501                 :          * average increases at a linearithmic rate, and at roughly 10,000
    2502                 :          * tuples that factor will start to dominate over the linear costs of
    2503                 :          * string transformation (this is a conservative estimate).  The decay
    2504 EUB             :          * rate is chosen to be a little less aggressive than halving -- which
    2505                 :          * (since we're called at points at which memtupcount has doubled)
    2506                 :          * would never see the cost model actually abort past the first call
    2507                 :          * following a decay.  This decay rate is mostly a precaution against
    2508                 :          * a sudden, violent swing in how well abbreviated cardinality tracks
    2509                 :          * full key cardinality.  The decay also serves to prevent a marginal
    2510                 :          * case from being aborted too late, when too much has already been
    2511                 :          * invested in string transformation.
    2512                 :          *
    2513                 :          * It's possible for sets of several million distinct strings with
    2514                 :          * mere tens of thousands of distinct abbreviated keys to still
    2515                 :          * benefit very significantly.  This will generally occur provided
    2516                 :          * each abbreviated key is a proxy for a roughly uniform number of the
    2517                 :          * set's full keys. If it isn't so, we hope to catch that early and
    2518                 :          * abort.  If it isn't caught early, by the time the problem is
    2519                 :          * apparent it's probably not worth aborting.
    2520                 :          */
    2521 GIC         693 :         if (memtupcount > 10000)
    2522 GBC           3 :             sss->prop_card *= 0.65;
    2523                 : 
    2524             693 :         return false;
    2525                 :     }
    2526                 : 
    2527                 :     /*
    2528                 :      * Abort abbreviation strategy.
    2529                 :      *
    2530                 :      * The worst case, where all abbreviated keys are identical while all
    2531                 :      * original strings differ will typically only see a regression of about
    2532                 :      * 10% in execution time for small to medium sized lists of strings.
    2533                 :      * Whereas on modern CPUs where cache stalls are the dominant cost, we can
    2534                 :      * often expect very large improvements, particularly with sets of strings
    2535                 :      * of moderately high to high abbreviated cardinality.  There is little to
    2536                 :      * lose but much to gain, which our strategy reflects.
    2537                 :      */
    2538                 : #ifdef TRACE_SORT
    2539 CBC          29 :     if (trace_sort)
    2540 UIC           0 :         elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
    2541                 :              "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
    2542                 :              memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
    2543                 : #endif
    2544                 : 
    2545 CBC          29 :     return true;
    2546 ECB             : }
    2547                 : 
    2548                 : /*
    2549                 :  * Generic equalimage support function for character type's operator classes.
    2550                 :  * Disables the use of deduplication with nondeterministic collations.
                         :  *
                         :  * After check_collation_set() has been applied to the call's collation,
                         :  * returns true when lc_collate_is_c() accepts it, when it equals
                         :  * DEFAULT_COLLATION_OID, or when get_collation_isdeterministic() reports
                         :  * it as deterministic; returns false otherwise.  The operator class
                         :  * input type (argument 0) is deliberately not fetched.
    2551                 :  */
    2552 EUB             : Datum
    2553 CBC       13033 : btvarstrequalimage(PG_FUNCTION_ARGS)
    2554 ECB             : {
    2555                 :     /* Oid      opcintype = PG_GETARG_OID(0); */
    2556 CBC       13033 :     Oid         collid = PG_GET_COLLATION();
    2557                 : 
    2558 GIC       13033 :     check_collation_set(collid);
    2559                 : 
    2560           13033 :     if (lc_collate_is_c(collid) ||
    2561 CBC          25 :         collid == DEFAULT_COLLATION_OID ||
    2562 GIC          25 :         get_collation_isdeterministic(collid))
    2563 CBC       13023 :         PG_RETURN_BOOL(true);
    2564 ECB             :     else
    2565 GIC          10 :         PG_RETURN_BOOL(false);
    2566                 : }
    2567 ECB             : 
    /*
     * text_larger: return arg1 when text_cmp(arg1, arg2) under the call's
     * collation is positive; otherwise, including when the comparison is a
     * tie, return arg2.  The result is one of the two input pointers, not a
     * new copy.
     */
    2568                 : Datum
    2569 CBC      114807 : text_larger(PG_FUNCTION_ARGS)
    2570 ECB             : {
    2571 GIC      114807 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2572 CBC      114807 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2573                 :     text       *result;
    2574                 : 
    2575 GIC      114807 :     result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
    2576                 : 
    2577 CBC      114807 :     PG_RETURN_TEXT_P(result);
    2578                 : }
    2579 ECB             : 
    /*
     * text_smaller: return arg1 when text_cmp(arg1, arg2) under the call's
     * collation is negative; otherwise, including when the comparison is a
     * tie, return arg2.  The result is one of the two input pointers, not a
     * new copy.
     */
    2580                 : Datum
    2581 GIC       43065 : text_smaller(PG_FUNCTION_ARGS)
    2582                 : {
    2583 CBC       43065 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2584 GIC       43065 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2585 ECB             :     text       *result;
    2586                 : 
    2587 GIC       43065 :     result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
    2588 ECB             : 
    2589 GIC       43065 :     PG_RETURN_TEXT_P(result);
    2590                 : }
    2591                 : 
    2592                 : 
    2593 ECB             : /*
    2594                 :  * Cross-type comparison functions for types text and name.
    2595                 :  */
    2596                 : 
    /*
     * nameeqtext: equality test between a name (argument 0) and a text
     * (argument 1).
     *
     * The name's length is taken with strlen(); the text's length is its
     * payload size without the varlena header.  check_collation_set() is
     * applied to the call's collation first.  With C_COLLATION_OID the result
     * is a length comparison followed by memcmp(); any other collation is
     * routed through varstr_cmp() and the result is whether that returned 0.
     * arg2 is released with PG_FREE_IF_COPY() before returning.
     */
    2597                 : Datum
    2598 GIC       98804 : nameeqtext(PG_FUNCTION_ARGS)
    2599 ECB             : {
    2600 GIC       98804 :     Name        arg1 = PG_GETARG_NAME(0);
    2601 CBC       98804 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2602           98804 :     size_t      len1 = strlen(NameStr(*arg1));
    2603 GIC       98804 :     size_t      len2 = VARSIZE_ANY_EXHDR(arg2);
    2604 CBC       98804 :     Oid         collid = PG_GET_COLLATION();
    2605                 :     bool        result;
    2606                 : 
    2607 GIC       98804 :     check_collation_set(collid);
    2608                 : 
    2609 CBC       98804 :     if (collid == C_COLLATION_OID)
    2610 GIC      148014 :         result = (len1 == len2 &&
    2611 CBC       65114 :                   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
    2612 ECB             :     else
    2613 GIC       15904 :         result = (varstr_cmp(NameStr(*arg1), len1,
    2614           15904 :                              VARDATA_ANY(arg2), len2,
    2615 ECB             :                              collid) == 0);
    2616                 : 
    2617 CBC       98804 :     PG_FREE_IF_COPY(arg2, 1);
    2618 ECB             : 
    2619 GIC       98804 :     PG_RETURN_BOOL(result);
    2620 ECB             : }
    2621                 : 
    /*
     * texteqname: equality test between a text (argument 0) and a name
     * (argument 1).
     *
     * The text's length is its payload size without the varlena header; the
     * name's length is taken with strlen().  check_collation_set() is applied
     * to the call's collation first.  With C_COLLATION_OID the result is a
     * length comparison followed by memcmp(); any other collation is routed
     * through varstr_cmp() and the result is whether that returned 0.
     * arg1 is released with PG_FREE_IF_COPY() before returning.
     */
    2622                 : Datum
    2623 GIC        3369 : texteqname(PG_FUNCTION_ARGS)
    2624                 : {
    2625 CBC        3369 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2626 GIC        3369 :     Name        arg2 = PG_GETARG_NAME(1);
    2627 CBC        3369 :     size_t      len1 = VARSIZE_ANY_EXHDR(arg1);
    2628            3369 :     size_t      len2 = strlen(NameStr(*arg2));
    2629 GIC        3369 :     Oid         collid = PG_GET_COLLATION();
    2630                 :     bool        result;
    2631 ECB             : 
    2632 GIC        3369 :     check_collation_set(collid);
    2633 ECB             : 
    2634 CBC        3369 :     if (collid == C_COLLATION_OID)
    2635 GIC         282 :         result = (len1 == len2 &&
    2636 CBC          90 :                   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
    2637                 :     else
    2638 GIC        3177 :         result = (varstr_cmp(VARDATA_ANY(arg1), len1,
    2639            3177 :                              NameStr(*arg2), len2,
    2640                 :                              collid) == 0);
    2641 ECB             : 
    2642 GIC        3369 :     PG_FREE_IF_COPY(arg1, 0);
    2643 ECB             : 
    2644 GIC        3369 :     PG_RETURN_BOOL(result);
    2645                 : }
    2646 ECB             : 
    /*
     * namenetext: inequality test between a name (argument 0) and a text
     * (argument 1).
     *
     * Computes the same two-path comparison as the equality case and negates
     * it: with C_COLLATION_OID, true unless the lengths match and memcmp()
     * finds the bytes equal; otherwise, true unless varstr_cmp() returns 0.
     * check_collation_set() is applied to the collation first, and arg2 is
     * released with PG_FREE_IF_COPY() before returning.
     */
    2647                 : Datum
    2648 GIC          18 : namenetext(PG_FUNCTION_ARGS)
    2649 ECB             : {
    2650 GIC          18 :     Name        arg1 = PG_GETARG_NAME(0);
    2651 CBC          18 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2652 GIC          18 :     size_t      len1 = strlen(NameStr(*arg1));
    2653 CBC          18 :     size_t      len2 = VARSIZE_ANY_EXHDR(arg2);
    2654 GIC          18 :     Oid         collid = PG_GET_COLLATION();
    2655                 :     bool        result;
    2656                 : 
    2657              18 :     check_collation_set(collid);
    2658                 : 
    2659              18 :     if (collid == C_COLLATION_OID)
    2660               9 :         result = !(len1 == len2 &&
    2661 UIC           0 :                    memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
    2662                 :     else
    2663 GIC           9 :         result = !(varstr_cmp(NameStr(*arg1), len1,
    2664 CBC           9 :                               VARDATA_ANY(arg2), len2,
    2665                 :                               collid) == 0);
    2666 ECB             : 
    2667 GIC          18 :     PG_FREE_IF_COPY(arg2, 1);
    2668                 : 
    2669 CBC          18 :     PG_RETURN_BOOL(result);
    2670                 : }
    2671                 : 
    /*
     * textnename: inequality test between a text (argument 0) and a name
     * (argument 1).
     *
     * Computes the same two-path comparison as the equality case and negates
     * it: with C_COLLATION_OID, true unless the lengths match and memcmp()
     * finds the bytes equal; otherwise, true unless varstr_cmp() returns 0.
     * check_collation_set() is applied to the collation first, and arg1 is
     * released with PG_FREE_IF_COPY() before returning.
     */
    2672                 : Datum
    2673 GIC           9 : textnename(PG_FUNCTION_ARGS)
    2674                 : {
    2675               9 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2676               9 :     Name        arg2 = PG_GETARG_NAME(1);
    2677               9 :     size_t      len1 = VARSIZE_ANY_EXHDR(arg1);
    2678               9 :     size_t      len2 = strlen(NameStr(*arg2));
    2679               9 :     Oid         collid = PG_GET_COLLATION();
    2680 ECB             :     bool        result;
    2681                 : 
    2682 CBC           9 :     check_collation_set(collid);
    2683 ECB             : 
    2684 GIC           9 :     if (collid == C_COLLATION_OID)
    2685 LBC           0 :         result = !(len1 == len2 &&
    2686 UIC           0 :                    memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
    2687                 :     else
    2688 GIC           9 :         result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
    2689               9 :                               NameStr(*arg2), len2,
    2690                 :                               collid) == 0);
    2691                 : 
    2692               9 :     PG_FREE_IF_COPY(arg1, 0);
    2693                 : 
    2694               9 :     PG_RETURN_BOOL(result);
    2695 ECB             : }
    2696                 : 
    /*
     * btnametextcmp: three-way comparison of a name (argument 0) against a
     * text (argument 1).
     *
     * Hands the name's bytes (length from strlen()) and the text's payload
     * (length without the varlena header) to varstr_cmp() together with the
     * call's collation, and returns varstr_cmp()'s int32 result unchanged.
     * arg2 is released with PG_FREE_IF_COPY() before returning.
     */
    2697                 : Datum
    2698 GIC       81923 : btnametextcmp(PG_FUNCTION_ARGS)
    2699                 : {
    2700           81923 :     Name        arg1 = PG_GETARG_NAME(0);
    2701           81923 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2702                 :     int32       result;
    2703 ECB             : 
    2704 CBC      163846 :     result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
    2705 GIC      163846 :                         VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
    2706                 :                         PG_GET_COLLATION());
    2707 ECB             : 
    2708 GBC       81923 :     PG_FREE_IF_COPY(arg2, 1);
    2709 ECB             : 
    2710 GBC       81923 :     PG_RETURN_INT32(result);
    2711                 : }
    2712 ECB             : 
    /*
     * bttextnamecmp: three-way comparison of a text (argument 0) against a
     * name (argument 1).
     *
     * Hands the text's payload (length without the varlena header) and the
     * name's bytes (length from strlen()) to varstr_cmp() together with the
     * call's collation, and returns varstr_cmp()'s int32 result unchanged.
     * arg1 is released with PG_FREE_IF_COPY() before returning.
     */
    2713                 : Datum
    2714 UIC           0 : bttextnamecmp(PG_FUNCTION_ARGS)
    2715                 : {
    2716 LBC           0 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2717 UIC           0 :     Name        arg2 = PG_GETARG_NAME(1);
    2718                 :     int32       result;
    2719 ECB             : 
    2720 LBC           0 :     result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
    2721               0 :                         NameStr(*arg2), strlen(NameStr(*arg2)),
    2722 ECB             :                         PG_GET_COLLATION());
    2723                 : 
    2724 UIC           0 :     PG_FREE_IF_COPY(arg1, 0);
    2725 ECB             : 
    2726 UIC           0 :     PG_RETURN_INT32(result);
    2727                 : }
    2728                 : 
    2729                 : #define CmpCall(cmpfunc) \
    2730                 :     DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
    2731                 :                                           PG_GET_COLLATION(), \
    2732                 :                                           PG_GETARG_DATUM(0), \
    2733                 :                                           PG_GETARG_DATUM(1)))
    2734                 : 
    2735                 : Datum
    2736 GIC       26815 : namelttext(PG_FUNCTION_ARGS)
    2737                 : {
    2738           26815 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
    2739                 : }
    2740                 : 
    2741                 : Datum
    2742 UIC           0 : nameletext(PG_FUNCTION_ARGS)
    2743                 : {
    2744               0 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
    2745                 : }
    2746                 : 
    2747 ECB             : Datum
    2748 UIC           0 : namegttext(PG_FUNCTION_ARGS)
    2749 ECB             : {
    2750 UIC           0 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
    2751                 : }
    2752                 : 
    2753                 : Datum
    2754 GIC       25974 : namegetext(PG_FUNCTION_ARGS)
    2755                 : {
    2756           25974 :     PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
    2757                 : }
    2758                 : 
    2759                 : Datum
    2760 UIC           0 : textltname(PG_FUNCTION_ARGS)
    2761 ECB             : {
    2762 UIC           0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
    2763 ECB             : }
    2764                 : 
    2765                 : Datum
    2766 UIC           0 : textlename(PG_FUNCTION_ARGS)
    2767                 : {
    2768               0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
    2769                 : }
    2770 ECB             : 
    2771                 : Datum
    2772 UIC           0 : textgtname(PG_FUNCTION_ARGS)
    2773                 : {
    2774               0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
    2775                 : }
    2776                 : 
    2777                 : Datum
    2778               0 : textgename(PG_FUNCTION_ARGS)
    2779                 : {
    2780               0 :     PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
    2781                 : }
    2782 ECB             : 
    2783                 : #undef CmpCall
    2784                 : 
    2785                 : 
    2786                 : /*
    2787                 :  * The following operators support character-by-character comparison
    2788                 :  * of text datums, to allow building indexes suitable for LIKE clauses.
    2789                 :  * Note that the regular texteq/textne comparison operators, and regular
    2790                 :  * support functions 1 and 2 with "C" collation are assumed to be
    2791                 :  * compatible with these!
    2792                 :  */
    2793                 : 
    2794                 : static int
    2795 CBC       76040 : internal_text_pattern_compare(text *arg1, text *arg2)
    2796                 : {
    2797                 :     int         result;
    2798                 :     int         len1,
    2799                 :                 len2;
    2800 ECB             : 
    2801 GIC       76040 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    2802           76040 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    2803                 : 
    2804           76040 :     result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    2805           76040 :     if (result != 0)
    2806 CBC       76013 :         return result;
    2807 GIC          27 :     else if (len1 < len2)
    2808 UIC           0 :         return -1;
    2809 GIC          27 :     else if (len1 > len2)
    2810               9 :         return 1;
    2811                 :     else
    2812              18 :         return 0;
    2813                 : }
    2814                 : 
    2815 ECB             : 
    2816 EUB             : Datum
    2817 GIC       19769 : text_pattern_lt(PG_FUNCTION_ARGS)
    2818 ECB             : {
    2819 GIC       19769 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2820           19769 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2821                 :     int         result;
    2822                 : 
    2823           19769 :     result = internal_text_pattern_compare(arg1, arg2);
    2824                 : 
    2825           19769 :     PG_FREE_IF_COPY(arg1, 0);
    2826 CBC       19769 :     PG_FREE_IF_COPY(arg2, 1);
    2827                 : 
    2828 GIC       19769 :     PG_RETURN_BOOL(result < 0);
    2829                 : }
    2830                 : 
    2831                 : 
    2832                 : Datum
    2833           18755 : text_pattern_le(PG_FUNCTION_ARGS)
    2834                 : {
    2835           18755 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2836           18755 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2837 ECB             :     int         result;
    2838                 : 
    2839 CBC       18755 :     result = internal_text_pattern_compare(arg1, arg2);
    2840 ECB             : 
    2841 CBC       18755 :     PG_FREE_IF_COPY(arg1, 0);
    2842           18755 :     PG_FREE_IF_COPY(arg2, 1);
    2843                 : 
    2844           18755 :     PG_RETURN_BOOL(result <= 0);
    2845                 : }
    2846                 : 
    2847                 : 
    2848 ECB             : Datum
    2849 GIC       18755 : text_pattern_ge(PG_FUNCTION_ARGS)
    2850 ECB             : {
    2851 CBC       18755 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2852           18755 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2853                 :     int         result;
    2854                 : 
    2855           18755 :     result = internal_text_pattern_compare(arg1, arg2);
    2856 ECB             : 
    2857 GIC       18755 :     PG_FREE_IF_COPY(arg1, 0);
    2858           18755 :     PG_FREE_IF_COPY(arg2, 1);
    2859                 : 
    2860 CBC       18755 :     PG_RETURN_BOOL(result >= 0);
    2861                 : }
    2862                 : 
    2863                 : 
    2864                 : Datum
    2865 GIC       18755 : text_pattern_gt(PG_FUNCTION_ARGS)
    2866                 : {
    2867           18755 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2868           18755 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2869                 :     int         result;
    2870                 : 
    2871           18755 :     result = internal_text_pattern_compare(arg1, arg2);
    2872 ECB             : 
    2873 GBC       18755 :     PG_FREE_IF_COPY(arg1, 0);
    2874 GIC       18755 :     PG_FREE_IF_COPY(arg2, 1);
    2875                 : 
    2876 CBC       18755 :     PG_RETURN_BOOL(result > 0);
    2877 EUB             : }
    2878                 : 
    2879                 : 
    2880                 : Datum
    2881 CBC           6 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
    2882 ECB             : {
    2883 CBC           6 :     text       *arg1 = PG_GETARG_TEXT_PP(0);
    2884               6 :     text       *arg2 = PG_GETARG_TEXT_PP(1);
    2885                 :     int         result;
    2886 ECB             : 
    2887 GIC           6 :     result = internal_text_pattern_compare(arg1, arg2);
    2888                 : 
    2889               6 :     PG_FREE_IF_COPY(arg1, 0);
    2890               6 :     PG_FREE_IF_COPY(arg2, 1);
    2891                 : 
    2892               6 :     PG_RETURN_INT32(result);
    2893 ECB             : }
    2894                 : 
    2895                 : 
    2896                 : Datum
    2897 CBC          58 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
    2898                 : {
    2899 GIC          58 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    2900                 :     MemoryContext oldcontext;
    2901                 : 
    2902              58 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
    2903                 : 
    2904                 :     /* Use generic string SortSupport, forcing "C" collation */
    2905              58 :     varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
    2906                 : 
    2907 GBC          58 :     MemoryContextSwitchTo(oldcontext);
    2908                 : 
    2909              58 :     PG_RETURN_VOID();
    2910 EUB             : }
    2911                 : 
    2912                 : 
    2913                 : /*-------------------------------------------------------------
    2914                 :  * byteaoctetlen
    2915                 :  *
    2916                 :  * get the number of bytes contained in an instance of type 'bytea'
    2917                 :  *-------------------------------------------------------------
    2918                 :  */
    2919                 : Datum
    2920 GBC         157 : byteaoctetlen(PG_FUNCTION_ARGS)
    2921                 : {
    2922             157 :     Datum       str = PG_GETARG_DATUM(0);
    2923 EUB             : 
    2924                 :     /* We need not detoast the input at all */
    2925 GBC         157 :     PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
    2926 EUB             : }
    2927                 : 
    2928                 : /*
    2929                 :  * byteacat -
    2930                 :  *    takes two bytea* and returns a bytea* that is the concatenation of
    2931                 :  *    the two.
    2932                 :  *
    2933                 :  * Cloned from textcat and modified as required.
    2934                 :  */
    2935                 : Datum
    2936 GIC         760 : byteacat(PG_FUNCTION_ARGS)
    2937 EUB             : {
    2938 GIC         760 :     bytea      *t1 = PG_GETARG_BYTEA_PP(0);
    2939             760 :     bytea      *t2 = PG_GETARG_BYTEA_PP(1);
    2940 EUB             : 
    2941 GIC         760 :     PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
    2942                 : }
    2943                 : 
    2944                 : /*
    2945                 :  * bytea_catenate
    2946                 :  *  Guts of byteacat(), broken out so it can be used by other functions
    2947                 :  *
    2948                 :  * Arguments can be in short-header form, but not compressed or out-of-line
    2949                 :  */
    2950                 : static bytea *
    2951 CBC         778 : bytea_catenate(bytea *t1, bytea *t2)
    2952                 : {
    2953 ECB             :     bytea      *result;
    2954                 :     int         len1,
    2955                 :                 len2,
    2956                 :                 len;
    2957                 :     char       *ptr;
    2958                 : 
    2959 GIC         778 :     len1 = VARSIZE_ANY_EXHDR(t1);
    2960 CBC         778 :     len2 = VARSIZE_ANY_EXHDR(t2);
    2961 ECB             : 
    2962                 :     /* paranoia ... probably should throw error instead? */
    2963 GIC         778 :     if (len1 < 0)
    2964 UIC           0 :         len1 = 0;
    2965 GIC         778 :     if (len2 < 0)
    2966 LBC           0 :         len2 = 0;
    2967                 : 
    2968 CBC         778 :     len = len1 + len2 + VARHDRSZ;
    2969 GIC         778 :     result = (bytea *) palloc(len);
    2970                 : 
    2971                 :     /* Set size of result string... */
    2972             778 :     SET_VARSIZE(result, len);
    2973                 : 
    2974                 :     /* Fill data field of result string... */
    2975             778 :     ptr = VARDATA(result);
    2976             778 :     if (len1 > 0)
    2977             778 :         memcpy(ptr, VARDATA_ANY(t1), len1);
    2978             778 :     if (len2 > 0)
    2979             769 :         memcpy(ptr + len1, VARDATA_ANY(t2), len2);
    2980 ECB             : 
    2981 GIC         778 :     return result;
    2982 ECB             : }
    2983                 : 
    2984                 : #define PG_STR_GET_BYTEA(str_) \
    2985                 :     DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
    2986                 : 
    2987                 : /*
    2988                 :  * bytea_substr()
    2989                 :  * Return a substring starting at the specified position.
    2990                 :  * Cloned from text_substr and modified as required.
    2991                 :  *
    2992                 :  * Input:
    2993                 :  *  - string
    2994                 :  *  - starting position (is one-based)
    2995                 :  *  - string length (optional)
    2996                 :  *
    2997                 :  * If the starting position is zero or less, then return from the start of the string
    2998                 :  * adjusting the length to be consistent with the "negative start" per SQL.
    2999                 :  * If the length is less than zero, an ERROR is thrown. If no third argument
    3000                 :  * (length) is provided, the length to the end of the string is assumed.
    3001                 :  */
    3002                 : Datum
    3003 CBC          43 : bytea_substr(PG_FUNCTION_ARGS)
    3004 ECB             : {
    3005 GIC          43 :     PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
    3006 EUB             :                                       PG_GETARG_INT32(1),
    3007                 :                                       PG_GETARG_INT32(2),
    3008                 :                                       false));
    3009                 : }
    3010                 : 
    3011                 : /*
    3012                 :  * bytea_substr_no_len -
    3013                 :  *    Wrapper to avoid opr_sanity failure due to
    3014                 :  *    one function accepting a different number of args.
    3015                 :  */
    3016                 : Datum
    3017 GIC        1950 : bytea_substr_no_len(PG_FUNCTION_ARGS)
    3018 ECB             : {
    3019 GIC        1950 :     PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
    3020 ECB             :                                       PG_GETARG_INT32(1),
    3021                 :                                       -1,
    3022                 :                                       true));
    3023                 : }
    3024                 : 
    3025                 : static bytea *
    3026 GIC        2011 : bytea_substring(Datum str,
    3027 ECB             :                 int S,
    3028                 :                 int L,
    3029                 :                 bool length_not_specified)
    3030                 : {
    3031                 :     int32       S1;             /* adjusted start position */
    3032                 :     int32       L1;             /* adjusted substring length */
    3033                 :     int32       E;              /* end position */
    3034                 : 
    3035                 :     /*
    3036                 :      * The logic here should generally match text_substring().
    3037                 :      */
    3038 CBC        2011 :     S1 = Max(S, 1);
    3039                 : 
    3040 GIC        2011 :     if (length_not_specified)
    3041                 :     {
    3042                 :         /*
    3043                 :          * Not passed a length - DatumGetByteaPSlice() grabs everything to the
    3044                 :          * end of the string if we pass it a negative value for length.
    3045                 :          */
    3046            1959 :         L1 = -1;
    3047                 :     }
    3048              52 :     else if (L < 0)
    3049                 :     {
    3050 ECB             :         /* SQL99 says to throw an error for E < S, i.e., negative length */
    3051 GIC           6 :         ereport(ERROR,
    3052 ECB             :                 (errcode(ERRCODE_SUBSTRING_ERROR),
    3053                 :                  errmsg("negative substring length not allowed")));
    3054                 :         L1 = -1;                /* silence stupider compilers */
    3055                 :     }
    3056 GIC          46 :     else if (pg_add_s32_overflow(S, L, &E))
    3057                 :     {
    3058                 :         /*
    3059                 :          * L could be large enough for S + L to overflow, in which case the
    3060                 :          * substring must run to end of string.
    3061 ECB             :          */
    3062 GIC           3 :         L1 = -1;
    3063 ECB             :     }
    3064                 :     else
    3065                 :     {
    3066                 :         /*
    3067                 :          * A zero or negative value for the end position can happen if the
    3068                 :          * start was negative or one. SQL99 says to return a zero-length
    3069                 :          * string.
    3070                 :          */
    3071 CBC          43 :         if (E < 1)
    3072 UIC           0 :             return PG_STR_GET_BYTEA("");
    3073                 : 
    3074 GIC          43 :         L1 = E - S1;
    3075                 :     }
    3076 ECB             : 
    3077 EUB             :     /*
    3078                 :      * If the start position is past the end of the string, SQL99 says to
    3079                 :      * return a zero-length string -- DatumGetByteaPSlice() will do that for
    3080                 :      * us.  We need only convert S1 to zero-based starting position.
    3081                 :      */
    3082 GIC        2005 :     return DatumGetByteaPSlice(str, S1 - 1, L1);
    3083                 : }
    3084 ECB             : 
    3085                 : /*
    3086                 :  * byteaoverlay
    3087                 :  *  Replace specified substring of first string with second
    3088                 :  *
    3089 EUB             :  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
    3090                 :  * This code is a direct implementation of what the standard says.
    3091 ECB             :  */
    3092                 : Datum
    3093 CBC           3 : byteaoverlay(PG_FUNCTION_ARGS)
    3094                 : {
    3095 GIC           3 :     bytea      *t1 = PG_GETARG_BYTEA_PP(0);
    3096               3 :     bytea      *t2 = PG_GETARG_BYTEA_PP(1);
    3097               3 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
    3098               3 :     int         sl = PG_GETARG_INT32(3);    /* substring length */
    3099                 : 
    3100               3 :     PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
    3101 ECB             : }
    3102                 : 
    3103                 : Datum
    3104 GIC           6 : byteaoverlay_no_len(PG_FUNCTION_ARGS)
    3105                 : {
    3106               6 :     bytea      *t1 = PG_GETARG_BYTEA_PP(0);
    3107 CBC           6 :     bytea      *t2 = PG_GETARG_BYTEA_PP(1);
    3108 GIC           6 :     int         sp = PG_GETARG_INT32(2);    /* substring start position */
    3109                 :     int         sl;
    3110 ECB             : 
    3111 CBC           6 :     sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
    3112 GIC           6 :     PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
    3113                 : }
    3114 ECB             : 
    3115                 : static bytea *
    3116 GIC           9 : bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
    3117 ECB             : {
    3118                 :     bytea      *result;
    3119                 :     bytea      *s1;
    3120                 :     bytea      *s2;
    3121                 :     int         sp_pl_sl;
    3122                 : 
    3123                 :     /*
    3124                 :      * Check for possible integer-overflow cases.  For negative sp, throw a
    3125                 :      * "substring length" error because that's what should be expected
    3126                 :      * according to the spec's definition of OVERLAY().
    3127                 :      */
    3128 CBC           9 :     if (sp <= 0)
    3129 UIC           0 :         ereport(ERROR,
    3130                 :                 (errcode(ERRCODE_SUBSTRING_ERROR),
    3131                 :                  errmsg("negative substring length not allowed")));
    3132 GIC           9 :     if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
    3133 UIC           0 :         ereport(ERROR,
    3134                 :                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    3135                 :                  errmsg("integer out of range")));
    3136                 : 
    3137 GIC           9 :     s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
    3138               9 :     s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
    3139               9 :     result = bytea_catenate(s1, t2);
    3140               9 :     result = bytea_catenate(result, s2);
    3141 ECB             : 
    3142 GIC           9 :     return result;
    3143                 : }
    3144 ECB             : 
    3145                 : /*
    3146                 :  * bit_count
    3147                 :  */
    3148                 : Datum
    3149 GIC           3 : bytea_bit_count(PG_FUNCTION_ARGS)
    3150 ECB             : {
    3151 GIC           3 :     bytea      *t1 = PG_GETARG_BYTEA_PP(0);
    3152 ECB             : 
    3153 GBC           3 :     PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
    3154                 : }
    3155                 : 
    3156                 : /*
    3157 ECB             :  * byteapos -
    3158 EUB             :  *    Return the position of the specified substring.
    3159                 :  *    Implements the SQL POSITION() function.
    3160                 :  * Cloned from textpos and modified as required.
    3161                 :  */
    3162 ECB             : Datum
    3163 UIC           0 : byteapos(PG_FUNCTION_ARGS)
    3164 ECB             : {
    3165 UIC           0 :     bytea      *t1 = PG_GETARG_BYTEA_PP(0);
    3166 LBC           0 :     bytea      *t2 = PG_GETARG_BYTEA_PP(1);
    3167                 :     int         pos;
    3168                 :     int         px,
    3169 ECB             :                 p;
    3170                 :     int         len1,
    3171                 :                 len2;
    3172                 :     char       *p1,
    3173                 :                *p2;
    3174                 : 
    3175 UIC           0 :     len1 = VARSIZE_ANY_EXHDR(t1);
    3176               0 :     len2 = VARSIZE_ANY_EXHDR(t2);
    3177                 : 
    3178               0 :     if (len2 <= 0)
    3179               0 :         PG_RETURN_INT32(1);     /* result for empty pattern */
    3180                 : 
    3181               0 :     p1 = VARDATA_ANY(t1);
    3182               0 :     p2 = VARDATA_ANY(t2);
    3183                 : 
    3184               0 :     pos = 0;
    3185               0 :     px = (len1 - len2);
    3186               0 :     for (p = 0; p <= px; p++)
    3187                 :     {
    3188               0 :         if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
    3189                 :         {
    3190               0 :             pos = p + 1;
    3191               0 :             break;
    3192                 :         };
    3193               0 :         p1++;
    3194                 :     };
    3195                 : 
    3196               0 :     PG_RETURN_INT32(pos);
    3197                 : }
    3198                 : 
    3199 ECB             : /*-------------------------------------------------------------
    3200                 :  * byteaGetByte
    3201                 :  *
    3202                 :  * this routine treats "bytea" as an array of bytes.
    3203                 :  * It returns the Nth byte (a number between 0 and 255).
    3204                 :  *-------------------------------------------------------------
    3205                 :  */
    3206                 : Datum
    3207 CBC          30 : byteaGetByte(PG_FUNCTION_ARGS)
    3208 EUB             : {
    3209 GIC          30 :     bytea      *v = PG_GETARG_BYTEA_PP(0);
    3210 CBC          30 :     int32       n = PG_GETARG_INT32(1);
    3211 ECB             :     int         len;
    3212                 :     int         byte;
    3213                 : 
    3214 GIC          30 :     len = VARSIZE_ANY_EXHDR(v);
    3215                 : 
    3216              30 :     if (n < 0 || n >= len)
    3217               3 :         ereport(ERROR,
    3218                 :                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
    3219 ECB             :                  errmsg("index %d out of valid range, 0..%d",
    3220                 :                         n, len - 1)));
    3221                 : 
    3222 CBC          27 :     byte = ((unsigned char *) VARDATA_ANY(v))[n];
    3223                 : 
    3224 GIC          27 :     PG_RETURN_INT32(byte);
    3225 ECB             : }
    3226                 : 
    3227 EUB             : /*-------------------------------------------------------------
    3228 ECB             :  * byteaGetBit
    3229                 :  *
    3230                 :  * This routine treats a "bytea" type like an array of bits.
    3231                 :  * It returns the value of the Nth bit (0 or 1).
    3232                 :  *
    3233                 :  *-------------------------------------------------------------
    3234                 :  */
    3235                 : Datum
    3236 GIC           6 : byteaGetBit(PG_FUNCTION_ARGS)
    3237                 : {
    3238               6 :     bytea      *v = PG_GETARG_BYTEA_PP(0);
    3239               6 :     int64       n = PG_GETARG_INT64(1);
    3240                 :     int         byteNo,
    3241                 :                 bitNo;
    3242                 :     int         len;
    3243 ECB             :     int         byte;
    3244                 : 
    3245 CBC           6 :     len = VARSIZE_ANY_EXHDR(v);
    3246 ECB             : 
    3247 CBC           6 :     if (n < 0 || n >= (int64) len * 8)
    3248               3 :         ereport(ERROR,
    3249 EUB             :                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
    3250                 :                  errmsg("index %lld out of valid range, 0..%lld",
    3251                 :                         (long long) n, (long long) len * 8 - 1)));
    3252                 : 
    3253                 :     /* n/8 is now known < len, so safe to cast to int */
    3254 GIC           3 :     byteNo = (int) (n / 8);
    3255               3 :     bitNo = (int) (n % 8);
    3256                 : 
    3257               3 :     byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
    3258                 : 
    3259               3 :     if (byte & (1 << bitNo))
    3260 CBC           3 :         PG_RETURN_INT32(1);
    3261 ECB             :     else
    3262 LBC           0 :         PG_RETURN_INT32(0);
    3263 ECB             : }
    3264                 : 
    3265                 : /*-------------------------------------------------------------
    3266                 :  * byteaSetByte
    3267                 :  *
    3268                 :  * Given an instance of type 'bytea' creates a new one with
    3269                 :  * the Nth byte set to the given value.
    3270                 :  *
    3271                 :  *-------------------------------------------------------------
    3272                 :  */
    3273                 : Datum
    3274 CBC           6 : byteaSetByte(PG_FUNCTION_ARGS)
    3275                 : {
    3276 GIC           6 :     bytea      *res = PG_GETARG_BYTEA_P_COPY(0);
    3277 CBC           6 :     int32       n = PG_GETARG_INT32(1);
    3278               6 :     int32       newByte = PG_GETARG_INT32(2);
    3279                 :     int         len;
    3280 ECB             : 
    3281 GIC           6 :     len = VARSIZE(res) - VARHDRSZ;
    3282                 : 
    3283 CBC           6 :     if (n < 0 || n >= len)
    3284 GIC           3 :         ereport(ERROR,
    3285                 :                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
    3286 ECB             :                  errmsg("index %d out of valid range, 0..%d",
    3287                 :                         n, len - 1)));
    3288                 : 
    3289                 :     /*
    3290                 :      * Now set the byte.
    3291                 :      */
    3292 GIC           3 :     ((unsigned char *) VARDATA(res))[n] = newByte;
    3293                 : 
    3294 CBC           3 :     PG_RETURN_BYTEA_P(res);
    3295                 : }
    3296 ECB             : 
    3297                 : /*-------------------------------------------------------------
    3298                 :  * byteaSetBit
    3299                 :  *
    3300                 :  * Given an instance of type 'bytea' creates a new one with
    3301                 :  * the Nth bit set to the given value.
    3302                 :  *
    3303                 :  *-------------------------------------------------------------
    3304                 :  */
    3305                 : Datum
    3306 GIC           6 : byteaSetBit(PG_FUNCTION_ARGS)
    3307                 : {
                               /*
                                * Set bit number n (argument 1) of the bytea in argument 0 to newBit
                                * (argument 2) and return the modified value.  The value is fetched
                                * with PG_GETARG_BYTEA_P_COPY, and that object is what gets updated
                                * and returned.  Bit n lives in byte n / 8; within that byte, position
                                * n % 8 counts from the least significant bit (see the 1 << bitNo
                                * masks below).
                                *
                                * Raises ERRCODE_ARRAY_SUBSCRIPT_ERROR when n is outside the value and
                                * ERRCODE_INVALID_PARAMETER_VALUE when newBit is neither 0 nor 1.  The
                                * index is checked first, so a bad index is reported even if newBit is
                                * also invalid.
                                */
    3308               6 :     bytea      *res = PG_GETARG_BYTEA_P_COPY(0);
    3309               6 :     int64       n = PG_GETARG_INT64(1);
    3310               6 :     int32       newBit = PG_GETARG_INT32(2);
    3311                 :     int         len;
    3312                 :     int         oldByte,
    3313                 :                 newByte;
    3314                 :     int         byteNo,
    3315                 :                 bitNo;
    3316                 : 
                               /* Payload size in bytes: total size minus the varlena header. */
    3317               6 :     len = VARSIZE(res) - VARHDRSZ;
    3318                 : 
                               /*
                                * Valid bit positions are 0 .. len * 8 - 1.  len is cast to int64
                                * before the multiplication, so the bit count is computed in 64-bit
                                * arithmetic, both in the test and in the error message.
                                */
    3319               6 :     if (n < 0 || n >= (int64) len * 8)
    3320               3 :         ereport(ERROR,
    3321                 :                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
    3322                 :                  errmsg("index %lld out of valid range, 0..%lld",
    3323                 :                         (long long) n, (long long) len * 8 - 1)));
    3324                 : 
    3325                 :     /* n/8 is now known < len, so safe to cast to int */
    3326 CBC           3 :     byteNo = (int) (n / 8);
    3327 GIC           3 :     bitNo = (int) (n % 8);
    3328                 : 
    3329 ECB             :     /*
    3330                 :      * sanity check: the replacement bit value must be exactly 0 or 1
    3331                 :      */
    3332 CBC           3 :     if (newBit != 0 && newBit != 1)
    3333 UIC           0 :         ereport(ERROR,
    3334 ECB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    3335 EUB             :                  errmsg("new bit must be 0 or 1")));
    3336                 : 
    3337 ECB             :     /*
    3338                 :      * Update the byte: clear or set the selected bit, leaving the other
                                * bits of that byte as they were.
    3339                 :      */
    3340 GIC           3 :     oldByte = ((unsigned char *) VARDATA(res))[byteNo];
    3341                 : 
    3342               3 :     if (newBit == 0)
    3343               3 :         newByte = oldByte & (~(1 << bitNo));
    3344                 :     else
    3345 UIC           0 :         newByte = oldByte | (1 << bitNo);
    3346 ECB             : 
    3347 GIC           3 :     ((unsigned char *) VARDATA(res))[byteNo] = newByte;
    3348                 : 
    3349 GBC           3 :     PG_RETURN_BYTEA_P(res);
    3350                 : }
    3351                 : 
    3352 EUB             : 
    3353                 : /* text_name()
    3354                 :  * Converts a text type to a Name type.
                            *
                            * The result is a freshly palloc0'd buffer of NAMEDATALEN bytes.  When
                            * the input payload is NAMEDATALEN bytes or longer, the number of bytes
                            * copied is whatever pg_mbcliplen() returns for a limit of
                            * NAMEDATALEN - 1 (presumably the longest prefix that fits without
                            * splitting a multibyte character -- confirm against that helper).
    3355                 :  */
    3356                 : Datum
    3357 GIC       15263 : text_name(PG_FUNCTION_ARGS)
    3358 EUB             : {
    3359 GBC       15263 :     text       *s = PG_GETARG_TEXT_PP(0);
    3360                 :     Name        result;
    3361                 :     int         len;
    3362 EUB             : 
                               /* Payload length in bytes, excluding the varlena header. */
    3363 GIC       15263 :     len = VARSIZE_ANY_EXHDR(s);
    3364                 : 
    3365                 :     /* Truncate oversize input */
    3366           15263 :     if (len >= NAMEDATALEN)
    3367 CBC           3 :         len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
    3368 ECB             : 
    3369                 :     /* We use palloc0 here to ensure result is zero-padded */
    3370 GIC       15263 :     result = (Name) palloc0(NAMEDATALEN);
                               /* Only len bytes are copied; the rest of the buffer stays zero. */
    3371 CBC       15263 :     memcpy(NameStr(*result), VARDATA_ANY(s), len);
    3372 ECB             : 
    3373 CBC       15263 :     PG_RETURN_NAME(result);
    3374                 : }
    3375 ECB             : 
    3376 EUB             : /* name_text()
    3377                 :  * Converts a Name type to a text type.
                            *
                            * The Name argument is read through NameStr() and handed to
                            * cstring_to_text(); the value that call returns is the result.
    3378                 :  */
    3379 ECB             : Datum
    3380 GBC      525697 : name_text(PG_FUNCTION_ARGS)
    3381                 : {
    3382 CBC      525697 :     Name        s = PG_GETARG_NAME(0);
    3383                 : 
    3384 GBC      525697 :     PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
    3385 EUB             : }
    3386                 : 
    3387                 : 
    3388                 : /*
    3389 ECB             :  * textToQualifiedNameList - convert a text object to list of names
    3390                 :  *
    3391                 :  * This implements the input parsing needed by nextval() and other
    3392 EUB             :  * functions that take a text parameter representing a qualified name.
    3393                 :  * We split the name at dots, downcase if not double-quoted, and
    3394                 :  * truncate names if they're too long.
                            *
                            * The returned List holds one makeString() node per name component,
                            * each built from a pstrdup'd copy, so nothing in it points into the
                            * temporary C string used for parsing.  Both a syntax error and an
                            * input that yields no names at all raise ERROR with
                            * ERRCODE_INVALID_NAME and the message "invalid name syntax".
    3395 ECB             :  */
    3396                 : List *
    3397 GIC         685 : textToQualifiedNameList(text *textval)
    3398 ECB             : {
    3399 EUB             :     char       *rawname;
    3400 GIC         685 :     List       *result = NIL;
    3401                 :     List       *namelist;
    3402                 :     ListCell   *l;
    3403                 : 
    3404 ECB             :     /* Convert to C string (handles possible detoasting). */
    3405                 :     /* Note we rely on being able to modify rawname below. */
    3406 CBC         685 :     rawname = text_to_cstring(textval);
    3407                 : 
                               /* Split at '.'; namelist entries point into rawname. */
    3408 GIC         685 :     if (!SplitIdentifierString(rawname, '.', &namelist))
    3409 LBC           0 :         ereport(ERROR,
    3410                 :                 (errcode(ERRCODE_INVALID_NAME),
    3411 ECB             :                  errmsg("invalid name syntax")));
    3412                 : 
                               /*
                                * SplitIdentifierString accepts an empty string and returns an empty
                                * list for it; that is not acceptable here.
                                */
    3413 GIC         685 :     if (namelist == NIL)
    3414 UIC           0 :         ereport(ERROR,
    3415                 :                 (errcode(ERRCODE_INVALID_NAME),
    3416                 :                  errmsg("invalid name syntax")));
    3417                 : 
                               /* Copy each component so the result outlives rawname. */
    3418 GIC        1425 :     foreach(l, namelist)
    3419                 :     {
    3420             740 :         char       *curname = (char *) lfirst(l);
    3421                 : 
    3422             740 :         result = lappend(result, makeString(pstrdup(curname)));
    3423                 :     }
    3424                 : 
                               /* The parse buffer and the list of pointers into it are done with. */
    3425             685 :     pfree(rawname);
    3426             685 :     list_free(namelist);
    3427                 : 
    3428             685 :     return result;
    3429                 : }
    3430                 : 
    3431                 : /*
    3432                 :  * SplitIdentifierString --- parse a string containing identifiers
    3433                 :  *
    3434                 :  * This is the guts of textToQualifiedNameList, and is exported for use in
    3435                 :  * other situations such as parsing GUC variables.  In the GUC case, it's
    3436                 :  * important to avoid memory leaks, so the API is designed to minimize the
    3437                 :  * amount of stuff that needs to be allocated and freed.
    3438                 :  *
    3439                 :  * Inputs:
    3440                 :  *  rawstring: the input string; must be overwritable!  On return, it's
    3441                 :  *             been modified to contain the separated identifiers.
    3442                 :  *  separator: the separator punctuation expected between identifiers
    3443                 :  *             (typically '.' or ',').  Whitespace may also appear around
    3444                 :  *             identifiers.
    3445                 :  * Outputs:
    3446                 :  *  namelist: filled with a palloc'd list of pointers to identifiers within
    3447 ECB             :  *            rawstring.  Caller should list_free() this even on error return.
    3448                 :  *
    3449                 :  * Returns true if okay, false if there is a syntax error in the string.
    3450                 :  *
    3451                 :  * Note that an empty string is considered okay here, though not in
    3452                 :  * textToQualifiedNameList.
    3453                 :  */
    3454                 : bool
    3455 CBC       72651 : SplitIdentifierString(char *rawstring, char separator,
    3456 EUB             :                       List **namelist)
    3457                 : {
    3458 CBC       72651 :     char       *nextp = rawstring;
    3459           72651 :     bool        done = false;
    3460                 : 
                               /* Start from an empty list so *namelist is set on every return path. */
    3461 GIC       72651 :     *namelist = NIL;
    3462                 : 
    3463           72651 :     while (scanner_isspace(*nextp))
    3464 UIC           0 :         nextp++;                /* skip leading whitespace */
    3465                 : 
    3466 GIC       72651 :     if (*nextp == '\0')
    3467 CBC        9918 :         return true;            /* allow empty string */
    3468                 : 
    3469                 :     /* At the top of the loop, we are at start of a new identifier. */
    3470 ECB             :     do
    3471                 :     {
                                   /* curname: start of the current name; endp: where its NUL will go */
    3472                 :         char       *curname;
    3473                 :         char       *endp;
    3474                 : 
    3475 GBC       99650 :         if (*nextp == '"')
    3476 ECB             :         {
    3477                 :             /* Quoted name --- collapse quote-quote pairs, no downcasing */
    3478 GIC       15347 :             curname = nextp + 1;
    3479 ECB             :             for (;;)
    3480                 :             {
    3481 GIC       15349 :                 endp = strchr(nextp + 1, '"');
    3482           15348 :                 if (endp == NULL)
    3483 LBC           0 :                     return false;   /* mismatched quotes */
    3484 GIC       15348 :                 if (endp[1] != '"')
    3485           15347 :                     break;      /* found end of quoted name */
    3486                 :                 /* Collapse adjacent quotes into one quote, and look again */
                                           /*
                                            * strlen(endp) bytes starting at endp + 1 is the rest of the
                                            * string plus its terminating NUL, so the tail shifts left by
                                            * one.  Setting nextp = endp makes the next strchr() start
                                            * just past the quote that was kept.
                                            */
    3487               1 :                 memmove(endp, endp + 1, strlen(endp));
    3488 CBC           1 :                 nextp = endp;
    3489 ECB             :             }
    3490                 :             /* endp now points at the terminating quote */
    3491 CBC       15347 :             nextp = endp + 1;
    3492 ECB             :         }
    3493                 :         else
    3494 EUB             :         {
    3495                 :             /* Unquoted name --- extends to separator or whitespace */
    3496                 :             char       *downname;
    3497 ECB             :             int         len;
    3498 EUB             : 
    3499 GIC       84303 :             curname = nextp;
    3500 CBC      727631 :             while (*nextp && *nextp != separator &&
    3501 GIC      643329 :                    !scanner_isspace(*nextp))
    3502 CBC      643328 :                 nextp++;
    3503           84303 :             endp = nextp;
    3504           84303 :             if (curname == nextp)
    3505 UIC           0 :                 return false;   /* empty unquoted name not allowed */
    3506                 : 
    3507 ECB             :             /*
    3508                 :              * Downcase the identifier, using same code as main lexer does.
    3509                 :              *
    3510 EUB             :              * XXX because we want to overwrite the input in-place, we cannot
    3511                 :              * support a downcasing transformation that increases the string
    3512                 :              * length.  This is not a problem given the current implementation
    3513 ECB             :              * of downcase_truncate_identifier, but we'll probably have to do
    3514                 :              * something about this someday.
                                        *
                                        * strncpy() writes exactly len bytes here: it adds no terminator
                                        * when downname is len bytes long and NUL-pads when it is shorter.
                                        * Either way the character at *endp is left untouched for the
                                        * separator/end-of-string checks below.
    3515                 :              */
    3516 GIC       84303 :             len = endp - curname;
    3517           84303 :             downname = downcase_truncate_identifier(curname, len, false);
    3518 CBC       84303 :             Assert(strlen(downname) <= len);
    3519 GIC       84303 :             strncpy(curname, downname, len);    /* strncpy is required here */
    3520           84303 :             pfree(downname);
    3521 ECB             :         }
    3522                 : 
    3523 CBC       99651 :         while (scanner_isspace(*nextp))
    3524 GIC           1 :             nextp++;            /* skip trailing whitespace */
    3525                 : 
    3526           99650 :         if (*nextp == separator)
    3527                 :         {
    3528           36917 :             nextp++;
    3529           59216 :             while (scanner_isspace(*nextp))
    3530           22299 :                 nextp++;        /* skip leading whitespace for next */
    3531                 :             /* we expect another name, so done remains false */
    3532                 :         }
    3533           62733 :         else if (*nextp == '\0')
    3534           62732 :             done = true;
    3535                 :         else
                                       /* The name just scanned is not added; earlier ones stay listed. */
    3536 CBC           1 :             return false;       /* invalid syntax */
    3537                 : 
    3538 ECB             :         /* Now safe to overwrite separator with a null */
                                   /*
                                    * For an unquoted name endp can be the separator position itself,
                                    * which is why termination waits until after the test above.  For a
                                    * quoted name endp is the closing quote.
                                    */
    3539 CBC       99649 :         *endp = '\0';
    3540                 : 
    3541                 :         /* Truncate name if it's overlength */
    3542 GIC       99649 :         truncate_identifier(curname, strlen(curname), false);
    3543                 : 
    3544                 :         /*
    3545                 :          * Finished isolating current name --- add it to list
                                    * (the list stores a pointer into rawstring, not a copy)
    3546                 :          */
    3547           99649 :         *namelist = lappend(*namelist, curname);
    3548 ECB             : 
    3549                 :         /* Loop back if we didn't reach end of string */
    3550 CBC       99649 :     } while (!done);
    3551 ECB             : 
    3552 GIC       62732 :     return true;
    3553                 : }
    3554 ECB             : 
    3555                 : 
    3556                 : /*
    3557                 :  * SplitDirectoriesString --- parse a string containing file/directory names
    3558                 :  *
    3559                 :  * This works fine on file names too; the function name is historical.
    3560                 :  *
    3561                 :  * This is similar to SplitIdentifierString, except that the parsing
    3562                 :  * rules are meant to handle pathnames instead of identifiers: there is
    3563                 :  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
    3564                 :  * and we apply canonicalize_path() to each extracted string.  Because of the
    3565                 :  * last, the returned strings are separately palloc'd rather than being
    3566                 :  * pointers into rawstring --- but we still scribble on rawstring.
    3567                 :  *
    3568                 :  * Inputs:
    3569                 :  *  rawstring: the input string; must be modifiable!
    3570                 :  *  separator: the separator punctuation expected between directories
    3571                 :  *             (typically ',' or ';').  Whitespace may also appear around
    3572                 :  *             directories.
    3573                 :  * Outputs:
    3574                 :  *  namelist: filled with a palloc'd list of directory names.
    3575                 :  *            Caller should list_free_deep() this even on error return.
    3576                 :  *
    3577                 :  * Returns true if okay, false if there is a syntax error in the string.
    3578                 :  *
    3579                 :  * Note that an empty string is considered okay here.
    3580                 :  */
    3581                 : bool
    3582 CBC         621 : SplitDirectoriesString(char *rawstring, char separator,
    3583 EUB             :                        List **namelist)
    3584                 : {
    3585 GIC         621 :     char       *nextp = rawstring;
    3586 CBC         621 :     bool        done = false;
    3587 ECB             : 
                               /* Start from an empty list so *namelist is set on every return path. */
    3588 GIC         621 :     *namelist = NIL;
    3589 ECB             : 
    3590 GIC         621 :     while (scanner_isspace(*nextp))
    3591 UIC           0 :         nextp++;                /* skip leading whitespace */
    3592 ECB             : 
    3593 CBC         621 :     if (*nextp == '\0')
    3594 GIC           3 :         return true;            /* allow empty string */
    3595                 : 
    3596 ECB             :     /* At the top of the loop, we are at start of a new directory. */
    3597                 :     do
    3598                 :     {
                                   /* curname: start of the current name; endp: where its NUL will go */
    3599                 :         char       *curname;
    3600                 :         char       *endp;
    3601                 : 
    3602 CBC         618 :         if (*nextp == '"')
    3603 ECB             :         {
    3604                 :             /* Quoted name --- collapse quote-quote pairs */
    3605 UIC           0 :             curname = nextp + 1;
    3606                 :             for (;;)
    3607                 :             {
    3608 LBC           0 :                 endp = strchr(nextp + 1, '"');
    3609               0 :                 if (endp == NULL)
    3610 UIC           0 :                     return false;   /* mismatched quotes */
    3611 LBC           0 :                 if (endp[1] != '"')
    3612 UIC           0 :                     break;      /* found end of quoted name */
    3613 ECB             :                 /* Collapse adjacent quotes into one quote, and look again */
                                           /*
                                            * strlen(endp) bytes starting at endp + 1 is the rest of the
                                            * string plus its terminating NUL, so the tail shifts left by
                                            * one.  Setting nextp = endp makes the next strchr() start
                                            * just past the quote that was kept.
                                            */
    3614 LBC           0 :                 memmove(endp, endp + 1, strlen(endp));
    3615 UIC           0 :                 nextp = endp;
    3616 ECB             :             }
    3617                 :             /* endp now points at the terminating quote */
    3618 UIC           0 :             nextp = endp + 1;
    3619                 :         }
    3620 ECB             :         else
    3621                 :         {
    3622                 :             /* Unquoted name --- extends to separator or end of string */
                                       /*
                                        * endp trails one past the most recent non-whitespace character,
                                        * so interior whitespace stays in the name while whitespace just
                                        * before the separator or end of string is left out.
                                        */
    3623 CBC         618 :             curname = endp = nextp;
    3624 GIC       10456 :             while (*nextp && *nextp != separator)
    3625                 :             {
    3626                 :                 /* trailing whitespace should not be included in name */
    3627            9838 :                 if (!scanner_isspace(*nextp))
    3628 CBC        9838 :                     endp = nextp + 1;
    3629            9838 :                 nextp++;
    3630                 :             }
    3631             618 :             if (curname == endp)
    3632 UIC           0 :                 return false;   /* empty unquoted name not allowed */
    3633 ECB             :         }
    3634                 : 
    3635 GIC         618 :         while (scanner_isspace(*nextp))
    3636 LBC           0 :             nextp++;            /* skip trailing whitespace */
    3637                 : 
    3638 GIC         618 :         if (*nextp == separator)
    3639                 :         {
    3640 LBC           0 :             nextp++;
    3641 UIC           0 :             while (scanner_isspace(*nextp))
    3642 LBC           0 :                 nextp++;        /* skip leading whitespace for next */
    3643 ECB             :             /* we expect another name, so done remains false */
    3644                 :         }
    3645 GIC         618 :         else if (*nextp == '\0')
    3646             618 :             done = true;
    3647                 :         else
                                       /* The name just scanned is not added; earlier ones stay listed. */
    3648 LBC           0 :             return false;       /* invalid syntax */
    3649 ECB             : 
    3650                 :         /* Now safe to overwrite separator with a null */
                                   /*
                                    * In the unquoted case endp may instead be the first trailing
                                    * whitespace character; in the quoted case it is the closing quote.
                                    */
    3651 CBC         618 :         *endp = '\0';
    3652                 : 
    3653 ECB             :         /* Truncate path if it's overlength */
                                   /* Cuts the string to MAXPGPATH - 1 bytes, inside rawstring. */
    3654 CBC         618 :         if (strlen(curname) >= MAXPGPATH)
    3655 UIC           0 :             curname[MAXPGPATH - 1] = '\0';
    3656 ECB             : 
    3657                 :         /*
    3658                 :          * Finished isolating current name --- add it to list
                                    *
                                    * The name is pstrdup'd first, so canonicalize_path() works on the
                                    * copy and the list holds separately allocated strings.
    3659                 :          */
    3660 CBC         618 :         curname = pstrdup(curname);
    3661 GIC         618 :         canonicalize_path(curname);
    3662 CBC         618 :         *namelist = lappend(*namelist, curname);
    3663 ECB             : 
    3664                 :         /* Loop back if we didn't reach end of string */
    3665 GIC         618 :     } while (!done);
    3666                 : 
    3667             618 :     return true;
    3668 ECB             : }
    3669                 : 
    3670                 : 
    3671                 : /*
    3672                 :  * SplitGUCList --- parse a string containing identifiers or file names
    3673                 :  *
    3674                 :  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
    3675                 :  * presuming whether the elements will be taken as identifiers or file names.
    3676                 :  * We assume the input has already been through flatten_set_variable_args(),
    3677                 :  * so that we need never downcase (if appropriate, that was done already).
    3678                 :  * Nor do we ever truncate, since we don't know the correct max length.
    3679                 :  * We disallow embedded whitespace for simplicity (it shouldn't matter,
    3680                 :  * because any embedded whitespace should have led to double-quoting).
    3681                 :  * Otherwise the API is identical to SplitIdentifierString.
    3682                 :  *
    3683                 :  * XXX it's annoying to have so many copies of this string-splitting logic.
    3684                 :  * However, it's not clear that having one function with a bunch of option
    3685                 :  * flags would be much better.
    3686                 :  *
    3687                 :  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
    3688                 :  * Be sure to update that if you have to change this.
    3689                 :  *
    3690                 :  * Inputs:
    3691                 :  *  rawstring: the input string; must be overwritable!  On return, it's
    3692                 :  *             been modified to contain the separated identifiers.
    3693                 :  *  separator: the separator punctuation expected between identifiers
    3694                 :  *             (typically '.' or ',').  Whitespace may also appear around
    3695                 :  *             identifiers.
    3696                 :  * Outputs:
    3697                 :  *  namelist: filled with a palloc'd list of pointers to identifiers within
    3698                 :  *            rawstring.  Caller should list_free() this even on error return.
    3699                 :  *
    3700                 :  * Returns true if okay, false if there is a syntax error in the string.
    3701                 :  */
    3702                 : bool
    3703 GIC        2458 : SplitGUCList(char *rawstring, char separator,
    3704 ECB             :              List **namelist)
    3705                 : {
    3706 GIC        2458 :     char       *nextp = rawstring;
    3707 CBC        2458 :     bool        done = false;
    3708                 : 
                               /* Start from an empty list so *namelist is set on every return path. */
    3709 GIC        2458 :     *namelist = NIL;
    3710 ECB             : 
    3711 GIC        2458 :     while (scanner_isspace(*nextp))
    3712 LBC           0 :         nextp++;                /* skip leading whitespace */
    3713                 : 
    3714 CBC        2458 :     if (*nextp == '\0')
    3715 GIC        2416 :         return true;            /* allow empty string */
    3716                 : 
    3717                 :     /* At the top of the loop, we are at start of a new identifier. */
    3718                 :     do
    3719                 :     {
                                   /* curname: start of the current name; endp: where its NUL will go */
    3720                 :         char       *curname;
    3721                 :         char       *endp;
    3722                 : 
    3723              55 :         if (*nextp == '"')
    3724 ECB             :         {
    3725                 :             /* Quoted name --- collapse quote-quote pairs */
    3726 CBC          12 :             curname = nextp + 1;
    3727 ECB             :             for (;;)
    3728                 :             {
    3729 GIC          18 :                 endp = strchr(nextp + 1, '"');
    3730              15 :                 if (endp == NULL)
    3731 UIC           0 :                     return false;   /* mismatched quotes */
    3732 GIC          15 :                 if (endp[1] != '"')
    3733              12 :                     break;      /* found end of quoted name */
    3734                 :                 /* Collapse adjacent quotes into one quote, and look again */
                                           /*
                                            * strlen(endp) bytes starting at endp + 1 is the rest of the
                                            * string plus its terminating NUL, so the tail shifts left by
                                            * one.  Setting nextp = endp makes the next strchr() start
                                            * just past the quote that was kept.
                                            */
    3735               3 :                 memmove(endp, endp + 1, strlen(endp));
    3736               3 :                 nextp = endp;
    3737                 :             }
    3738 ECB             :             /* endp now points at the terminating quote */
    3739 GIC          12 :             nextp = endp + 1;
    3740 ECB             :         }
    3741                 :         else
    3742                 :         {
    3743                 :             /* Unquoted name --- extends to separator or whitespace */
                                       /* Taken verbatim: no downcasing is applied in this function. */
    3744 GIC          43 :             curname = nextp;
    3745             409 :             while (*nextp && *nextp != separator &&
    3746             366 :                    !scanner_isspace(*nextp))
    3747             366 :                 nextp++;
    3748              43 :             endp = nextp;
    3749              43 :             if (curname == nextp)
    3750 UIC           0 :                 return false;   /* empty unquoted name not allowed */
    3751                 :         }
    3752                 : 
    3753 CBC          55 :         while (scanner_isspace(*nextp))
    3754 LBC           0 :             nextp++;            /* skip trailing whitespace */
    3755                 : 
    3756 GIC          55 :         if (*nextp == separator)
    3757 ECB             :         {
    3758 GIC          13 :             nextp++;
    3759 GBC          22 :             while (scanner_isspace(*nextp))
    3760 GIC           9 :                 nextp++;        /* skip leading whitespace for next */
    3761                 :             /* we expect another name, so done remains false */
    3762 ECB             :         }
    3763 GIC          42 :         else if (*nextp == '\0')
    3764 CBC          42 :             done = true;
    3765                 :         else
                                       /* The name just scanned is not added; earlier ones stay listed. */
    3766 UIC           0 :             return false;       /* invalid syntax */
    3767 ECB             : 
    3768                 :         /* Now safe to overwrite separator with a null */
                                   /*
                                    * For an unquoted name endp can be the separator position itself,
                                    * which is why termination waits until after the test above.  For a
                                    * quoted name endp is the closing quote.
                                    */
    3769 CBC          55 :         *endp = '\0';
    3770 ECB             : 
    3771                 :         /*
    3772                 :          * Finished isolating current name --- add it to list
                                    * (the list stores a pointer into rawstring, not a copy, and the
                                    * name is not truncated)
    3773                 :          */
    3774 GIC          55 :         *namelist = lappend(*namelist, curname);
    3775 ECB             : 
    3776                 :         /* Loop back if we didn't reach end of string */
    3777 GIC          55 :     } while (!done);
    3778                 : 
    3779 CBC          42 :     return true;
    3780                 : }
    3781                 : 
    3782 ECB             : 
    3783                 : /*****************************************************************************
    3784                 :  *  Comparison Functions used for bytea
    3785                 :  *
    3786                 :  * Note: btree indexes need these routines not to leak memory; therefore,
    3787                 :  * be careful to free working copies of toasted datums.  Most places don't
    3788                 :  * need to be so careful.
    3789                 :  *****************************************************************************/
    3790                 : 
    3791                 : Datum
    3792 GIC        5188 : byteaeq(PG_FUNCTION_ARGS)
    3793 ECB             : {
                               /*
                                * Equality test for two bytea arguments: true only when both have the
                                * same size and identical bytes.  The arguments are taken as plain
                                * Datums so the sizes can be compared before any DatumGetByteaPP call.
                                */
    3794 GIC        5188 :     Datum       arg1 = PG_GETARG_DATUM(0);
    3795            5188 :     Datum       arg2 = PG_GETARG_DATUM(1);
    3796 ECB             :     bool        result;
    3797                 :     Size        len1,
    3798                 :                 len2;
    3799                 : 
    3800                 :     /*
    3801                 :      * We can use a fast path for unequal lengths, which might save us from
    3802                 :      * having to detoast one or both values.
    3803                 :      */
    3804 CBC        5188 :     len1 = toast_raw_datum_size(arg1);
    3805 GIC        5188 :     len2 = toast_raw_datum_size(arg2);
    3806            5188 :     if (len1 != len2)
    3807            2154 :         result = false;
    3808                 :     else
    3809                 :     {
    3810            3034 :         bytea      *barg1 = DatumGetByteaPP(arg1);
    3811            3034 :         bytea      *barg2 = DatumGetByteaPP(arg2);
    3812                 : 
                                   /*
                                    * VARHDRSZ is subtracted from the size obtained above to get the
                                    * number of payload bytes to compare (the raw size evidently counts
                                    * the varlena header).
                                    */
    3813            3034 :         result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
    3814                 :                          len1 - VARHDRSZ) == 0);
    3815 ECB             : 
                                   /* Release any working copies; this routine must not leak memory. */
    3816 GIC        3034 :         PG_FREE_IF_COPY(barg1, 0);
    3817 CBC        3034 :         PG_FREE_IF_COPY(barg2, 1);
    3818 ECB             :     }
    3819                 : 
    3820 GIC        5188 :     PG_RETURN_BOOL(result);
    3821 ECB             : }
    3822                 : 
    3823                 : Datum
    3824 CBC         384 : byteane(PG_FUNCTION_ARGS)
    3825 ECB             : {
                               /*
                                * Inequality test for two bytea arguments: true when the sizes differ
                                * or any byte differs.  The arguments are taken as plain Datums so the
                                * sizes can be compared before any DatumGetByteaPP call.
                                */
    3826 CBC         384 :     Datum       arg1 = PG_GETARG_DATUM(0);
    3827             384 :     Datum       arg2 = PG_GETARG_DATUM(1);
    3828                 :     bool        result;
    3829 ECB             :     Size        len1,
    3830                 :                 len2;
    3831                 : 
    3832                 :     /*
    3833                 :      * We can use a fast path for unequal lengths, which might save us from
    3834                 :      * having to detoast one or both values.
    3835                 :      */
    3836 GIC         384 :     len1 = toast_raw_datum_size(arg1);
    3837 CBC         384 :     len2 = toast_raw_datum_size(arg2);
    3838 GIC         384 :     if (len1 != len2)
    3839 UIC           0 :         result = true;
    3840                 :     else
    3841                 :     {
    3842 GIC         384 :         bytea      *barg1 = DatumGetByteaPP(arg1);
    3843             384 :         bytea      *barg2 = DatumGetByteaPP(arg2);
    3844                 : 
                                   /*
                                    * VARHDRSZ is subtracted from the size obtained above to get the
                                    * number of payload bytes to compare (the raw size evidently counts
                                    * the varlena header).
                                    */
    3845             384 :         result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
    3846                 :                          len1 - VARHDRSZ) != 0);
    3847                 : 
                                   /* Release any working copies; this routine must not leak memory. */
    3848 CBC         384 :         PG_FREE_IF_COPY(barg1, 0);
    3849 GIC         384 :         PG_FREE_IF_COPY(barg2, 1);
    3850                 :     }
    3851                 : 
    3852 CBC         384 :     PG_RETURN_BOOL(result);
    3853 ECB             : }
    3854                 : 
    3855                 : Datum
    3856 GIC        4158 : bytealt(PG_FUNCTION_ARGS)
    3857 ECB             : {
                               /*
                                * Less-than for two bytea arguments.  The first Min(len1, len2) bytes
                                * are compared with memcmp(); if that common prefix is equal, the
                                * shorter value is the smaller one.
                                */
    3858 GIC        4158 :     bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
    3859            4158 :     bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
    3860                 :     int         len1,
    3861                 :                 len2;
    3862 ECB             :     int         cmp;
    3863                 : 
                               /* Payload lengths in bytes, excluding the varlena header. */
    3864 CBC        4158 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    3865 GIC        4158 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    3866                 : 
    3867 CBC        4158 :     cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    3868 ECB             : 
                               /* Only cmp and the lengths are needed from here on. */
    3869 GIC        4158 :     PG_FREE_IF_COPY(arg1, 0);
    3870            4158 :     PG_FREE_IF_COPY(arg2, 1);
    3871 ECB             : 
    3872 CBC        4158 :     PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
    3873 ECB             : }
    3874                 : 
    3875                 : Datum
    3876 GIC        3178 : byteale(PG_FUNCTION_ARGS)
    3877                 : {
                               /*
                                * Less-than-or-equal for two bytea arguments.  The first
                                * Min(len1, len2) bytes are compared with memcmp(); if that common
                                * prefix is equal, the result is true when arg1 is not the longer one.
                                */
    3878 CBC        3178 :     bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
    3879            3178 :     bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
    3880                 :     int         len1,
    3881                 :                 len2;
    3882 ECB             :     int         cmp;
    3883                 : 
                               /* Payload lengths in bytes, excluding the varlena header. */
    3884 GIC        3178 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    3885 CBC        3178 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    3886                 : 
    3887            3178 :     cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    3888 ECB             : 
                               /* Only cmp and the lengths are needed from here on. */
    3889 CBC        3178 :     PG_FREE_IF_COPY(arg1, 0);
    3890 GIC        3178 :     PG_FREE_IF_COPY(arg2, 1);
    3891 ECB             : 
    3892 GIC        3178 :     PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
    3893                 : }
    3894 ECB             : 
    3895                 : Datum
    3896 CBC        3114 : byteagt(PG_FUNCTION_ARGS)
    3897                 : {
    3898            3114 :     bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
    3899 GIC        3114 :     bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
    3900                 :     int         len1,
    3901 ECB             :                 len2;
    3902                 :     int         cmp;
    3903                 : 
    3904 GIC        3114 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    3905            3114 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    3906                 : 
    3907            3114 :     cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    3908                 : 
    3909            3114 :     PG_FREE_IF_COPY(arg1, 0);
    3910            3114 :     PG_FREE_IF_COPY(arg2, 1);
    3911                 : 
    3912 CBC        3114 :     PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
    3913 ECB             : }
    3914                 : 
    3915                 : Datum
    3916 CBC        2505 : byteage(PG_FUNCTION_ARGS)
    3917                 : {
    3918 GIC        2505 :     bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
    3919            2505 :     bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
    3920                 :     int         len1,
    3921                 :                 len2;
    3922                 :     int         cmp;
    3923                 : 
    3924            2505 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    3925 CBC        2505 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    3926 ECB             : 
    3927 CBC        2505 :     cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    3928 ECB             : 
    3929 CBC        2505 :     PG_FREE_IF_COPY(arg1, 0);
    3930 GIC        2505 :     PG_FREE_IF_COPY(arg2, 1);
    3931                 : 
    3932 CBC        2505 :     PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
    3933                 : }
    3934                 : 
    3935                 : Datum
    3936 GIC       43878 : byteacmp(PG_FUNCTION_ARGS)
    3937                 : {
    3938           43878 :     bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
    3939           43878 :     bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
    3940                 :     int         len1,
    3941                 :                 len2;
    3942                 :     int         cmp;
    3943                 : 
    3944           43878 :     len1 = VARSIZE_ANY_EXHDR(arg1);
    3945           43878 :     len2 = VARSIZE_ANY_EXHDR(arg2);
    3946                 : 
    3947           43878 :     cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
    3948 CBC       43878 :     if ((cmp == 0) && (len1 != len2))
    3949 GIC        7326 :         cmp = (len1 < len2) ? -1 : 1;
    3950                 : 
    3951           43878 :     PG_FREE_IF_COPY(arg1, 0);
    3952           43878 :     PG_FREE_IF_COPY(arg2, 1);
    3953                 : 
    3954           43878 :     PG_RETURN_INT32(cmp);
    3955 ECB             : }
    3956                 : 
    3957                 : Datum
    3958 GIC          13 : bytea_sortsupport(PG_FUNCTION_ARGS)
    3959 ECB             : {
    3960 GIC          13 :     SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
    3961                 :     MemoryContext oldcontext;
    3962                 : 
    3963              13 :     oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
    3964                 : 
    3965                 :     /* Use generic string SortSupport, forcing "C" collation */
    3966 CBC          13 :     varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
    3967                 : 
    3968 GIC          13 :     MemoryContextSwitchTo(oldcontext);
    3969 ECB             : 
    3970 CBC          13 :     PG_RETURN_VOID();
    3971                 : }
    3972                 : 
    3973 ECB             : /*
    3974                 :  * appendStringInfoText
    3975                 :  *
    3976                 :  * Append a text to str.
    3977                 :  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
    3978                 :  */
    3979                 : static void
    3980 CBC      832552 : appendStringInfoText(StringInfo str, const text *t)
    3981                 : {
    3982 GIC      832552 :     appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
    3983          832552 : }
    3984 ECB             : 
    3985                 : /*
    3986                 :  * replace_text
    3987                 :  * replace all occurrences of 'old_sub_str' in 'orig_str'
    3988                 :  * with 'new_sub_str' to form 'new_str'
    3989                 :  *
    3990                 :  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
    3991                 :  * otherwise returns 'new_str'
    3992                 :  */
    3993                 : Datum
    3994 CBC        1275 : replace_text(PG_FUNCTION_ARGS)
    3995                 : {
    3996            1275 :     text       *src_text = PG_GETARG_TEXT_PP(0);
    3997 GIC        1275 :     text       *from_sub_text = PG_GETARG_TEXT_PP(1);
    3998            1275 :     text       *to_sub_text = PG_GETARG_TEXT_PP(2);
    3999                 :     int         src_text_len;
    4000                 :     int         from_sub_text_len;
    4001                 :     TextPositionState state;
    4002                 :     text       *ret_text;
    4003                 :     int         chunk_len;
    4004                 :     char       *curr_ptr;
    4005 ECB             :     char       *start_ptr;
    4006                 :     StringInfoData str;
    4007                 :     bool        found;
    4008                 : 
    4009 GIC        1275 :     src_text_len = VARSIZE_ANY_EXHDR(src_text);
    4010            1275 :     from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
    4011                 : 
    4012 EUB             :     /* Return unmodified source string if empty source or pattern */
    4013 GBC        1275 :     if (src_text_len < 1 || from_sub_text_len < 1)
    4014                 :     {
    4015 UIC           0 :         PG_RETURN_TEXT_P(src_text);
    4016                 :     }
    4017                 : 
    4018 GIC        1275 :     text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
    4019                 : 
    4020            1275 :     found = text_position_next(&state);
    4021 ECB             : 
    4022                 :     /* When the from_sub_text is not found, there is nothing to do. */
    4023 GIC        1275 :     if (!found)
    4024                 :     {
    4025             390 :         text_position_cleanup(&state);
    4026             390 :         PG_RETURN_TEXT_P(src_text);
    4027                 :     }
    4028             885 :     curr_ptr = text_position_get_match_ptr(&state);
    4029 CBC         885 :     start_ptr = VARDATA_ANY(src_text);
    4030 ECB             : 
    4031 GBC         885 :     initStringInfo(&str);
    4032 ECB             : 
    4033                 :     do
    4034                 :     {
    4035 GIC        3119 :         CHECK_FOR_INTERRUPTS();
    4036                 : 
    4037                 :         /* copy the data skipped over by last text_position_next() */
    4038            3119 :         chunk_len = curr_ptr - start_ptr;
    4039 CBC        3119 :         appendBinaryStringInfo(&str, start_ptr, chunk_len);
    4040                 : 
    4041 GIC        3119 :         appendStringInfoText(&str, to_sub_text);
    4042                 : 
    4043 CBC        3119 :         start_ptr = curr_ptr + from_sub_text_len;
    4044 ECB             : 
    4045 CBC        3119 :         found = text_position_next(&state);
    4046 GIC        3119 :         if (found)
    4047            2234 :             curr_ptr = text_position_get_match_ptr(&state);
    4048                 :     }
    4049            3119 :     while (found);
    4050                 : 
    4051 ECB             :     /* copy trailing data */
    4052 CBC         885 :     chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
    4053 GIC         885 :     appendBinaryStringInfo(&str, start_ptr, chunk_len);
    4054                 : 
    4055             885 :     text_position_cleanup(&state);
    4056                 : 
    4057             885 :     ret_text = cstring_to_text_with_len(str.data, str.len);
    4058 CBC         885 :     pfree(str.data);
    4059 ECB             : 
    4060 GIC         885 :     PG_RETURN_TEXT_P(ret_text);
    4061                 : }
    4062 ECB             : 
    4063                 : /*
    4064                 :  * check_replace_text_has_escape
    4065                 :  *
    4066                 :  * Returns 0 if text contains no backslashes that need processing.
    4067                 :  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
    4068                 :  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
    4069                 :  */
    4070                 : static int
    4071 GIC        5263 : check_replace_text_has_escape(const text *replace_text)
    4072 ECB             : {
    4073 CBC        5263 :     int         result = 0;
    4074 GIC        5263 :     const char *p = VARDATA_ANY(replace_text);
    4075            5263 :     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
    4076                 : 
    4077           10548 :     while (p < p_end)
    4078                 :     {
    4079                 :         /* Find next escape char, if any. */
    4080            4635 :         p = memchr(p, '\\', p_end - p);
    4081 CBC        4635 :         if (p == NULL)
    4082            4296 :             break;
    4083             339 :         p++;
    4084                 :         /* Note: a backslash at the end doesn't require extra processing. */
    4085 GIC         339 :         if (p < p_end)
    4086                 :         {
    4087             339 :             if (*p >= '1' && *p <= '9')
    4088             317 :                 return 2;       /* Found a submatch specifier, so done */
    4089 CBC          22 :             result = 1;         /* Found some other sequence, keep looking */
    4090 GIC          22 :             p++;
    4091                 :         }
    4092                 :     }
    4093 CBC        4946 :     return result;
    4094 ECB             : }
    4095                 : 
    4096                 : /*
    4097                 :  * appendStringInfoRegexpSubstr
    4098                 :  *
    4099                 :  * Append replace_text to str, substituting regexp back references for
    4100                 :  * \n escapes.  start_ptr is the start of the match in the source string,
    4101                 :  * at logical character position data_pos.
    4102                 :  */
    4103                 : static void
    4104 GIC         106 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
    4105                 :                              regmatch_t *pmatch,
    4106                 :                              char *start_ptr, int data_pos)
    4107                 : {
    4108             106 :     const char *p = VARDATA_ANY(replace_text);
    4109             106 :     const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
    4110 ECB             : 
    4111 GIC         263 :     while (p < p_end)
    4112 ECB             :     {
    4113 CBC         235 :         const char *chunk_start = p;
    4114 ECB             :         int         so;
    4115                 :         int         eo;
    4116                 : 
    4117                 :         /* Find next escape char, if any. */
    4118 GIC         235 :         p = memchr(p, '\\', p_end - p);
    4119             235 :         if (p == NULL)
    4120              75 :             p = p_end;
    4121                 : 
    4122                 :         /* Copy the text we just scanned over, if any. */
    4123             235 :         if (p > chunk_start)
    4124 CBC         147 :             appendBinaryStringInfo(str, chunk_start, p - chunk_start);
    4125 ECB             : 
    4126                 :         /* Done if at end of string, else advance over escape char. */
    4127 GIC         235 :         if (p >= p_end)
    4128              75 :             break;
    4129 CBC         160 :         p++;
    4130 ECB             : 
    4131 GIC         160 :         if (p >= p_end)
    4132                 :         {
    4133 ECB             :             /* Escape at very end of input.  Treat same as unexpected char */
    4134 CBC           3 :             appendStringInfoChar(str, '\\');
    4135 GIC           3 :             break;
    4136                 :         }
    4137 ECB             : 
    4138 GIC         157 :         if (*p >= '1' && *p <= '9')
    4139             127 :         {
    4140 ECB             :             /* Use the back reference of regexp. */
    4141 CBC         127 :             int         idx = *p - '0';
    4142                 : 
    4143             127 :             so = pmatch[idx].rm_so;
    4144 GIC         127 :             eo = pmatch[idx].rm_eo;
    4145             127 :             p++;
    4146                 :         }
    4147 CBC          30 :         else if (*p == '&')
    4148                 :         {
    4149 ECB             :             /* Use the entire matched string. */
    4150 GIC           9 :             so = pmatch[0].rm_so;
    4151               9 :             eo = pmatch[0].rm_eo;
    4152 CBC           9 :             p++;
    4153                 :         }
    4154              21 :         else if (*p == '\\')
    4155                 :         {
    4156 ECB             :             /* \\ means transfer one \ to output. */
    4157 CBC          18 :             appendStringInfoChar(str, '\\');
    4158 GIC          18 :             p++;
    4159 CBC          18 :             continue;
    4160                 :         }
    4161                 :         else
    4162                 :         {
    4163                 :             /*
    4164                 :              * If escape char is not followed by any expected char, just treat
    4165                 :              * it as ordinary data to copy.  (XXX would it be better to throw
    4166 ECB             :              * an error?)
    4167                 :              */
    4168 GIC           3 :             appendStringInfoChar(str, '\\');
    4169 CBC           3 :             continue;
    4170                 :         }
    4171 ECB             : 
    4172 CBC         136 :         if (so >= 0 && eo >= 0)
    4173                 :         {
    4174                 :             /*
    4175 ECB             :              * Copy the text that is back reference of regexp.  Note so and eo
    4176                 :              * are counted in characters not bytes.
    4177                 :              */
    4178                 :             char       *chunk_start;
    4179                 :             int         chunk_len;
    4180                 : 
    4181 GIC         136 :             Assert(so >= data_pos);
    4182             136 :             chunk_start = start_ptr;
    4183             136 :             chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
    4184             136 :             chunk_len = charlen_to_bytelen(chunk_start, eo - so);
    4185 CBC         136 :             appendBinaryStringInfo(str, chunk_start, chunk_len);
    4186                 :         }
    4187                 :     }
    4188             106 : }
    4189                 : 
    4190 ECB             : /*
    4191                 :  * replace_text_regexp
    4192                 :  *
    4193                 :  * replace substring(s) in src_text that match pattern with replace_text.
    4194                 :  * The replace_text can contain backslash markers to substitute
    4195                 :  * (parts of) the matched text.
    4196                 :  *
    4197                 :  * cflags: regexp compile flags.
    4198                 :  * collation: collation to use.
    4199                 :  * search_start: the character (not byte) offset in src_text at which to
    4200                 :  * begin searching.
    4201                 :  * n: if 0, replace all matches; if > 0, replace only the N'th match.
    4202                 :  */
    4203                 : text *
    4204 CBC        5263 : replace_text_regexp(text *src_text, text *pattern_text,
    4205                 :                     text *replace_text,
    4206                 :                     int cflags, Oid collation,
    4207 ECB             :                     int search_start, int n)
    4208                 : {
    4209                 :     text       *ret_text;
    4210                 :     regex_t    *re;
    4211 GIC        5263 :     int         src_text_len = VARSIZE_ANY_EXHDR(src_text);
    4212            5263 :     int         nmatches = 0;
    4213 ECB             :     StringInfoData buf;
    4214                 :     regmatch_t  pmatch[10];     /* main match, plus \1 to \9 */
    4215 CBC        5263 :     int         nmatch = lengthof(pmatch);
    4216                 :     pg_wchar   *data;
    4217                 :     size_t      data_len;
    4218                 :     int         data_pos;
    4219 ECB             :     char       *start_ptr;
    4220                 :     int         escape_status;
    4221                 : 
    4222 GIC        5263 :     initStringInfo(&buf);
    4223 ECB             : 
    4224                 :     /* Convert data string to wide characters. */
    4225 GIC        5263 :     data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
    4226            5263 :     data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
    4227 ECB             : 
    4228                 :     /* Check whether replace_text has escapes, especially regexp submatches. */
    4229 GIC        5263 :     escape_status = check_replace_text_has_escape(replace_text);
    4230                 : 
    4231                 :     /* If no regexp submatches, we can use REG_NOSUB. */
    4232 CBC        5263 :     if (escape_status < 2)
    4233                 :     {
    4234 GIC        4946 :         cflags |= REG_NOSUB;
    4235 ECB             :         /* Also tell pg_regexec we only want the whole-match location. */
    4236 GIC        4946 :         nmatch = 1;
    4237                 :     }
    4238                 : 
    4239                 :     /* Prepare the regexp. */
    4240            5263 :     re = RE_compile_and_cache(pattern_text, cflags, collation);
    4241                 : 
    4242 ECB             :     /* start_ptr points to the data_pos'th character of src_text */
    4243 GIC        5263 :     start_ptr = (char *) VARDATA_ANY(src_text);
    4244 CBC        5263 :     data_pos = 0;
    4245                 : 
    4246 GIC        7251 :     while (search_start <= data_len)
    4247                 :     {
    4248                 :         int         regexec_result;
    4249                 : 
    4250            7248 :         CHECK_FOR_INTERRUPTS();
    4251                 : 
    4252            7248 :         regexec_result = pg_regexec(re,
    4253                 :                                     data,
    4254                 :                                     data_len,
    4255                 :                                     search_start,
    4256 ECB             :                                     NULL,   /* no details */
    4257                 :                                     nmatch,
    4258                 :                                     pmatch,
    4259                 :                                     0);
    4260                 : 
    4261 CBC        7248 :         if (regexec_result == REG_NOMATCH)
    4262 GIC        4583 :             break;
    4263 ECB             : 
    4264 CBC        2665 :         if (regexec_result != REG_OKAY)
    4265                 :         {
    4266 ECB             :             char        errMsg[100];
    4267                 : 
    4268 LBC           0 :             pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
    4269 UIC           0 :             ereport(ERROR,
    4270                 :                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
    4271                 :                      errmsg("regular expression failed: %s", errMsg)));
    4272                 :         }
    4273                 : 
    4274                 :         /*
    4275                 :          * Count matches, and decide whether to replace this match.
    4276                 :          */
    4277 GIC        2665 :         nmatches++;
    4278            2665 :         if (n > 0 && nmatches != n)
    4279                 :         {
    4280                 :             /*
    4281 ECB             :              * No, so advance search_start, but not start_ptr/data_pos. (Thus,
    4282                 :              * we treat the matched text as if it weren't matched, and copy it
    4283                 :              * to the output later.)
    4284                 :              */
    4285 GIC          30 :             search_start = pmatch[0].rm_eo;
    4286              30 :             if (pmatch[0].rm_so == pmatch[0].rm_eo)
    4287 UIC           0 :                 search_start++;
    4288 GIC          30 :             continue;
    4289                 :         }
    4290                 : 
    4291                 :         /*
    4292 ECB             :          * Copy the text to the left of the match position.  Note we are given
    4293                 :          * character not byte indexes.
    4294                 :          */
    4295 GIC        2635 :         if (pmatch[0].rm_so - data_pos > 0)
    4296                 :         {
    4297 ECB             :             int         chunk_len;
    4298                 : 
    4299 CBC        2550 :             chunk_len = charlen_to_bytelen(start_ptr,
    4300            2550 :                                            pmatch[0].rm_so - data_pos);
    4301 GIC        2550 :             appendBinaryStringInfo(&buf, start_ptr, chunk_len);
    4302 ECB             : 
    4303                 :             /*
    4304                 :              * Advance start_ptr over that text, to avoid multiple rescans of
    4305                 :              * it if the replace_text contains multiple back-references.
    4306                 :              */
    4307 GIC        2550 :             start_ptr += chunk_len;
    4308            2550 :             data_pos = pmatch[0].rm_so;
    4309                 :         }
    4310                 : 
    4311                 :         /*
    4312                 :          * Copy the replace_text, processing escapes if any are present.
    4313                 :          */
    4314            2635 :         if (escape_status > 0)
    4315             106 :             appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
    4316 ECB             :                                          start_ptr, data_pos);
    4317                 :         else
    4318 CBC        2529 :             appendStringInfoText(&buf, replace_text);
    4319                 : 
    4320                 :         /* Advance start_ptr and data_pos over the matched text. */
    4321 GIC        5270 :         start_ptr += charlen_to_bytelen(start_ptr,
    4322            2635 :                                         pmatch[0].rm_eo - data_pos);
    4323            2635 :         data_pos = pmatch[0].rm_eo;
    4324                 : 
    4325                 :         /*
    4326                 :          * If we only want to replace one occurrence, we're done.
    4327                 :          */
    4328            2635 :         if (n > 0)
    4329             677 :             break;
    4330                 : 
    4331                 :         /*
    4332 ECB             :          * Advance search position.  Normally we start the next search at the
    4333                 :          * end of the previous match; but if the match was of zero length, we
    4334                 :          * have to advance by one character, or we'd just find the same match
    4335                 :          * again.
    4336                 :          */
    4337 CBC        1958 :         search_start = data_pos;
    4338 GIC        1958 :         if (pmatch[0].rm_so == pmatch[0].rm_eo)
    4339               6 :             search_start++;
    4340                 :     }
    4341                 : 
    4342                 :     /*
    4343                 :      * Copy the text to the right of the last match.
    4344 ECB             :      */
    4345 CBC        5263 :     if (data_pos < data_len)
    4346                 :     {
    4347 ECB             :         int         chunk_len;
    4348                 : 
    4349 GIC        5032 :         chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
    4350 CBC        5032 :         appendBinaryStringInfo(&buf, start_ptr, chunk_len);
    4351 ECB             :     }
    4352                 : 
    4353 CBC        5263 :     ret_text = cstring_to_text_with_len(buf.data, buf.len);
    4354 GIC        5263 :     pfree(buf.data);
    4355            5263 :     pfree(data);
    4356 ECB             : 
    4357 CBC        5263 :     return ret_text;
    4358                 : }
    4359 ECB             : 
    4360                 : /*
    4361                 :  * split_part
    4362                 :  * parse input string based on provided field separator
    4363                 :  * return N'th item (1 based, negative counts from end)
    4364                 :  */
    4365                 : Datum
    4366 GIC          51 : split_part(PG_FUNCTION_ARGS)
    4367                 : {
    4368              51 :     text       *inputstring = PG_GETARG_TEXT_PP(0);
    4369 CBC          51 :     text       *fldsep = PG_GETARG_TEXT_PP(1);
    4370              51 :     int         fldnum = PG_GETARG_INT32(2);
    4371                 :     int         inputstring_len;
    4372                 :     int         fldsep_len;
    4373 ECB             :     TextPositionState state;
    4374                 :     char       *start_ptr;
    4375                 :     char       *end_ptr;
    4376                 :     text       *result_text;
    4377                 :     bool        found;
    4378                 : 
    4379                 :     /* field number is 1 based */
    4380 GIC          51 :     if (fldnum == 0)
    4381 CBC           3 :         ereport(ERROR,
    4382                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    4383                 :                  errmsg("field position must not be zero")));
    4384 ECB             : 
    4385 GIC          48 :     inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    4386 CBC          48 :     fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
    4387                 : 
    4388                 :     /* return empty string for empty input string */
    4389              48 :     if (inputstring_len < 1)
    4390 GIC           6 :         PG_RETURN_TEXT_P(cstring_to_text(""));
    4391                 : 
    4392                 :     /* handle empty field separator */
    4393              42 :     if (fldsep_len < 1)
    4394 ECB             :     {
    4395                 :         /* if first or last field, return input string, else empty string */
    4396 CBC          12 :         if (fldnum == 1 || fldnum == -1)
    4397               6 :             PG_RETURN_TEXT_P(inputstring);
    4398                 :         else
    4399 GIC           6 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    4400 ECB             :     }
    4401                 : 
    4402                 :     /* find the first field separator */
    4403 GIC          30 :     text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
    4404                 : 
    4405              30 :     found = text_position_next(&state);
    4406 ECB             : 
    4407                 :     /* special case if fldsep not found at all */
    4408 GIC          30 :     if (!found)
    4409                 :     {
    4410               6 :         text_position_cleanup(&state);
    4411 ECB             :         /* if first or last field, return input string, else empty string */
    4412 GIC           6 :         if (fldnum == 1 || fldnum == -1)
    4413               3 :             PG_RETURN_TEXT_P(inputstring);
    4414 ECB             :         else
    4415 GIC           3 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    4416                 :     }
    4417 ECB             : 
    4418                 :     /*
    4419                 :      * take care of a negative field number (i.e. count from the right) by
    4420                 :      * converting to a positive field number; we need total number of fields
    4421                 :      */
    4422 CBC          24 :     if (fldnum < 0)
    4423                 :     {
    4424                 :         /* we found a fldsep, so there are at least two fields */
    4425              12 :         int         numfields = 2;
    4426                 : 
    4427 GIC          18 :         while (text_position_next(&state))
    4428               6 :             numfields++;
    4429                 : 
    4430                 :         /* special case of last field does not require an extra pass */
    4431              12 :         if (fldnum == -1)
    4432                 :         {
    4433               3 :             start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
    4434 CBC           3 :             end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
    4435 GIC           3 :             text_position_cleanup(&state);
    4436 CBC           3 :             PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
    4437                 :                                                       end_ptr - start_ptr));
    4438 ECB             :         }
    4439                 : 
    4440                 :         /* else, convert fldnum to positive notation */
    4441 GIC           9 :         fldnum += numfields + 1;
    4442 ECB             : 
    4443                 :         /* if nonexistent field, return empty string */
    4444 GIC           9 :         if (fldnum <= 0)
    4445 ECB             :         {
    4446 GIC           3 :             text_position_cleanup(&state);
    4447               3 :             PG_RETURN_TEXT_P(cstring_to_text(""));
    4448 ECB             :         }
    4449                 : 
    4450                 :         /* reset to pointing at first match, but now with positive fldnum */
    4451 CBC           6 :         text_position_reset(&state);
    4452 GIC           6 :         found = text_position_next(&state);
    4453 CBC           6 :         Assert(found);
    4454 ECB             :     }
    4455                 : 
    4456                 :     /* identify bounds of first field */
    4457 GIC          18 :     start_ptr = VARDATA_ANY(inputstring);
    4458 CBC          18 :     end_ptr = text_position_get_match_ptr(&state);
    4459                 : 
    4460 GIC          33 :     while (found && --fldnum > 0)
    4461                 :     {
    4462                 :         /* identify bounds of next field */
    4463              15 :         start_ptr = end_ptr + fldsep_len;
    4464              15 :         found = text_position_next(&state);
    4465              15 :         if (found)
    4466               9 :             end_ptr = text_position_get_match_ptr(&state);
    4467                 :     }
    4468 ECB             : 
    4469 GIC          18 :     text_position_cleanup(&state);
    4470                 : 
    4471              18 :     if (fldnum > 0)
    4472                 :     {
    4473 ECB             :         /* N'th field separator not found */
    4474                 :         /* if last field requested, return it, else empty string */
    4475 CBC           6 :         if (fldnum == 1)
    4476 ECB             :         {
    4477 GIC           3 :             int         last_len = start_ptr - VARDATA_ANY(inputstring);
    4478 ECB             : 
    4479 GIC           3 :             result_text = cstring_to_text_with_len(start_ptr,
    4480                 :                                                    inputstring_len - last_len);
    4481                 :         }
    4482                 :         else
    4483 CBC           3 :             result_text = cstring_to_text("");
    4484 ECB             :     }
    4485                 :     else
    4486                 :     {
    4487                 :         /* non-last field requested */
    4488 GIC          12 :         result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
    4489                 :     }
    4490                 : 
    4491              18 :     PG_RETURN_TEXT_P(result_text);
    4492                 : }
    4493 ECB             : 
    4494                 : /*
    4495                 :  * Convenience function to return true when two text params are equal.
    4496                 :  */
    4497                 : static bool
    4498 GIC         174 : text_isequal(text *txt1, text *txt2, Oid collid)
    4499 ECB             : {
    4500 GIC         174 :     return DatumGetBool(DirectFunctionCall2Coll(texteq,
    4501                 :                                                 collid,
    4502                 :                                                 PointerGetDatum(txt1),
    4503                 :                                                 PointerGetDatum(txt2)));
    4504                 : }
    4505                 : 
    4506                 : /*
    4507 ECB             :  * text_to_array
    4508                 :  * parse input string and return text array of elements,
    4509                 :  * based on provided field separator
    4510                 :  */
    4511                 : Datum
    4512 CBC          73 : text_to_array(PG_FUNCTION_ARGS)
    4513                 : {
    4514                 :     SplitTextOutputData tstate;
    4515                 : 
    4516                 :     /* For array output, tstate should start as all zeroes */
    4517 GIC          73 :     memset(&tstate, 0, sizeof(tstate));
    4518                 : 
    4519              73 :     if (!split_text(fcinfo, &tstate))
    4520               3 :         PG_RETURN_NULL();
    4521                 : 
    4522              64 :     if (tstate.astate == NULL)
    4523 CBC           3 :         PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
    4524                 : 
    4525 GNC          61 :     PG_RETURN_DATUM(makeArrayResult(tstate.astate,
    4526                 :                                           CurrentMemoryContext));
    4527                 : }
    4528                 : 
    4529                 : /*
    4530 ECB             :  * text_to_array_null
    4531 EUB             :  * parse input string and return text array of elements,
    4532                 :  * based on provided field separator and null string
    4533 ECB             :  *
    4534                 :  * This is a separate entry point only to prevent the regression tests from
    4535                 :  * complaining about different argument sets for the same internal function.
    4536                 :  */
    4537                 : Datum
    4538 CBC          30 : text_to_array_null(PG_FUNCTION_ARGS)
    4539                 : {
    4540              30 :     return text_to_array(fcinfo);
    4541                 : }
    4542 ECB             : 
    4543                 : /*
    4544                 :  * text_to_table
    4545                 :  * parse input string and return table of elements,
    4546                 :  * based on provided field separator
    4547                 :  */
    4548                 : Datum
    4549 CBC          42 : text_to_table(PG_FUNCTION_ARGS)
    4550                 : {
    4551 GIC          42 :     ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
    4552                 :     SplitTextOutputData tstate;
    4553                 : 
    4554              42 :     tstate.astate = NULL;
    4555              42 :     InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
    4556              42 :     tstate.tupstore = rsi->setResult;
    4557              42 :     tstate.tupdesc = rsi->setDesc;
    4558                 : 
    4559              42 :     (void) split_text(fcinfo, &tstate);
    4560                 : 
    4561 CBC          42 :     return (Datum) 0;
    4562                 : }
    4563                 : 
    4564                 : /*
    4565                 :  * text_to_table_null
    4566                 :  * parse input string and return table of elements,
    4567                 :  * based on provided field separator and null string
    4568 ECB             :  *
    4569                 :  * This is a separate entry point only to prevent the regression tests from
    4570                 :  * complaining about different argument sets for the same internal function.
    4571                 :  */
    4572                 : Datum
    4573 CBC          12 : text_to_table_null(PG_FUNCTION_ARGS)
    4574 ECB             : {
    4575 GIC          12 :     return text_to_table(fcinfo);
    4576 ECB             : }
    4577                 : 
    4578                 : /*
    4579                 :  * Common code for text_to_array, text_to_array_null, text_to_table
    4580                 :  * and text_to_table_null functions.
    4581                 :  *
    4582                 :  * These are not strict so we have to test for null inputs explicitly.
    4583                 :  * Returns false if result is to be null, else returns true.
    4584                 :  *
    4585                 :  * Note that if the result is valid but empty (zero elements), we return
    4586                 :  * without changing *tstate --- caller must handle that case, too.
    4587                 :  */
    4588                 : static bool
    4589 CBC         115 : split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
    4590 ECB             : {
    4591                 :     text       *inputstring;
    4592                 :     text       *fldsep;
    4593                 :     text       *null_string;
    4594 GIC         115 :     Oid         collation = PG_GET_COLLATION();
    4595                 :     int         inputstring_len;
    4596                 :     int         fldsep_len;
    4597                 :     char       *start_ptr;
    4598 ECB             :     text       *result_text;
    4599                 : 
    4600                 :     /* when input string is NULL, then result is NULL too */
    4601 GIC         115 :     if (PG_ARGISNULL(0))
    4602 CBC           6 :         return false;
    4603 ECB             : 
    4604 CBC         109 :     inputstring = PG_GETARG_TEXT_PP(0);
    4605                 : 
    4606 ECB             :     /* fldsep can be NULL */
    4607 CBC         109 :     if (!PG_ARGISNULL(1))
    4608              94 :         fldsep = PG_GETARG_TEXT_PP(1);
    4609                 :     else
    4610              15 :         fldsep = NULL;
    4611 ECB             : 
    4612                 :     /* null_string can be NULL or omitted */
    4613 GIC         109 :     if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
    4614 CBC          42 :         null_string = PG_GETARG_TEXT_PP(2);
    4615                 :     else
    4616 GIC          67 :         null_string = NULL;
    4617                 : 
    4618             109 :     if (fldsep != NULL)
    4619                 :     {
    4620 ECB             :         /*
    4621                 :          * Normal case with non-null fldsep.  Use the text_position machinery
    4622                 :          * to search for occurrences of fldsep.
    4623                 :          */
    4624                 :         TextPositionState state;
    4625                 : 
    4626 CBC          94 :         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    4627 GIC          94 :         fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
    4628 EUB             : 
    4629 ECB             :         /* return empty set for empty input string */
    4630 GIC          94 :         if (inputstring_len < 1)
    4631              30 :             return true;
    4632                 : 
    4633                 :         /* empty field separator: return input string as a one-element set */
    4634 CBC          88 :         if (fldsep_len < 1)
    4635                 :         {
    4636              24 :             split_text_accum_result(tstate, inputstring,
    4637                 :                                     null_string, collation);
    4638              24 :             return true;
    4639 ECB             :         }
    4640                 : 
    4641 CBC          64 :         text_position_setup(inputstring, fldsep, collation, &state);
    4642 ECB             : 
    4643 GIC          58 :         start_ptr = VARDATA_ANY(inputstring);
    4644 ECB             : 
    4645                 :         for (;;)
    4646 GIC         232 :         {
    4647                 :             bool        found;
    4648                 :             char       *end_ptr;
    4649 ECB             :             int         chunk_len;
    4650                 : 
    4651 CBC         290 :             CHECK_FOR_INTERRUPTS();
    4652 ECB             : 
    4653 GIC         290 :             found = text_position_next(&state);
    4654 GBC         290 :             if (!found)
    4655 EUB             :             {
    4656                 :                 /* fetch last field */
    4657 GIC          58 :                 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
    4658              58 :                 end_ptr = NULL; /* not used, but some compilers complain */
    4659                 :             }
    4660 ECB             :             else
    4661                 :             {
    4662                 :                 /* fetch non-last field */
    4663 CBC         232 :                 end_ptr = text_position_get_match_ptr(&state);
    4664 GIC         232 :                 chunk_len = end_ptr - start_ptr;
    4665                 :             }
    4666                 : 
    4667                 :             /* build a temp text datum to pass to split_text_accum_result */
    4668             290 :             result_text = cstring_to_text_with_len(start_ptr, chunk_len);
    4669                 : 
    4670                 :             /* stash away this field */
    4671             290 :             split_text_accum_result(tstate, result_text,
    4672 ECB             :                                     null_string, collation);
    4673                 : 
    4674 CBC         290 :             pfree(result_text);
    4675                 : 
    4676             290 :             if (!found)
    4677 GIC          58 :                 break;
    4678                 : 
    4679 CBC         232 :             start_ptr = end_ptr + fldsep_len;
    4680 ECB             :         }
    4681                 : 
    4682 GIC          58 :         text_position_cleanup(&state);
    4683                 :     }
    4684 ECB             :     else
    4685                 :     {
    4686                 :         /*
    4687                 :          * When fldsep is NULL, each character in the input string becomes a
    4688                 :          * separate element in the result set.  The separator is effectively
    4689                 :          * the space between characters.
    4690                 :          */
    4691 GIC          15 :         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
    4692                 : 
    4693              15 :         start_ptr = VARDATA_ANY(inputstring);
    4694                 : 
    4695             126 :         while (inputstring_len > 0)
    4696 ECB             :         {
    4697 GIC         111 :             int         chunk_len = pg_mblen(start_ptr);
    4698 ECB             : 
    4699 GIC         111 :             CHECK_FOR_INTERRUPTS();
    4700 ECB             : 
    4701                 :             /* build a temp text datum to pass to split_text_accum_result */
    4702 GIC         111 :             result_text = cstring_to_text_with_len(start_ptr, chunk_len);
    4703 ECB             : 
    4704                 :             /* stash away this field */
    4705 GIC         111 :             split_text_accum_result(tstate, result_text,
    4706                 :                                     null_string, collation);
    4707                 : 
    4708 CBC         111 :             pfree(result_text);
    4709 ECB             : 
    4710 CBC         111 :             start_ptr += chunk_len;
    4711 GIC         111 :             inputstring_len -= chunk_len;
    4712 ECB             :         }
    4713                 :     }
    4714                 : 
    4715 GIC          73 :     return true;
    4716                 : }
    4717                 : 
    4718                 : /*
    4719                 :  * Add text item to result set (table or array).
    4720                 :  *
    4721 ECB             :  * This is also responsible for checking to see if the item matches
    4722                 :  * the null_string, in which case we should emit NULL instead.
    4723                 :  */
    4724                 : static void
    4725 GIC         425 : split_text_accum_result(SplitTextOutputData *tstate,
    4726                 :                         text *field_value,
    4727                 :                         text *null_string,
    4728 ECB             :                         Oid collation)
    4729                 : {
    4730 GIC         425 :     bool        is_null = false;
    4731 ECB             : 
    4732 GIC         425 :     if (null_string && text_isequal(field_value, null_string, collation))
    4733 CBC          30 :         is_null = true;
    4734 ECB             : 
    4735 GBC         425 :     if (tstate->tupstore)
    4736                 :     {
    4737 ECB             :         Datum       values[1];
    4738                 :         bool        nulls[1];
    4739                 : 
    4740 GIC         114 :         values[0] = PointerGetDatum(field_value);
    4741             114 :         nulls[0] = is_null;
    4742 EUB             : 
    4743 GIC         114 :         tuplestore_putvalues(tstate->tupstore,
    4744 ECB             :                              tstate->tupdesc,
    4745                 :                              values,
    4746                 :                              nulls);
    4747                 :     }
    4748                 :     else
    4749 EUB             :     {
    4750 GIC         311 :         tstate->astate = accumArrayResult(tstate->astate,
    4751                 :                                           PointerGetDatum(field_value),
    4752 EUB             :                                           is_null,
    4753                 :                                           TEXTOID,
    4754                 :                                           CurrentMemoryContext);
    4755                 :     }
    4756 GIC         425 : }
    4757 EUB             : 
    4758                 : /*
    4759                 :  * array_to_text
    4760 ECB             :  * concatenate Cstring representation of input array elements
    4761                 :  * using provided field separator
    4762                 :  */
    4763                 : Datum
    4764 GIC       30154 : array_to_text(PG_FUNCTION_ARGS)
    4765                 : {
    4766           30154 :     ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
    4767           30154 :     char       *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
    4768 ECB             : 
    4769 GIC       30154 :     PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
    4770                 : }
    4771                 : 
    4772                 : /*
    4773                 :  * array_to_text_null
    4774                 :  * concatenate Cstring representation of input array elements
    4775 ECB             :  * using provided field separator and null string
    4776                 :  *
    4777                 :  * This version is not strict so we have to test for null inputs explicitly.
    4778                 :  */
    4779                 : Datum
    4780 CBC           6 : array_to_text_null(PG_FUNCTION_ARGS)
    4781 ECB             : {
    4782 EUB             :     ArrayType  *v;
    4783                 :     char       *fldsep;
    4784 ECB             :     char       *null_string;
    4785                 : 
    4786                 :     /* returns NULL when first or second parameter is NULL */
    4787 GIC           6 :     if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
    4788 UIC           0 :         PG_RETURN_NULL();
    4789 ECB             : 
    4790 GIC           6 :     v = PG_GETARG_ARRAYTYPE_P(0);
    4791 CBC           6 :     fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
    4792 EUB             : 
    4793                 :     /* NULL null string is passed through as a null pointer */
    4794 GIC           6 :     if (!PG_ARGISNULL(2))
    4795 CBC           3 :         null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
    4796 ECB             :     else
    4797 CBC           3 :         null_string = NULL;
    4798 ECB             : 
    4799 GIC           6 :     PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
    4800                 : }
    4801 ECB             : 
    4802                 : /*
    4803                 :  * common code for array_to_text and array_to_text_null functions
    4804                 :  */
    4805                 : static text *
    4806 CBC       30169 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
    4807 ECB             :                        const char *fldsep, const char *null_string)
    4808                 : {
    4809 EUB             :     text       *result;
    4810                 :     int         nitems,
    4811                 :                *dims,
    4812                 :                 ndims;
    4813 ECB             :     Oid         element_type;
    4814                 :     int         typlen;
    4815                 :     bool        typbyval;
    4816                 :     char        typalign;
    4817                 :     StringInfoData buf;
    4818 GIC       30169 :     bool        printed = false;
    4819                 :     char       *p;
    4820                 :     bits8      *bitmap;
    4821                 :     int         bitmask;
    4822                 :     int         i;
    4823                 :     ArrayMetaState *my_extra;
    4824                 : 
    4825           30169 :     ndims = ARR_NDIM(v);
    4826           30169 :     dims = ARR_DIMS(v);
    4827           30169 :     nitems = ArrayGetNItems(ndims, dims);
    4828 ECB             : 
    4829                 :     /* if there are no elements, return an empty string */
    4830 GIC       30169 :     if (nitems == 0)
    4831           18246 :         return cstring_to_text_with_len("", 0);
    4832                 : 
    4833           11923 :     element_type = ARR_ELEMTYPE(v);
    4834 CBC       11923 :     initStringInfo(&buf);
    4835                 : 
    4836                 :     /*
    4837 EUB             :      * We arrange to look up info about element type, including its output
    4838                 :      * conversion proc, only once per series of calls, assuming the element
    4839                 :      * type doesn't change underneath us.
    4840                 :      */
    4841 GIC       11923 :     my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
    4842           11923 :     if (my_extra == NULL)
    4843                 :     {
    4844 CBC         695 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4845 ECB             :                                                       sizeof(ArrayMetaState));
    4846 CBC         695 :         my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
    4847 GIC         695 :         my_extra->element_type = ~element_type;
    4848 ECB             :     }
    4849                 : 
    4850 GIC       11923 :     if (my_extra->element_type != element_type)
    4851                 :     {
    4852 ECB             :         /*
    4853                 :          * Get info about element type, including its output conversion proc
    4854                 :          */
    4855 GIC         695 :         get_type_io_data(element_type, IOFunc_output,
    4856 ECB             :                          &my_extra->typlen, &my_extra->typbyval,
    4857                 :                          &my_extra->typalign, &my_extra->typdelim,
    4858                 :                          &my_extra->typioparam, &my_extra->typiofunc);
    4859 CBC         695 :         fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
    4860 GIC         695 :                       fcinfo->flinfo->fn_mcxt);
    4861 CBC         695 :         my_extra->element_type = element_type;
    4862 ECB             :     }
    4863 GIC       11923 :     typlen = my_extra->typlen;
    4864           11923 :     typbyval = my_extra->typbyval;
    4865           11923 :     typalign = my_extra->typalign;
    4866                 : 
    4867           11923 :     p = ARR_DATA_PTR(v);
    4868           11923 :     bitmap = ARR_NULLBITMAP(v);
    4869           11923 :     bitmask = 1;
    4870                 : 
    4871           40628 :     for (i = 0; i < nitems; i++)
    4872                 :     {
    4873                 :         Datum       itemvalue;
    4874                 :         char       *value;
    4875 ECB             : 
    4876                 :         /* Get source element, checking for NULL */
    4877 CBC       28705 :         if (bitmap && (*bitmap & bitmask) == 0)
    4878 ECB             :         {
    4879                 :             /* if null_string is NULL, we just ignore null elements */
    4880 GIC           9 :             if (null_string != NULL)
    4881 ECB             :             {
    4882 GIC           3 :                 if (printed)
    4883 CBC           3 :                     appendStringInfo(&buf, "%s%s", fldsep, null_string);
    4884                 :                 else
    4885 LBC           0 :                     appendStringInfoString(&buf, null_string);
    4886 CBC           3 :                 printed = true;
    4887 ECB             :             }
    4888                 :         }
    4889                 :         else
    4890                 :         {
    4891 GIC       28696 :             itemvalue = fetch_att(p, typbyval, typlen);
    4892                 : 
    4893           28696 :             value = OutputFunctionCall(&my_extra->proc, itemvalue);
    4894                 : 
    4895           28696 :             if (printed)
    4896           16773 :                 appendStringInfo(&buf, "%s%s", fldsep, value);
    4897 ECB             :             else
    4898 CBC       11923 :                 appendStringInfoString(&buf, value);
    4899           28696 :             printed = true;
    4900                 : 
    4901 GIC       28696 :             p = att_addlength_pointer(p, typlen, p);
    4902           28696 :             p = (char *) att_align_nominal(p, typalign);
    4903                 :         }
    4904                 : 
    4905                 :         /* advance bitmap pointer if any */
    4906           28705 :         if (bitmap)
    4907 ECB             :         {
    4908 GIC          54 :             bitmask <<= 1;
    4909              54 :             if (bitmask == 0x100)
    4910                 :             {
    4911 UIC           0 :                 bitmap++;
    4912               0 :                 bitmask = 1;
    4913 ECB             :             }
    4914 EUB             :         }
    4915                 :     }
    4916 ECB             : 
    4917 CBC       11923 :     result = cstring_to_text_with_len(buf.data, buf.len);
    4918 GIC       11923 :     pfree(buf.data);
    4919 ECB             : 
    4920 GIC       11923 :     return result;
    4921                 : }
    4922                 : 
    4923                 : #define HEXBASE 16
    4924                 : /*
    4925 EUB             :  * Convert an int32 to a string containing a base 16 (hex) representation of
    4926                 :  * the number.
    4927                 :  */
    4928                 : Datum
    4929 GIC       19342 : to_hex32(PG_FUNCTION_ARGS)
    4930 ECB             : {
    4931 GIC       19342 :     uint32      value = (uint32) PG_GETARG_INT32(0);
    4932                 :     char       *ptr;
    4933           19342 :     const char *digits = "0123456789abcdef";
    4934                 :     char        buf[32];        /* bigger than needed, but reasonable */
    4935 ECB             : 
    4936 CBC       19342 :     ptr = buf + sizeof(buf) - 1;
    4937           19342 :     *ptr = '\0';
    4938 ECB             : 
    4939                 :     do
    4940                 :     {
    4941 CBC       37279 :         *--ptr = digits[value % HEXBASE];
    4942 GIC       37279 :         value /= HEXBASE;
    4943           37279 :     } while (ptr > buf && value);
    4944 ECB             : 
    4945 GIC       19342 :     PG_RETURN_TEXT_P(cstring_to_text(ptr));
    4946                 : }
    4947 ECB             : 
    4948                 : /*
    4949                 :  * Convert an int64 to a string containing a base 16 (hex) representation of
    4950                 :  * the number.
    4951                 :  */
    4952                 : Datum
    4953 GIC           3 : to_hex64(PG_FUNCTION_ARGS)
    4954                 : {
    4955               3 :     uint64      value = (uint64) PG_GETARG_INT64(0);
    4956                 :     char       *ptr;
    4957 CBC           3 :     const char *digits = "0123456789abcdef";
    4958                 :     char        buf[32];        /* bigger than needed, but reasonable */
    4959                 : 
    4960 GIC           3 :     ptr = buf + sizeof(buf) - 1;
    4961               3 :     *ptr = '\0';
    4962                 : 
    4963                 :     do
    4964 ECB             :     {
    4965 GIC          24 :         *--ptr = digits[value % HEXBASE];
    4966 CBC          24 :         value /= HEXBASE;
    4967 GIC          24 :     } while (ptr > buf && value);
    4968 ECB             : 
    4969 GIC           3 :     PG_RETURN_TEXT_P(cstring_to_text(ptr));
    4970                 : }
    4971 ECB             : 
    4972                 : /*
    4973                 :  * Return the size of a datum, possibly compressed
    4974                 :  *
    4975                 :  * Works on any data type
    4976                 :  */
    4977                 : Datum
    4978 CBC          61 : pg_column_size(PG_FUNCTION_ARGS)
    4979                 : {
    4980 GIC          61 :     Datum       value = PG_GETARG_DATUM(0);
    4981                 :     int32       result;
    4982                 :     int         typlen;
    4983                 : 
    4984                 :     /* On first call, get the input type's typlen, and save at *fn_extra */
    4985              61 :     if (fcinfo->flinfo->fn_extra == NULL)
    4986                 :     {
    4987                 :         /* Lookup the datatype of the supplied argument */
    4988 CBC          61 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    4989                 : 
    4990 GIC          61 :         typlen = get_typlen(argtypeid);
    4991              61 :         if (typlen == 0)        /* should not happen */
    4992 UIC           0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    4993                 : 
    4994 GIC          61 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    4995                 :                                                       sizeof(int));
    4996              61 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    4997 ECB             :     }
    4998                 :     else
    4999 LBC           0 :         typlen = *((int *) fcinfo->flinfo->fn_extra);
    5000                 : 
    5001 GIC          61 :     if (typlen == -1)
    5002                 :     {
    5003                 :         /* varlena type, possibly toasted */
    5004              61 :         result = toast_datum_size(value);
    5005 ECB             :     }
    5006 LBC           0 :     else if (typlen == -2)
    5007 ECB             :     {
    5008                 :         /* cstring */
    5009 LBC           0 :         result = strlen(DatumGetCString(value)) + 1;
    5010                 :     }
    5011                 :     else
    5012 ECB             :     {
    5013                 :         /* ordinary fixed-width type */
    5014 UIC           0 :         result = typlen;
    5015 ECB             :     }
    5016                 : 
    5017 CBC          61 :     PG_RETURN_INT32(result);
    5018                 : }
    5019 ECB             : 
    5020                 : /*
    5021                 :  * Return the compression method stored in the compressed attribute.  Return
    5022                 :  * NULL for non varlena type or uncompressed data.
    5023                 :  */
    5024                 : Datum
    5025 GIC          81 : pg_column_compression(PG_FUNCTION_ARGS)
    5026 ECB             : {
    5027                 :     int         typlen;
    5028                 :     char       *result;
    5029                 :     ToastCompressionId cmid;
    5030                 : 
    5031                 :     /* On first call, get the input type's typlen, and save at *fn_extra */
    5032 GIC          81 :     if (fcinfo->flinfo->fn_extra == NULL)
    5033 ECB             :     {
    5034                 :         /* Lookup the datatype of the supplied argument */
    5035 CBC          54 :         Oid         argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
    5036                 : 
    5037 GIC          54 :         typlen = get_typlen(argtypeid);
    5038 CBC          54 :         if (typlen == 0)        /* should not happen */
    5039 UIC           0 :             elog(ERROR, "cache lookup failed for type %u", argtypeid);
    5040                 : 
    5041 GIC          54 :         fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    5042 ECB             :                                                       sizeof(int));
    5043 GIC          54 :         *((int *) fcinfo->flinfo->fn_extra) = typlen;
    5044                 :     }
    5045                 :     else
    5046              27 :         typlen = *((int *) fcinfo->flinfo->fn_extra);
    5047                 : 
    5048              81 :     if (typlen != -1)
    5049 UIC           0 :         PG_RETURN_NULL();
    5050                 : 
    5051                 :     /* get the compression method id stored in the compressed varlena */
    5052 CBC          81 :     cmid = toast_get_compression_id((struct varlena *)
    5053 GIC          81 :                                     DatumGetPointer(PG_GETARG_DATUM(0)));
    5054              81 :     if (cmid == TOAST_INVALID_COMPRESSION_ID)
    5055               3 :         PG_RETURN_NULL();
    5056                 : 
    5057                 :     /* convert compression method id to compression method name */
    5058 CBC          78 :     switch (cmid)
    5059 ECB             :     {
    5060 GIC          33 :         case TOAST_PGLZ_COMPRESSION_ID:
    5061 CBC          33 :             result = "pglz";
    5062 GIC          33 :             break;
    5063              45 :         case TOAST_LZ4_COMPRESSION_ID:
    5064              45 :             result = "lz4";
    5065              45 :             break;
    5066 UIC           0 :         default:
    5067 LBC           0 :             elog(ERROR, "invalid compression method id %d", cmid);
    5068 ECB             :     }
    5069 EUB             : 
    5070 GIC          78 :     PG_RETURN_TEXT_P(cstring_to_text(result));
    5071 ECB             : }
    5072                 : 
    5073                 : /*
    5074                 :  * string_agg - Concatenates values and returns string.
    5075                 :  *
    5076                 :  * Syntax: string_agg(value text, delimiter text) RETURNS text
    5077                 :  *
    5078                 :  * Note: Any NULL values are ignored. The first-call delimiter isn't
    5079                 :  * actually used at all, and on subsequent calls the delimiter precedes
    5080                 :  * the associated value.
    5081                 :  */
    5082                 : 
    5083                 : /* subroutine to initialize state */
    5084                 : static StringInfo
    5085 GIC        1026 : makeStringAggState(FunctionCallInfo fcinfo)
    5086                 : {
    5087                 :     StringInfo  state;
    5088                 :     MemoryContext aggcontext;
    5089                 :     MemoryContext oldcontext;
    5090 ECB             : 
    5091 GIC        1026 :     if (!AggCheckCallContext(fcinfo, &aggcontext))
    5092                 :     {
    5093                 :         /* cannot be called directly because of internal-type argument */
    5094 UIC           0 :         elog(ERROR, "string_agg_transfn called in non-aggregate context");
    5095                 :     }
    5096 ECB             : 
    5097                 :     /*
    5098                 :      * Create state in aggregate context.  It'll stay there across subsequent
    5099                 :      * calls.
    5100                 :      */
    5101 GIC        1026 :     oldcontext = MemoryContextSwitchTo(aggcontext);
    5102            1026 :     state = makeStringInfo();
    5103            1026 :     MemoryContextSwitchTo(oldcontext);
    5104 ECB             : 
    5105 GIC        1026 :     return state;
    5106                 : }
    5107                 : 
    5108                 : Datum
    5109 CBC      420976 : string_agg_transfn(PG_FUNCTION_ARGS)
    5110                 : {
    5111                 :     StringInfo  state;
    5112 ECB             : 
    5113 CBC      420976 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    5114                 : 
    5115                 :     /* Append the value unless null, preceding it with the delimiter. */
    5116 GIC      420976 :     if (!PG_ARGISNULL(1))
    5117                 :     {
    5118 GNC      413452 :         text       *value = PG_GETARG_TEXT_PP(1);
    5119          413452 :         bool        isfirst = false;
    5120                 : 
    5121                 :         /*
    5122                 :          * You might think we can just throw away the first delimiter, however
    5123                 :          * we must keep it as we may be a parallel worker doing partial
    5124                 :          * aggregation building a state to send to the main process.  We need
    5125                 :          * to keep the delimiter of every aggregation so that the combine
    5126                 :          * function can properly join up the strings of two separately
    5127                 :          * partially aggregated results.  The first delimiter is only stripped
    5128                 :          * off in the final function.  To know how much to strip off the front
    5129                 :          * of the string, we store the length of the first delimiter in the
    5130                 :          * StringInfo's cursor field, which we don't otherwise need here.
    5131                 :          */
    5132          413452 :         if (state == NULL)
    5133                 :         {
    5134             773 :             state = makeStringAggState(fcinfo);
    5135             773 :             isfirst = true;
    5136                 :         }
    5137                 : 
    5138          413452 :         if (!PG_ARGISNULL(2))
    5139                 :         {
    5140          413452 :             text       *delim = PG_GETARG_TEXT_PP(2);
    5141                 : 
    5142          413452 :             appendStringInfoText(state, delim);
    5143          413452 :             if (isfirst)
    5144             773 :                 state->cursor = VARSIZE_ANY_EXHDR(delim);
    5145                 :         }
    5146                 : 
    5147          413452 :         appendStringInfoText(state, value);
    5148 ECB             :     }
    5149                 : 
    5150                 :     /*
    5151                 :      * The transition type for string_agg() is declared to be "internal",
    5152                 :      * which is a pass-by-value type the same size as a pointer.
    5153                 :      */
    5154 GNC      420976 :     if (state)
    5155          420931 :         PG_RETURN_POINTER(state);
    5156              45 :     PG_RETURN_NULL();
    5157                 : }
    5158                 : 
    5159                 : /*
    5160                 :  * string_agg_combine
    5161                 :  *      Aggregate combine function for string_agg(text) and string_agg(bytea)
    5162                 :  */
    5163                 : Datum
    5164             100 : string_agg_combine(PG_FUNCTION_ARGS)
    5165                 : {
    5166                 :     StringInfo  state1;
    5167                 :     StringInfo  state2;
    5168                 :     MemoryContext agg_context;
    5169                 : 
    5170             100 :     if (!AggCheckCallContext(fcinfo, &agg_context))
    5171 UNC           0 :         elog(ERROR, "aggregate function called in non-aggregate context");
    5172                 : 
    5173 GNC         100 :     state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    5174             100 :     state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
    5175                 : 
    5176             100 :     if (state2 == NULL)
    5177                 :     {
    5178                 :         /*
    5179                 :          * NULL state2 is easy, just return state1, which we know is already
    5180                 :          * in the agg_context
    5181                 :          */
    5182 UNC           0 :         if (state1 == NULL)
    5183               0 :             PG_RETURN_NULL();
    5184               0 :         PG_RETURN_POINTER(state1);
    5185                 :     }
    5186                 : 
    5187 GNC         100 :     if (state1 == NULL)
    5188                 :     {
    5189                 :         /* We must copy state2's data into the agg_context */
    5190                 :         MemoryContext old_context;
    5191                 : 
    5192              60 :         old_context = MemoryContextSwitchTo(agg_context);
    5193              60 :         state1 = makeStringAggState(fcinfo);
    5194              60 :         appendBinaryStringInfo(state1, state2->data, state2->len);
    5195              60 :         state1->cursor = state2->cursor;
    5196              60 :         MemoryContextSwitchTo(old_context);
    5197                 :     }
    5198              40 :     else if (state2->len > 0)
    5199                 :     {
    5200                 :         /* Combine ... state1->cursor does not change in this case */
    5201              40 :         appendBinaryStringInfo(state1, state2->data, state2->len);
    5202                 :     }
    5203                 : 
    5204             100 :     PG_RETURN_POINTER(state1);
    5205                 : }
    5206                 : 
    5207                 : /*
    5208                 :  * string_agg_serialize
    5209                 :  *      Aggregate serialize function for string_agg(text) and string_agg(bytea)
    5210                 :  *
    5211                 :  * This is strict, so we need not handle NULL input
    5212                 :  */
    5213                 : Datum
    5214             100 : string_agg_serialize(PG_FUNCTION_ARGS)
    5215                 : {
    5216                 :     StringInfo  state;
    5217                 :     StringInfoData buf;
    5218                 :     bytea      *result;
    5219                 : 
    5220                 :     /* cannot be called directly because of internal-type argument */
    5221             100 :     Assert(AggCheckCallContext(fcinfo, NULL));
    5222                 : 
    5223             100 :     state = (StringInfo) PG_GETARG_POINTER(0);
    5224                 : 
    5225             100 :     pq_begintypsend(&buf);
    5226                 : 
    5227                 :     /* cursor */
    5228             100 :     pq_sendint(&buf, state->cursor, 4);
    5229                 : 
    5230                 :     /* data */
    5231             100 :     pq_sendbytes(&buf, state->data, state->len);
    5232                 : 
    5233             100 :     result = pq_endtypsend(&buf);
    5234                 : 
    5235             100 :     PG_RETURN_BYTEA_P(result);
    5236                 : }
    5237                 : 
    5238                 : /*
    5239                 :  * string_agg_deserialize
    5240                 :  *      Aggregate deserial function for string_agg(text) and string_agg(bytea)
    5241                 :  *
    5242                 :  * This is strict, so we need not handle NULL input
    5243                 :  */
    5244                 : Datum
    5245             100 : string_agg_deserialize(PG_FUNCTION_ARGS)
    5246                 : {
    5247                 :     bytea      *sstate;
    5248                 :     StringInfo  result;
    5249                 :     StringInfoData buf;
    5250                 :     char       *data;
    5251                 :     int         datalen;
    5252                 : 
    5253                 :     /* cannot be called directly because of internal-type argument */
    5254             100 :     Assert(AggCheckCallContext(fcinfo, NULL));
    5255                 : 
    5256             100 :     sstate = PG_GETARG_BYTEA_PP(0);
    5257                 : 
    5258                 :     /*
    5259                 :      * Copy the bytea into a StringInfo so that we can "receive" it using the
    5260                 :      * standard recv-function infrastructure.
    5261                 :      */
    5262             100 :     initStringInfo(&buf);
    5263             200 :     appendBinaryStringInfo(&buf,
    5264             200 :                            VARDATA_ANY(sstate), VARSIZE_ANY_EXHDR(sstate));
    5265                 : 
    5266             100 :     result = makeStringAggState(fcinfo);
    5267                 : 
    5268                 :     /* cursor */
    5269             100 :     result->cursor = pq_getmsgint(&buf, 4);
    5270                 : 
    5271                 :     /* data */
    5272             100 :     datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
    5273             100 :     data = (char *) pq_getmsgbytes(&buf, datalen);
    5274             100 :     appendBinaryStringInfo(result, data, datalen);
    5275                 : 
    5276             100 :     pq_getmsgend(&buf);
    5277             100 :     pfree(buf.data);
    5278                 : 
    5279             100 :     PG_RETURN_POINTER(result);
    5280                 : }
    5281                 : 
    5282                 : Datum
    5283 CBC         789 : string_agg_finalfn(PG_FUNCTION_ARGS)
    5284                 : {
    5285                 :     StringInfo  state;
    5286 ECB             : 
    5287                 :     /* cannot be called directly because of internal-type argument */
    5288 CBC         789 :     Assert(AggCheckCallContext(fcinfo, NULL));
    5289                 : 
    5290             789 :     state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
    5291                 : 
    5292             789 :     if (state != NULL)
    5293                 :     {
    5294                 :         /* As per comment in transfn, strip data before the cursor position */
    5295 GNC         753 :         PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
    5296                 :                                                   state->len - state->cursor));
    5297                 :     }
    5298 ECB             :     else
    5299 GIC          36 :         PG_RETURN_NULL();
    5300                 : }
    5301 ECB             : 
    5302                 : /*
    5303                 :  * Prepare cache with fmgr info for the output functions of the datatypes of
    5304                 :  * the arguments of a concat-like function, beginning with argument "argidx".
    5305                 :  * (Arguments before that will have corresponding slots in the resulting
    5306                 :  * FmgrInfo array, but we don't fill those slots.)
    5307                 :  */
    5308                 : static FmgrInfo *
    5309 GIC          20 : build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
    5310                 : {
    5311                 :     FmgrInfo   *foutcache;
    5312 ECB             :     int         i;
    5313                 : 
    5314                 :     /* We keep the info in fn_mcxt so it survives across calls */
    5315 CBC          20 :     foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
    5316 GIC          20 :                                                 PG_NARGS() * sizeof(FmgrInfo));
    5317                 : 
    5318              98 :     for (i = argidx; i < PG_NARGS(); i++)
    5319                 :     {
    5320                 :         Oid         valtype;
    5321                 :         Oid         typOutput;
    5322 ECB             :         bool        typIsVarlena;
    5323                 : 
    5324 GIC          78 :         valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
    5325              78 :         if (!OidIsValid(valtype))
    5326 LBC           0 :             elog(ERROR, "could not determine data type of concat() input");
    5327 ECB             : 
    5328 CBC          78 :         getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
    5329              78 :         fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
    5330                 :     }
    5331                 : 
    5332 GIC          20 :     fcinfo->flinfo->fn_extra = foutcache;
    5333                 : 
    5334              20 :     return foutcache;
    5335                 : }
    5336                 : 
    5337 ECB             : /*
    5338                 :  * Implementation of both concat() and concat_ws().
    5339                 :  *
    5340                 :  * sepstr is the separator string to place between values.
    5341                 :  * argidx identifies the first argument to concatenate (counting from zero);
    5342                 :  * note that this must be constant across any one series of calls.
    5343                 :  *
    5344                 :  * Returns NULL if result should be NULL, else text value.
    5345                 :  */
    5346                 : static text *
    5347 CBC          36 : concat_internal(const char *sepstr, int argidx,
    5348 ECB             :                 FunctionCallInfo fcinfo)
    5349                 : {
    5350                 :     text       *result;
    5351                 :     StringInfoData str;
    5352                 :     FmgrInfo   *foutcache;
    5353 GIC          36 :     bool        first_arg = true;
    5354                 :     int         i;
    5355                 : 
    5356                 :     /*
    5357                 :      * concat(VARIADIC some-array) is essentially equivalent to
    5358 ECB             :      * array_to_text(), ie concat the array elements with the given separator.
    5359                 :      * So we just pass the case off to that code.
    5360                 :      */
    5361 GIC          36 :     if (get_fn_expr_variadic(fcinfo->flinfo))
    5362 ECB             :     {
    5363                 :         ArrayType  *arr;
    5364                 : 
    5365                 :         /* Should have just the one argument */
    5366 CBC          15 :         Assert(argidx == PG_NARGS() - 1);
    5367                 : 
    5368                 :         /* concat(VARIADIC NULL) is defined as NULL */
    5369              15 :         if (PG_ARGISNULL(argidx))
    5370               6 :             return NULL;
    5371 ECB             : 
    5372                 :         /*
    5373                 :          * Non-null argument had better be an array.  We assume that any call
    5374                 :          * context that could let get_fn_expr_variadic return true will have
    5375                 :          * checked that a VARIADIC-labeled parameter actually is an array.  So
    5376                 :          * it should be okay to just Assert that it's an array rather than
    5377                 :          * doing a full-fledged error check.
    5378                 :          */
    5379 GIC           9 :         Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
    5380                 : 
    5381                 :         /* OK, safe to fetch the array value */
    5382 CBC           9 :         arr = PG_GETARG_ARRAYTYPE_P(argidx);
    5383                 : 
    5384 ECB             :         /*
    5385                 :          * And serialize the array.  We tell array_to_text to ignore null
    5386                 :          * elements, which matches the behavior of the loop below.
    5387                 :          */
    5388 GIC           9 :         return array_to_text_internal(fcinfo, arr, sepstr, NULL);
    5389                 :     }
    5390 ECB             : 
    5391                 :     /* Normal case without explicit VARIADIC marker */
    5392 GIC          21 :     initStringInfo(&str);
    5393 ECB             : 
    5394                 :     /* Get output function info, building it if first time through */
    5395 GIC          21 :     foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
    5396 CBC          21 :     if (foutcache == NULL)
    5397 GIC          20 :         foutcache = build_concat_foutcache(fcinfo, argidx);
    5398                 : 
    5399             102 :     for (i = argidx; i < PG_NARGS(); i++)
    5400                 :     {
    5401              81 :         if (!PG_ARGISNULL(i))
    5402                 :         {
    5403 CBC          75 :             Datum       value = PG_GETARG_DATUM(i);
    5404                 : 
    5405 ECB             :             /* add separator if appropriate */
    5406 CBC          75 :             if (first_arg)
    5407              21 :                 first_arg = false;
    5408 ECB             :             else
    5409 GIC          54 :                 appendStringInfoString(&str, sepstr);
    5410                 : 
    5411                 :             /* call the appropriate type output function, append the result */
    5412 CBC          75 :             appendStringInfoString(&str,
    5413              75 :                                    OutputFunctionCall(&foutcache[i], value));
    5414 ECB             :         }
    5415                 :     }
    5416                 : 
    5417 GIC          21 :     result = cstring_to_text_with_len(str.data, str.len);
    5418              21 :     pfree(str.data);
    5419 ECB             : 
    5420 GIC          21 :     return result;
    5421                 : }
    5422                 : 
    5423 ECB             : /*
    5424                 :  * Concatenate all arguments. NULL arguments are ignored.
    5425                 :  */
    5426                 : Datum
    5427 GIC          18 : text_concat(PG_FUNCTION_ARGS)
    5428                 : {
    5429                 :     text       *result;
    5430                 : 
    5431              18 :     result = concat_internal("", 0, fcinfo);
    5432 GBC          18 :     if (result == NULL)
    5433               3 :         PG_RETURN_NULL();
    5434 GIC          15 :     PG_RETURN_TEXT_P(result);
    5435                 : }
    5436 ECB             : 
    5437                 : /*
    5438                 :  * Concatenate all but first argument value with separators. The first
    5439                 :  * parameter is used as the separator. NULL arguments are ignored.
    5440                 :  */
    5441                 : Datum
    5442 GIC          21 : text_concat_ws(PG_FUNCTION_ARGS)
    5443                 : {
    5444                 :     char       *sep;
    5445                 :     text       *result;
    5446                 : 
    5447                 :     /* return NULL when separator is NULL */
    5448              21 :     if (PG_ARGISNULL(0))
    5449               3 :         PG_RETURN_NULL();
    5450              18 :     sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
    5451                 : 
    5452              18 :     result = concat_internal(sep, 1, fcinfo);
    5453              18 :     if (result == NULL)
    5454               3 :         PG_RETURN_NULL();
    5455              15 :     PG_RETURN_TEXT_P(result);
    5456                 : }
    5457                 : 
    5458 ECB             : /*
    5459                 :  * Return first n characters in the string. When n is negative,
    5460                 :  * return all but last |n| characters.
    5461                 :  */
    5462                 : Datum
    5463 GIC         942 : text_left(PG_FUNCTION_ARGS)
    5464                 : {
    5465             942 :     int         n = PG_GETARG_INT32(1);
    5466                 : 
    5467             942 :     if (n < 0)
    5468                 :     {
    5469 CBC          15 :         text       *str = PG_GETARG_TEXT_PP(0);
    5470              15 :         const char *p = VARDATA_ANY(str);
    5471              15 :         int         len = VARSIZE_ANY_EXHDR(str);
    5472 ECB             :         int         rlen;
    5473                 : 
    5474 GIC          15 :         n = pg_mbstrlen_with_len(p, len) + n;
    5475              15 :         rlen = pg_mbcharcliplen(p, len, n);
    5476              15 :         PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
    5477                 :     }
    5478 ECB             :     else
    5479 CBC         927 :         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
    5480                 : }
    5481                 : 
    5482 ECB             : /*
    5483                 :  * Return last n characters in the string. When n is negative,
    5484                 :  * return all but first |n| characters.
    5485                 :  */
    5486                 : Datum
    5487 GIC          33 : text_right(PG_FUNCTION_ARGS)
    5488                 : {
    5489              33 :     text       *str = PG_GETARG_TEXT_PP(0);
    5490              33 :     const char *p = VARDATA_ANY(str);
    5491 CBC          33 :     int         len = VARSIZE_ANY_EXHDR(str);
    5492 GIC          33 :     int         n = PG_GETARG_INT32(1);
    5493                 :     int         off;
    5494 ECB             : 
    5495 CBC          33 :     if (n < 0)
    5496 GIC          15 :         n = -n;
    5497                 :     else
    5498              18 :         n = pg_mbstrlen_with_len(p, len) - n;
    5499              33 :     off = pg_mbcharcliplen(p, len, n);
    5500                 : 
    5501              33 :     PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
    5502                 : }
    5503                 : 
    5504                 : /*
    5505 ECB             :  * Return reversed string
    5506                 :  */
    5507                 : Datum
    5508 CBC           3 : text_reverse(PG_FUNCTION_ARGS)
    5509                 : {
    5510 GIC           3 :     text       *str = PG_GETARG_TEXT_PP(0);
    5511 CBC           3 :     const char *p = VARDATA_ANY(str);
    5512               3 :     int         len = VARSIZE_ANY_EXHDR(str);
    5513 GIC           3 :     const char *endp = p + len;
    5514                 :     text       *result;
    5515                 :     char       *dst;
    5516 ECB             : 
    5517 GIC           3 :     result = palloc(len + VARHDRSZ);
    5518               3 :     dst = (char *) VARDATA(result) + len;
    5519               3 :     SET_VARSIZE(result, len + VARHDRSZ);
    5520 ECB             : 
    5521 CBC           3 :     if (pg_database_encoding_max_length() > 1)
    5522                 :     {
    5523                 :         /* multibyte version */
    5524 GIC          18 :         while (p < endp)
    5525                 :         {
    5526 ECB             :             int         sz;
    5527                 : 
    5528 GIC          15 :             sz = pg_mblen(p);
    5529              15 :             dst -= sz;
    5530              15 :             memcpy(dst, p, sz);
    5531 CBC          15 :             p += sz;
    5532 ECB             :         }
    5533                 :     }
    5534                 :     else
    5535                 :     {
    5536                 :         /* single byte version */
    5537 UIC           0 :         while (p < endp)
    5538 LBC           0 :             *(--dst) = *p++;
    5539                 :     }
    5540                 : 
    5541 GIC           3 :     PG_RETURN_TEXT_P(result);
    5542                 : }
    5543                 : 
    5544                 : 
/*
 * Support macros for text_format()
 */
#define TEXT_FORMAT_FLAG_MINUS  0x0001  /* is minus flag present? */

/*
 * ADVANCE_PARSE_POINTER
 *      Step "ptr" forward by one (it is modified in place) and raise an
 *      "unterminated format() type specifier" error if the new position is
 *      at or beyond "end_ptr".
 *
 * Wrapped in do { ... } while (0) so it can be used as a single statement.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
    do { \
        if (++(ptr) >= (end_ptr)) \
            ereport(ERROR, \
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
                     errmsg("unterminated format() type specifier"), \
                     errhint("For a single \"%%\" use \"%%%%\"."))); \
    } while (0)
    5558                 : 
    5559                 : /*
    5560                 :  * Returns a formatted string
    5561                 :  */
    5562                 : Datum
    5563 CBC       12534 : text_format(PG_FUNCTION_ARGS)
    5564 ECB             : {
    5565                 :     text       *fmt;
    5566                 :     StringInfoData str;
    5567                 :     const char *cp;
    5568                 :     const char *start_ptr;
    5569                 :     const char *end_ptr;
    5570                 :     text       *result;
    5571                 :     int         arg;
    5572                 :     bool        funcvariadic;
    5573                 :     int         nargs;
    5574 GIC       12534 :     Datum      *elements = NULL;
    5575           12534 :     bool       *nulls = NULL;
    5576           12534 :     Oid         element_type = InvalidOid;
    5577           12534 :     Oid         prev_type = InvalidOid;
    5578           12534 :     Oid         prev_width_type = InvalidOid;
    5579                 :     FmgrInfo    typoutputfinfo;
    5580 ECB             :     FmgrInfo    typoutputinfo_width;
    5581                 : 
    5582                 :     /* When format string is null, immediately return null */
    5583 GIC       12534 :     if (PG_ARGISNULL(0))
    5584               3 :         PG_RETURN_NULL();
    5585                 : 
    5586                 :     /* If argument is marked VARIADIC, expand array into elements */
    5587           12531 :     if (get_fn_expr_variadic(fcinfo->flinfo))
    5588 ECB             :     {
    5589                 :         ArrayType  *arr;
    5590                 :         int16       elmlen;
    5591                 :         bool        elmbyval;
    5592                 :         char        elmalign;
    5593                 :         int         nitems;
    5594 EUB             : 
    5595                 :         /* Should have just the one argument */
    5596 GIC          24 :         Assert(PG_NARGS() == 2);
    5597                 : 
    5598                 :         /* If argument is NULL, we treat it as zero-length array */
    5599 CBC          24 :         if (PG_ARGISNULL(1))
    5600 GIC           3 :             nitems = 0;
    5601 ECB             :         else
    5602                 :         {
    5603                 :             /*
    5604                 :              * Non-null argument had better be an array.  We assume that any
    5605                 :              * call context that could let get_fn_expr_variadic return true
    5606                 :              * will have checked that a VARIADIC-labeled parameter actually is
    5607 EUB             :              * an array.  So it should be okay to just Assert that it's an
    5608                 :              * array rather than doing a full-fledged error check.
    5609                 :              */
    5610 GIC          21 :             Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
    5611 ECB             : 
    5612 EUB             :             /* OK, safe to fetch the array value */
    5613 GIC          21 :             arr = PG_GETARG_ARRAYTYPE_P(1);
    5614 ECB             : 
    5615                 :             /* Get info about array element type */
    5616 GIC          21 :             element_type = ARR_ELEMTYPE(arr);
    5617 CBC          21 :             get_typlenbyvalalign(element_type,
    5618 ECB             :                                  &elmlen, &elmbyval, &elmalign);
    5619                 : 
    5620                 :             /* Extract all array elements */
    5621 GBC          21 :             deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
    5622 EUB             :                               &elements, &nulls, &nitems);
    5623                 :         }
    5624                 : 
    5625 GIC          24 :         nargs = nitems + 1;
    5626              24 :         funcvariadic = true;
    5627                 :     }
    5628 EUB             :     else
    5629                 :     {
    5630                 :         /* Non-variadic case, we'll process the arguments individually */
    5631 GIC       12507 :         nargs = PG_NARGS();
    5632           12507 :         funcvariadic = false;
    5633 EUB             :     }
    5634                 : 
    5635                 :     /* Setup for main loop. */
    5636 GIC       12531 :     fmt = PG_GETARG_TEXT_PP(0);
    5637           12531 :     start_ptr = VARDATA_ANY(fmt);
    5638 GBC       12531 :     end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
    5639 GIC       12531 :     initStringInfo(&str);
    5640           12531 :     arg = 1;                    /* next argument position to print */
    5641 EUB             : 
    5642                 :     /* Scan format string, looking for conversion specifiers. */
    5643 GBC      363849 :     for (cp = start_ptr; cp < end_ptr; cp++)
    5644                 :     {
    5645                 :         int         argpos;
    5646                 :         int         widthpos;
    5647                 :         int         flags;
    5648 ECB             :         int         width;
    5649                 :         Datum       value;
    5650                 :         bool        isNull;
    5651                 :         Oid         typid;
    5652                 : 
    5653                 :         /*
    5654                 :          * If it's not the start of a conversion specifier, just copy it to
    5655                 :          * the output buffer.
    5656                 :          */
    5657 GIC      351348 :         if (*cp != '%')
    5658 ECB             :         {
    5659 CBC      321807 :             appendStringInfoCharMacro(&str, *cp);
    5660          321816 :             continue;
    5661                 :         }
    5662                 : 
    5663 GIC       29541 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5664 ECB             : 
    5665                 :         /* Easy case: %% outputs a single % */
    5666 CBC       29541 :         if (*cp == '%')
    5667                 :         {
    5668               9 :             appendStringInfoCharMacro(&str, *cp);
    5669 GBC           9 :             continue;
    5670                 :         }
    5671 ECB             : 
    5672                 :         /* Parse the optional portions of the format specifier */
    5673 GIC       29532 :         cp = text_format_parse_format(cp, end_ptr,
    5674                 :                                       &argpos, &widthpos,
    5675                 :                                       &flags, &width);
    5676                 : 
    5677                 :         /*
    5678 ECB             :          * Next we should see the main conversion specifier.  Whether or not
    5679                 :          * an argument position was present, it's known that at least one
    5680                 :          * character remains in the string at this point.  Experience suggests
    5681                 :          * that it's worth checking that that character is one of the expected
    5682                 :          * ones before we try to fetch arguments, so as to produce the least
    5683                 :          * confusing response to a mis-formatted specifier.
    5684                 :          */
    5685 CBC       29520 :         if (strchr("sIL", *cp) == NULL)
    5686 GIC           3 :             ereport(ERROR,
    5687                 :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5688                 :                      errmsg("unrecognized format() type specifier \"%.*s\"",
    5689                 :                             pg_mblen(cp), cp),
    5690                 :                      errhint("For a single \"%%\" use \"%%%%\".")));
    5691 ECB             : 
    5692                 :         /* If indirect width was specified, get its value */
    5693 CBC       29517 :         if (widthpos >= 0)
    5694                 :         {
    5695                 :             /* Collect the specified or next argument position */
    5696              21 :             if (widthpos > 0)
    5697 GIC          18 :                 arg = widthpos;
    5698              21 :             if (arg >= nargs)
    5699 LBC           0 :                 ereport(ERROR,
    5700 EUB             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5701                 :                          errmsg("too few arguments for format()")));
    5702                 : 
    5703                 :             /* Get the value and type of the selected argument */
    5704 GIC          21 :             if (!funcvariadic)
    5705                 :             {
    5706              21 :                 value = PG_GETARG_DATUM(arg);
    5707              21 :                 isNull = PG_ARGISNULL(arg);
    5708              21 :                 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
    5709                 :             }
    5710                 :             else
    5711                 :             {
    5712 LBC           0 :                 value = elements[arg - 1];
    5713               0 :                 isNull = nulls[arg - 1];
    5714               0 :                 typid = element_type;
    5715 ECB             :             }
    5716 GIC          21 :             if (!OidIsValid(typid))
    5717 UIC           0 :                 elog(ERROR, "could not determine data type of format() input");
    5718 ECB             : 
    5719 CBC          21 :             arg++;
    5720                 : 
    5721 ECB             :             /* We can treat NULL width the same as zero */
    5722 GIC          21 :             if (isNull)
    5723               3 :                 width = 0;
    5724              18 :             else if (typid == INT4OID)
    5725              18 :                 width = DatumGetInt32(value);
    5726 UIC           0 :             else if (typid == INT2OID)
    5727               0 :                 width = DatumGetInt16(value);
    5728                 :             else
    5729                 :             {
    5730                 :                 /* For less-usual datatypes, convert to text then to int */
    5731                 :                 char       *str;
    5732                 : 
    5733               0 :                 if (typid != prev_width_type)
    5734                 :                 {
    5735 ECB             :                     Oid         typoutputfunc;
    5736                 :                     bool        typIsVarlena;
    5737                 : 
    5738 LBC           0 :                     getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
    5739               0 :                     fmgr_info(typoutputfunc, &typoutputinfo_width);
    5740 UIC           0 :                     prev_width_type = typid;
    5741 ECB             :                 }
    5742                 : 
    5743 LBC           0 :                 str = OutputFunctionCall(&typoutputinfo_width, value);
    5744                 : 
    5745 ECB             :                 /* pg_strtoint32 will complain about bad data or overflow */
    5746 LBC           0 :                 width = pg_strtoint32(str);
    5747 EUB             : 
    5748 UIC           0 :                 pfree(str);
    5749                 :             }
    5750 ECB             :         }
    5751                 : 
    5752                 :         /* Collect the specified or next argument position */
    5753 GIC       29517 :         if (argpos > 0)
    5754 CBC          66 :             arg = argpos;
    5755           29517 :         if (arg >= nargs)
    5756 GIC          12 :             ereport(ERROR,
    5757 ECB             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5758                 :                      errmsg("too few arguments for format()")));
    5759                 : 
    5760                 :         /* Get the value and type of the selected argument */
    5761 GIC       29505 :         if (!funcvariadic)
    5762                 :         {
    5763           28869 :             value = PG_GETARG_DATUM(arg);
    5764           28869 :             isNull = PG_ARGISNULL(arg);
    5765           28869 :             typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
    5766                 :         }
    5767                 :         else
    5768                 :         {
    5769             636 :             value = elements[arg - 1];
    5770             636 :             isNull = nulls[arg - 1];
    5771             636 :             typid = element_type;
    5772                 :         }
    5773           29505 :         if (!OidIsValid(typid))
    5774 UIC           0 :             elog(ERROR, "could not determine data type of format() input");
    5775                 : 
    5776 GIC       29505 :         arg++;
    5777                 : 
    5778                 :         /*
    5779                 :          * Get the appropriate typOutput function, reusing previous one if
    5780                 :          * same type as previous argument.  That's particularly useful in the
    5781                 :          * variadic-array case, but often saves work even for ordinary calls.
    5782                 :          */
    5783           29505 :         if (typid != prev_type)
    5784 ECB             :         {
    5785                 :             Oid         typoutputfunc;
    5786                 :             bool        typIsVarlena;
    5787                 : 
    5788 CBC       13980 :             getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
    5789 GIC       13980 :             fmgr_info(typoutputfunc, &typoutputfinfo);
    5790           13980 :             prev_type = typid;
    5791                 :         }
    5792 ECB             : 
    5793                 :         /*
    5794                 :          * And now we can format the value.
    5795                 :          */
    5796 GIC       29505 :         switch (*cp)
    5797                 :         {
    5798 CBC       29505 :             case 's':
    5799                 :             case 'I':
    5800 ECB             :             case 'L':
    5801 GIC       29505 :                 text_format_string_conversion(&str, *cp, &typoutputfinfo,
    5802                 :                                               value, isNull,
    5803 ECB             :                                               flags, width);
    5804 CBC       29502 :                 break;
    5805 UIC           0 :             default:
    5806                 :                 /* should not get here, because of previous check */
    5807 LBC           0 :                 ereport(ERROR,
    5808                 :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5809 ECB             :                          errmsg("unrecognized format() type specifier \"%.*s\"",
    5810                 :                                 pg_mblen(cp), cp),
    5811                 :                          errhint("For a single \"%%\" use \"%%%%\".")));
    5812                 :                 break;
    5813                 :         }
    5814                 :     }
    5815                 : 
    5816                 :     /* Don't need deconstruct_array results anymore. */
    5817 CBC       12501 :     if (elements != NULL)
    5818 GIC          21 :         pfree(elements);
    5819 CBC       12501 :     if (nulls != NULL)
    5820              21 :         pfree(nulls);
    5821                 : 
    5822                 :     /* Generate results. */
    5823           12501 :     result = cstring_to_text_with_len(str.data, str.len);
    5824 GIC       12501 :     pfree(str.data);
    5825                 : 
    5826 CBC       12501 :     PG_RETURN_TEXT_P(result);
    5827 ECB             : }
    5828                 : 
    5829                 : /*
    5830                 :  * Parse contiguous digits as a decimal number.
    5831 EUB             :  *
    5832                 :  * Returns true if some digits could be parsed.
    5833                 :  * The value is returned into *value, and *ptr is advanced to the next
    5834                 :  * character to be parsed.
    5835 ECB             :  *
    5836                 :  * Note parsing invariant: at least one character is known available before
    5837                 :  * string end (end_ptr) at entry, and this is still true at exit.
    5838                 :  */
    5839                 : static bool
    5840 GIC       59046 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
    5841 ECB             : {
    5842 GIC       59046 :     bool        found = false;
    5843           59046 :     const char *cp = *ptr;
    5844 CBC       59046 :     int         val = 0;
    5845                 : 
    5846 GIC       59202 :     while (*cp >= '0' && *cp <= '9')
    5847                 :     {
    5848             159 :         int8        digit = (*cp - '0');
    5849 ECB             : 
    5850 CBC         159 :         if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
    5851 GIC         159 :             unlikely(pg_add_s32_overflow(val, digit, &val)))
    5852 UIC           0 :             ereport(ERROR,
    5853                 :                     (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    5854 ECB             :                      errmsg("number is out of range")));
    5855 GIC         159 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5856             156 :         found = true;
    5857                 :     }
    5858                 : 
    5859           59043 :     *ptr = cp;
    5860           59043 :     *value = val;
    5861 ECB             : 
    5862 GIC       59043 :     return found;
    5863                 : }
    5864                 : 
    5865                 : /*
    5866                 :  * Parse a format specifier (generally following the SUS printf spec).
    5867                 :  *
    5868                 :  * We have already advanced over the initial '%', and we are looking for
    5869 ECB             :  * [argpos][flags][width]type (but the type character is not consumed here).
    5870                 :  *
    5871                 :  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
    5872                 :  * Output parameters:
    5873                 :  *  argpos: argument position for value to be printed.  -1 means unspecified.
    5874                 :  *  widthpos: argument position for width.  Zero means the argument position
    5875                 :  *          was unspecified (ie, take the next arg) and -1 means no width
    5876                 :  *          argument (width was omitted or specified as a constant).
    5877                 :  *  flags: bitmask of flags.
    5878                 :  *  width: directly-specified width value.  Zero means the width was omitted
    5879                 :  *          (note it's not necessary to distinguish this case from an explicit
    5880                 :  *          zero width value).
    5881                 :  *
    5882                 :  * The function result is the next character position to be parsed, ie, the
    5883                 :  * location where the type character is/should be.
    5884                 :  *
    5885                 :  * Note parsing invariant: at least one character is known available before
    5886                 :  * string end (end_ptr) at entry, and this is still true at exit.
    5887                 :  */
    5888                 : static const char *
    5889 CBC       29532 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
    5890                 :                          int *argpos, int *widthpos,
    5891 ECB             :                          int *flags, int *width)
    5892                 : {
    5893 CBC       29532 :     const char *cp = start_ptr;
    5894                 :     int         n;
    5895 ECB             : 
    5896                 :     /* set defaults for output parameters */
    5897 CBC       29532 :     *argpos = -1;
    5898 GIC       29532 :     *widthpos = -1;
    5899           29532 :     *flags = 0;
    5900 CBC       29532 :     *width = 0;
    5901                 : 
    5902                 :     /* try to identify first number */
    5903           29532 :     if (text_format_parse_digits(&cp, end_ptr, &n))
    5904                 :     {
    5905 GIC          87 :         if (*cp != '$')
    5906                 :         {
    5907                 :             /* Must be just a width and a type, so we're done */
    5908              12 :             *width = n;
    5909              12 :             return cp;
    5910 ECB             :         }
    5911                 :         /* The number was argument position */
    5912 GIC          75 :         *argpos = n;
    5913 ECB             :         /* Explicit 0 for argument index is immediately refused */
    5914 GIC          75 :         if (n == 0)
    5915               3 :             ereport(ERROR,
    5916                 :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5917 ECB             :                      errmsg("format specifies argument 0, but arguments are numbered from 1")));
    5918 GIC          72 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5919 ECB             :     }
    5920                 : 
    5921                 :     /* Handle flags (only minus is supported now) */
    5922 GIC       29529 :     while (*cp == '-')
    5923 ECB             :     {
    5924 GIC          15 :         *flags |= TEXT_FORMAT_FLAG_MINUS;
    5925              15 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5926 ECB             :     }
    5927                 : 
    5928 CBC       29514 :     if (*cp == '*')
    5929 EUB             :     {
    5930                 :         /* Handle indirect width */
    5931 GIC          24 :         ADVANCE_PARSE_POINTER(cp, end_ptr);
    5932 CBC          24 :         if (text_format_parse_digits(&cp, end_ptr, &n))
    5933                 :         {
    5934 ECB             :             /* number in this position must be closed by $ */
    5935 CBC          21 :             if (*cp != '$')
    5936 UIC           0 :                 ereport(ERROR,
    5937 ECB             :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5938                 :                          errmsg("width argument position must be ended by \"$\"")));
    5939                 :             /* The number was width argument position */
    5940 GIC          21 :             *widthpos = n;
    5941 ECB             :             /* Explicit 0 for argument index is immediately refused */
    5942 CBC          21 :             if (n == 0)
    5943               3 :                 ereport(ERROR,
    5944                 :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    5945                 :                          errmsg("format specifies argument 0, but arguments are numbered from 1")));
    5946 GIC          18 :             ADVANCE_PARSE_POINTER(cp, end_ptr);
    5947                 :         }
    5948 ECB             :         else
    5949 CBC           3 :             *widthpos = 0;      /* width's argument position is unspecified */
    5950 ECB             :     }
    5951                 :     else
    5952                 :     {
    5953                 :         /* Check for direct width specification */
    5954 GIC       29490 :         if (text_format_parse_digits(&cp, end_ptr, &n))
    5955              15 :             *width = n;
    5956                 :     }
    5957                 : 
    5958                 :     /* cp should now be pointing at type character */
    5959           29508 :     return cp;
    5960                 : }
    5961                 : 
    5962 ECB             : /*
    5963                 :  * Format a %s, %I, or %L conversion
    5964                 :  */
    5965                 : static void
    5966 GIC       29505 : text_format_string_conversion(StringInfo buf, char conversion,
    5967                 :                               FmgrInfo *typOutputInfo,
    5968                 :                               Datum value, bool isNull,
    5969                 :                               int flags, int width)
    5970                 : {
    5971                 :     char       *str;
    5972 EUB             : 
    5973                 :     /* Handle NULL arguments before trying to stringify the value. */
    5974 GBC       29505 :     if (isNull)
    5975                 :     {
    5976             153 :         if (conversion == 's')
    5977             117 :             text_format_append_string(buf, "", flags, width);
    5978              36 :         else if (conversion == 'L')
    5979 GIC          33 :             text_format_append_string(buf, "NULL", flags, width);
    5980 GBC           3 :         else if (conversion == 'I')
    5981 GIC           3 :             ereport(ERROR,
    5982                 :                     (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
    5983                 :                      errmsg("null values cannot be formatted as an SQL identifier")));
    5984             150 :         return;
    5985                 :     }
    5986                 : 
    5987                 :     /* Stringify. */
    5988           29352 :     str = OutputFunctionCall(typOutputInfo, value);
    5989                 : 
    5990                 :     /* Escape. */
    5991           29352 :     if (conversion == 'I')
    5992                 :     {
    5993                 :         /* quote_identifier may or may not allocate a new string. */
    5994            1553 :         text_format_append_string(buf, quote_identifier(str), flags, width);
    5995                 :     }
    5996           27799 :     else if (conversion == 'L')
    5997                 :     {
    5998            1289 :         char       *qstr = quote_literal_cstr(str);
    5999                 : 
    6000            1289 :         text_format_append_string(buf, qstr, flags, width);
    6001                 :         /* quote_literal_cstr() always allocates a new string */
    6002            1289 :         pfree(qstr);
    6003                 :     }
    6004                 :     else
    6005           26510 :         text_format_append_string(buf, str, flags, width);
    6006                 : 
    6007                 :     /* Cleanup. */
    6008 CBC       29352 :     pfree(str);
    6009                 : }
    6010 ECB             : 
    6011                 : /*
    6012                 :  * Append str to buf, padding as directed by flags/width
    6013                 :  */
    6014                 : static void
    6015 CBC       29502 : text_format_append_string(StringInfo buf, const char *str,
    6016 ECB             :                           int flags, int width)
    6017                 : {
    6018 GIC       29502 :     bool        align_to_left = false;
    6019                 :     int         len;
    6020                 : 
    6021                 :     /* fast path for typical easy case */
    6022           29502 :     if (width == 0)
    6023                 :     {
    6024           29460 :         appendStringInfoString(buf, str);
    6025           29460 :         return;
    6026                 :     }
    6027                 : 
    6028 CBC          42 :     if (width < 0)
    6029                 :     {
    6030                 :         /* Negative width: implicit '-' flag, then take absolute value */
    6031 GIC           3 :         align_to_left = true;
    6032 ECB             :         /* -INT_MIN is undefined */
    6033 GIC           3 :         if (width <= INT_MIN)
    6034 LBC           0 :             ereport(ERROR,
    6035 ECB             :                     (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
    6036 EUB             :                      errmsg("number is out of range")));
    6037 GIC           3 :         width = -width;
    6038                 :     }
    6039              39 :     else if (flags & TEXT_FORMAT_FLAG_MINUS)
    6040              12 :         align_to_left = true;
    6041                 : 
    6042 CBC          42 :     len = pg_mbstrlen(str);
    6043              42 :     if (align_to_left)
    6044 EUB             :     {
    6045                 :         /* left justify */
    6046 CBC          15 :         appendStringInfoString(buf, str);
    6047              15 :         if (len < width)
    6048 GIC          15 :             appendStringInfoSpaces(buf, width - len);
    6049 ECB             :     }
    6050                 :     else
    6051                 :     {
    6052                 :         /* right justify */
    6053 CBC          27 :         if (len < width)
    6054              27 :             appendStringInfoSpaces(buf, width - len);
    6055 GIC          27 :         appendStringInfoString(buf, str);
    6056                 :     }
    6057                 : }
    6058                 : 
    6059                 : /*
    6060                 :  * text_format_nv - nonvariadic wrapper for text_format function.
    6061                 :  *
    6062                 :  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
    6063 ECB             :  * which checks that all built-in functions that share the implementing C
    6064                 :  * function take the same number of arguments.
    6065                 :  */
    6066                 : Datum
    6067 CBC          15 : text_format_nv(PG_FUNCTION_ARGS)
    6068                 : {
    6069 GIC          15 :     return text_format(fcinfo);
    6070                 : }
    6071                 : 
    6072                 : /*
    6073                 :  * Helper function for Levenshtein distance functions. Faster than memcmp(),
    6074                 :  * for this use case.
    6075                 :  */
    6076 ECB             : static inline bool
    6077 UIC           0 : rest_of_char_same(const char *s1, const char *s2, int len)
    6078 ECB             : {
    6079 UIC           0 :     while (len > 0)
    6080                 :     {
    6081               0 :         len--;
    6082               0 :         if (s1[len] != s2[len])
    6083 LBC           0 :             return false;
    6084 EUB             :     }
    6085 UIC           0 :     return true;
    6086                 : }
    6087                 : 
    6088 ECB             : /* Expand each Levenshtein distance variant */
    6089                 : #include "levenshtein.c"
    6090                 : #define LEVENSHTEIN_LESS_EQUAL
    6091                 : #include "levenshtein.c"
    6092                 : 
    6093                 : 
    6094                 : /*
    6095                 :  * The following *ClosestMatch() functions can be used to determine whether a
    6096                 :  * user-provided string resembles any known valid values, which is useful for
    6097                 :  * providing hints in log messages, among other things.  Use these functions
    6098                 :  * like so:
    6099                 :  *
    6100                 :  *      initClosestMatch(&state, source_string, max_distance);
    6101                 :  *
    6102                 :  *      for (int i = 0; i < num_valid_strings; i++)
    6103                 :  *          updateClosestMatch(&state, valid_strings[i]);
    6104                 :  *
    6105                 :  *      closestMatch = getClosestMatch(&state);
    6106                 :  */
    6107                 : 
    6108                 : /*
    6109                 :  * Initialize the given state with the source string and maximum Levenshtein
    6110                 :  * distance to consider.
    6111                 :  */
    6112                 : void
    6113 GNC          28 : initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
    6114                 : {
    6115              28 :     Assert(state);
    6116              28 :     Assert(max_d >= 0);
    6117                 : 
    6118              28 :     state->source = source;
    6119              28 :     state->min_d = -1;
    6120              28 :     state->max_d = max_d;
    6121              28 :     state->match = NULL;
    6122              28 : }
    6123                 : 
    6124                 : /*
    6125                 :  * If the candidate string is a closer match than the current one saved (or
    6126                 :  * there is no match saved), save it as the closest match.
    6127                 :  *
    6128                 :  * If the source or candidate string is NULL, empty, or too long, this function
    6129                 :  * takes no action.  Likewise, if the Levenshtein distance exceeds the maximum
    6130                 :  * allowed or more than half the characters are different, no action is taken.
    6131                 :  */
    6132                 : void
    6133             159 : updateClosestMatch(ClosestMatchState *state, const char *candidate)
    6134                 : {
    6135                 :     int         dist;
    6136                 : 
    6137             159 :     Assert(state);
    6138                 : 
    6139             159 :     if (state->source == NULL || state->source[0] == '\0' ||
    6140             159 :         candidate == NULL || candidate[0] == '\0')
    6141 UNC           0 :         return;
    6142                 : 
    6143                 :     /*
    6144                 :      * To avoid ERROR-ing, we check the lengths here instead of setting
    6145                 :      * 'trusted' to false in the call to varstr_levenshtein_less_equal().
    6146                 :      */
    6147 GNC         159 :     if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
    6148             159 :         strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
    6149 UNC           0 :         return;
    6150                 : 
    6151 GNC         159 :     dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
    6152             159 :                                          candidate, strlen(candidate), 1, 1, 1,
    6153                 :                                          state->max_d, true);
    6154             159 :     if (dist <= state->max_d &&
    6155              28 :         dist <= strlen(state->source) / 2 &&
    6156               7 :         (state->min_d == -1 || dist < state->min_d))
    6157                 :     {
    6158               7 :         state->min_d = dist;
    6159               7 :         state->match = candidate;
    6160                 :     }
    6161                 : }
    6162                 : 
    6163                 : /*
    6164                 :  * Return the closest match.  If no suitable candidates were provided via
    6165                 :  * updateClosestMatch(), return NULL.
    6166                 :  */
    6167                 : const char *
    6168              28 : getClosestMatch(ClosestMatchState *state)
    6169                 : {
    6170              28 :     Assert(state);
    6171                 : 
    6172              28 :     return state->match;
    6173                 : }
    6174                 : 
    6175                 : 
    6176 ECB             : /*
    6177                 :  * Unicode support
    6178                 :  */
    6179                 : 
    6180                 : static UnicodeNormalizationForm
    6181 GIC          93 : unicode_norm_form_from_string(const char *formstr)
    6182                 : {
    6183 CBC          93 :     UnicodeNormalizationForm form = -1;
    6184                 : 
    6185                 :     /*
    6186                 :      * Might as well check this while we're here.
    6187 ECB             :      */
    6188 GIC          93 :     if (GetDatabaseEncoding() != PG_UTF8)
    6189 LBC           0 :         ereport(ERROR,
    6190 ECB             :                 (errcode(ERRCODE_SYNTAX_ERROR),
    6191                 :                  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
    6192                 : 
    6193 GIC          93 :     if (pg_strcasecmp(formstr, "NFC") == 0)
    6194              33 :         form = UNICODE_NFC;
    6195              60 :     else if (pg_strcasecmp(formstr, "NFD") == 0)
    6196              18 :         form = UNICODE_NFD;
    6197              42 :     else if (pg_strcasecmp(formstr, "NFKC") == 0)
    6198              18 :         form = UNICODE_NFKC;
    6199 CBC          24 :     else if (pg_strcasecmp(formstr, "NFKD") == 0)
    6200 GIC          18 :         form = UNICODE_NFKD;
    6201                 :     else
    6202 CBC           6 :         ereport(ERROR,
    6203 ECB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    6204                 :                  errmsg("invalid normalization form: %s", formstr)));
    6205                 : 
    6206 GIC          87 :     return form;
    6207 ECB             : }
    6208                 : 
    6209                 : Datum
    6210 CBC          24 : unicode_normalize_func(PG_FUNCTION_ARGS)
    6211 ECB             : {
    6212 GIC          24 :     text       *input = PG_GETARG_TEXT_PP(0);
    6213              24 :     char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
    6214 ECB             :     UnicodeNormalizationForm form;
    6215                 :     int         size;
    6216                 :     pg_wchar   *input_chars;
    6217                 :     pg_wchar   *output_chars;
    6218                 :     unsigned char *p;
    6219                 :     text       *result;
    6220                 :     int         i;
    6221                 : 
    6222 CBC          24 :     form = unicode_norm_form_from_string(formstr);
    6223 ECB             : 
    6224                 :     /* convert to pg_wchar */
    6225 GIC          21 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    6226 CBC          21 :     input_chars = palloc((size + 1) * sizeof(pg_wchar));
    6227              21 :     p = (unsigned char *) VARDATA_ANY(input);
    6228 GIC          84 :     for (i = 0; i < size; i++)
    6229 ECB             :     {
    6230 CBC          63 :         input_chars[i] = utf8_to_unicode(p);
    6231 GIC          63 :         p += pg_utf_mblen(p);
    6232 ECB             :     }
    6233 CBC          21 :     input_chars[i] = (pg_wchar) '\0';
    6234 GIC          21 :     Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
    6235 ECB             : 
    6236                 :     /* action */
    6237 CBC          21 :     output_chars = unicode_normalize(form, input_chars);
    6238                 : 
    6239                 :     /* convert back to UTF-8 string */
    6240 GIC          21 :     size = 0;
    6241              81 :     for (pg_wchar *wp = output_chars; *wp; wp++)
    6242                 :     {
    6243                 :         unsigned char buf[4];
    6244                 : 
    6245              60 :         unicode_to_utf8(*wp, buf);
    6246              60 :         size += pg_utf_mblen(buf);
    6247                 :     }
    6248                 : 
    6249              21 :     result = palloc(size + VARHDRSZ);
    6250              21 :     SET_VARSIZE(result, size + VARHDRSZ);
    6251                 : 
    6252              21 :     p = (unsigned char *) VARDATA_ANY(result);
    6253 CBC          81 :     for (pg_wchar *wp = output_chars; *wp; wp++)
    6254                 :     {
    6255              60 :         unicode_to_utf8(*wp, p);
    6256              60 :         p += pg_utf_mblen(p);
    6257                 :     }
    6258 GIC          21 :     Assert((char *) p == (char *) result + size + VARHDRSZ);
    6259                 : 
    6260              21 :     PG_RETURN_TEXT_P(result);
    6261                 : }
    6262                 : 
    6263                 : /*
    6264                 :  * Check whether the string is in the specified Unicode normalization form.
    6265                 :  *
    6266                 :  * This is done by converting the string to the specified normal form and then
    6267 ECB             :  * comparing that to the original string.  To speed that up, we also apply the
    6268                 :  * "quick check" algorithm specified in UAX #15, which can give a yes or no
    6269                 :  * answer for many strings by just scanning the string once.
    6270                 :  *
    6271                 :  * This function should generally be optimized for the case where the string
    6272                 :  * is in fact normalized.  In that case, we'll end up looking at the entire
    6273                 :  * string, so it's probably not worth doing any incremental conversion etc.
    6274                 :  */
    6275                 : Datum
    6276 CBC          69 : unicode_is_normalized(PG_FUNCTION_ARGS)
    6277                 : {
    6278              69 :     text       *input = PG_GETARG_TEXT_PP(0);
    6279              69 :     char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
    6280                 :     UnicodeNormalizationForm form;
    6281                 :     int         size;
    6282 ECB             :     pg_wchar   *input_chars;
    6283                 :     pg_wchar   *output_chars;
    6284                 :     unsigned char *p;
    6285                 :     int         i;
    6286                 :     UnicodeNormalizationQC quickcheck;
    6287                 :     int         output_size;
    6288                 :     bool        result;
    6289                 : 
    6290 GIC          69 :     form = unicode_norm_form_from_string(formstr);
    6291 ECB             : 
    6292                 :     /* convert to pg_wchar */
    6293 CBC          66 :     size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
    6294 GIC          66 :     input_chars = palloc((size + 1) * sizeof(pg_wchar));
    6295 CBC          66 :     p = (unsigned char *) VARDATA_ANY(input);
    6296             252 :     for (i = 0; i < size; i++)
    6297                 :     {
    6298             186 :         input_chars[i] = utf8_to_unicode(p);
    6299 GIC         186 :         p += pg_utf_mblen(p);
    6300                 :     }
    6301              66 :     input_chars[i] = (pg_wchar) '\0';
    6302              66 :     Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
    6303                 : 
    6304                 :     /* quick check (see UAX #15) */
    6305 CBC          66 :     quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
    6306 GIC          66 :     if (quickcheck == UNICODE_NORM_QC_YES)
    6307 CBC          21 :         PG_RETURN_BOOL(true);
    6308              45 :     else if (quickcheck == UNICODE_NORM_QC_NO)
    6309               6 :         PG_RETURN_BOOL(false);
    6310                 : 
    6311 ECB             :     /* normalize and compare with original */
    6312 GIC          39 :     output_chars = unicode_normalize(form, input_chars);
    6313                 : 
    6314              39 :     output_size = 0;
    6315 CBC         162 :     for (pg_wchar *wp = output_chars; *wp; wp++)
    6316 GIC         123 :         output_size++;
    6317 ECB             : 
    6318 CBC          57 :     result = (size == output_size) &&
    6319              18 :         (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
    6320 ECB             : 
    6321 CBC          39 :     PG_RETURN_BOOL(result);
    6322 ECB             : }
    6323 EUB             : 
    /*
     * Report whether each of the first n bytes at instr is a hexadecimal
     * digit.  Trivially true when n is zero.
     */
    static bool
    isxdigits_n(const char *instr, size_t n)
    {
        while (n-- > 0)
        {
            /* cast keeps high-bit bytes in the range isxdigit() accepts */
            unsigned char ch = (unsigned char) *instr++;

            if (!isxdigit(ch))
                return false;
        }

        return true;
    }
    6336                 : 
    6337                 : static unsigned int
    6338 CBC         252 : hexval(unsigned char c)
    6339                 : {
    6340 GIC         252 :     if (c >= '0' && c <= '9')
    6341             192 :         return c - '0';
    6342              60 :     if (c >= 'a' && c <= 'f')
    6343              30 :         return c - 'a' + 0xA;
    6344              30 :     if (c >= 'A' && c <= 'F')
    6345 CBC          30 :         return c - 'A' + 0xA;
    6346 UIC           0 :     elog(ERROR, "invalid hexadecimal digit");
    6347 ECB             :     return 0;                   /* not reached */
    6348                 : }
    6349                 : 
    /*
     * Translate the first n hexadecimal digits at instr into a number.
     *
     * The most significant digit comes first.  Every byte is converted with
     * hexval(), which raises an error for a non-hex character, so callers are
     * expected to have validated the digits (e.g. with isxdigits_n()) first.
     *
     * The value is accumulated by shifting the running total left one nibble
     * per digit rather than shifting each digit by 4 * (n - i - 1): that shift
     * count reaches the width of unsigned int once n exceeds 8, which is
     * undefined behavior.  For n <= 8 the two forms yield the same result.
     */
    static unsigned int
    hexval_n(const char *instr, size_t n)
    {
        unsigned int result = 0;

        for (size_t i = 0; i < n; i++)
            result = (result << 4) | hexval((unsigned char) instr[i]);

        return result;
    }
    6363                 : 
    6364                 : /*
    6365                 :  * Replaces Unicode escape sequences by Unicode characters
    6366                 :  */
    6367                 : Datum
    6368 GBC          33 : unistr(PG_FUNCTION_ARGS)
    6369 ECB             : {
    6370 CBC          33 :     text       *input_text = PG_GETARG_TEXT_PP(0);
    6371 ECB             :     char       *instr;
    6372                 :     int         len;
    6373                 :     StringInfoData str;
    6374                 :     text       *result;
    6375 CBC          33 :     pg_wchar    pair_first = 0;
    6376                 :     char        cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
    6377 ECB             : 
    6378 GIC          33 :     instr = VARDATA_ANY(input_text);
    6379 CBC          33 :     len = VARSIZE_ANY_EXHDR(input_text);
    6380                 : 
    6381              33 :     initStringInfo(&str);
    6382 EUB             : 
    6383 GIC         255 :     while (len > 0)
    6384                 :     {
    6385             243 :         if (instr[0] == '\\')
    6386 ECB             :         {
    6387 GIC          51 :             if (len >= 2 &&
    6388 CBC          51 :                 instr[1] == '\\')
    6389                 :             {
    6390 GBC           3 :                 if (pair_first)
    6391 UBC           0 :                     goto invalid_pair;
    6392 GIC           3 :                 appendStringInfoChar(&str, '\\');
    6393               3 :                 instr += 2;
    6394 CBC           3 :                 len -= 2;
    6395                 :             }
    6396              48 :             else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
    6397 GBC          33 :                      (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
    6398 GIC          15 :             {
    6399 ECB             :                 pg_wchar    unicode;
    6400 CBC          21 :                 int         offset = instr[1] == 'u' ? 2 : 1;
    6401                 : 
    6402 GIC          21 :                 unicode = hexval_n(instr + offset, 4);
    6403 ECB             : 
    6404 CBC          21 :                 if (!is_valid_unicode_codepoint(unicode))
    6405 UIC           0 :                     ereport(ERROR,
    6406                 :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    6407 ECB             :                             errmsg("invalid Unicode code point: %04X", unicode));
    6408                 : 
    6409 GIC          21 :                 if (pair_first)
    6410 ECB             :                 {
    6411 CBC           6 :                     if (is_utf16_surrogate_second(unicode))
    6412                 :                     {
    6413 UIC           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    6414 LBC           0 :                         pair_first = 0;
    6415                 :                     }
    6416 ECB             :                     else
    6417 CBC           6 :                         goto invalid_pair;
    6418                 :                 }
    6419 GIC          15 :                 else if (is_utf16_surrogate_second(unicode))
    6420 UIC           0 :                     goto invalid_pair;
    6421 ECB             : 
    6422 GIC          15 :                 if (is_utf16_surrogate_first(unicode))
    6423 CBC           9 :                     pair_first = unicode;
    6424                 :                 else
    6425 EUB             :                 {
    6426 GBC           6 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    6427 GIC           6 :                     appendStringInfoString(&str, cbuf);
    6428                 :                 }
    6429 ECB             : 
    6430 GIC          15 :                 instr += 4 + offset;
    6431 CBC          15 :                 len -= 4 + offset;
    6432 EUB             :             }
    6433 GIC          27 :             else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
    6434 CBC           6 :             {
    6435 ECB             :                 pg_wchar    unicode;
    6436                 : 
    6437 GIC          12 :                 unicode = hexval_n(instr + 2, 6);
    6438 ECB             : 
    6439 CBC          12 :                 if (!is_valid_unicode_codepoint(unicode))
    6440 GIC           3 :                     ereport(ERROR,
    6441                 :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    6442 ECB             :                             errmsg("invalid Unicode code point: %04X", unicode));
    6443                 : 
    6444 GIC           9 :                 if (pair_first)
    6445 ECB             :                 {
    6446 CBC           3 :                     if (is_utf16_surrogate_second(unicode))
    6447                 :                     {
    6448 UIC           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    6449 LBC           0 :                         pair_first = 0;
    6450                 :                     }
    6451 ECB             :                     else
    6452 CBC           3 :                         goto invalid_pair;
    6453                 :                 }
    6454 GIC           6 :                 else if (is_utf16_surrogate_second(unicode))
    6455 UIC           0 :                     goto invalid_pair;
    6456 ECB             : 
    6457 GIC           6 :                 if (is_utf16_surrogate_first(unicode))
    6458 CBC           3 :                     pair_first = unicode;
    6459                 :                 else
    6460 EUB             :                 {
    6461 GBC           3 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    6462 GIC           3 :                     appendStringInfoString(&str, cbuf);
    6463                 :                 }
    6464 ECB             : 
    6465 GIC           6 :                 instr += 8;
    6466 CBC           6 :                 len -= 8;
    6467 EUB             :             }
    6468 GIC          15 :             else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
    6469 CBC           6 :             {
    6470 ECB             :                 pg_wchar    unicode;
    6471                 : 
    6472 GIC          12 :                 unicode = hexval_n(instr + 2, 8);
    6473 ECB             : 
    6474 CBC          12 :                 if (!is_valid_unicode_codepoint(unicode))
    6475 GIC           3 :                     ereport(ERROR,
    6476                 :                             errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    6477 ECB             :                             errmsg("invalid Unicode code point: %04X", unicode));
    6478                 : 
    6479 GIC           9 :                 if (pair_first)
    6480                 :                 {
    6481 CBC           3 :                     if (is_utf16_surrogate_second(unicode))
    6482                 :                     {
    6483 UIC           0 :                         unicode = surrogate_pair_to_codepoint(pair_first, unicode);
    6484               0 :                         pair_first = 0;
    6485                 :                     }
    6486                 :                     else
    6487 GIC           3 :                         goto invalid_pair;
    6488 ECB             :                 }
    6489 GBC           6 :                 else if (is_utf16_surrogate_second(unicode))
    6490 UIC           0 :                     goto invalid_pair;
    6491 ECB             : 
    6492 CBC           6 :                 if (is_utf16_surrogate_first(unicode))
    6493 GIC           3 :                     pair_first = unicode;
    6494                 :                 else
    6495                 :                 {
    6496               3 :                     pg_unicode_to_server(unicode, (unsigned char *) cbuf);
    6497 CBC           3 :                     appendStringInfoString(&str, cbuf);
    6498 ECB             :                 }
    6499                 : 
    6500 CBC           6 :                 instr += 10;
    6501               6 :                 len -= 10;
    6502                 :             }
    6503 ECB             :             else
    6504 GIC           3 :                 ereport(ERROR,
    6505 ECB             :                         (errcode(ERRCODE_SYNTAX_ERROR),
    6506                 :                          errmsg("invalid Unicode escape"),
    6507                 :                          errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
    6508                 :         }
    6509                 :         else
    6510                 :         {
    6511 GIC         192 :             if (pair_first)
    6512 UIC           0 :                 goto invalid_pair;
    6513                 : 
    6514 GIC         192 :             appendStringInfoChar(&str, *instr++);
    6515             192 :             len--;
    6516                 :         }
    6517                 :     }
    6518                 : 
    6519                 :     /* unfinished surrogate pair? */
    6520              12 :     if (pair_first)
    6521               3 :         goto invalid_pair;
    6522                 : 
    6523               9 :     result = cstring_to_text_with_len(str.data, str.len);
    6524               9 :     pfree(str.data);
    6525                 : 
    6526               9 :     PG_RETURN_TEXT_P(result);
    6527                 : 
    6528              15 : invalid_pair:
    6529              15 :     ereport(ERROR,
    6530                 :             (errcode(ERRCODE_SYNTAX_ERROR),
    6531                 :              errmsg("invalid Unicode surrogate pair")));
    6532                 :     PG_RETURN_NULL();           /* keep compiler quiet */
    6533                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a