LCOV - differential code coverage report
Current view: top level - contrib/unaccent - unaccent.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 85.1 % 154 131 23 131
Current Date: 2023-04-08 17:13:01 Functions: 100.0 % 10 10 10
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (240..) days: 85.1 % 154 131 23 131
Legend: Lines: hit not hit Function coverage date bins:
(240..) days: 100.0 % 10 10 10

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * unaccent.c
                                  4                 :  *    Text search unaccent dictionary
                                  5                 :  *
                                  6                 :  * Copyright (c) 2009-2023, PostgreSQL Global Development Group
                                  7                 :  *
                                  8                 :  * IDENTIFICATION
                                  9                 :  *    contrib/unaccent/unaccent.c
                                 10                 :  *
                                 11                 :  *-------------------------------------------------------------------------
                                 12                 :  */
                                 13                 : 
                                 14                 : #include "postgres.h"
                                 15                 : 
                                 16                 : #include "catalog/namespace.h"
                                 17                 : #include "catalog/pg_ts_dict.h"
                                 18                 : #include "commands/defrem.h"
                                 19                 : #include "lib/stringinfo.h"
                                 20                 : #include "tsearch/ts_cache.h"
                                 21                 : #include "tsearch/ts_locale.h"
                                 22                 : #include "tsearch/ts_public.h"
                                 23                 : #include "utils/builtins.h"
                                 24                 : #include "utils/lsyscache.h"
                                 25                 : #include "utils/regproc.h"
                                 26                 : #include "utils/syscache.h"
                                 27                 : 
 4982 teodor                     28 CBC           1 : PG_MODULE_MAGIC;
                                 29                 : 
                                 30                 : /*
                                 31                 :  * An unaccent dictionary uses a trie to find a string to replace.  Each node
                                 32                 :  * of the trie is an array of 256 TrieChar structs; the N-th element of the
                                 33                 :  * array corresponds to next byte value N.  That element can contain both a
                                 34                 :  * replacement string (to be used if the source string ends with this byte)
                                 35                 :  * and a link to another trie node (to be followed if there are more bytes).
                                 36                 :  *
                                 37                 :  * Note that the trie search logic pays no attention to multibyte character
                                 38                 :  * boundaries.  This is OK as long as both the data entered into the trie and
                                 39                 :  * the data we're trying to look up are validly encoded; no partial-character
                                 40                 :  * matches will occur.
                                 41                 :  */
                                 42                 : typedef struct TrieChar
                                 43                 : {
                                 44                 :     struct TrieChar *nextChar;
                                 45                 :     char       *replaceTo;
                                 46                 :     int         replacelen;
                                 47                 : } TrieChar;
                                 48                 : 
                                 49                 : /*
                                 50                 :  * placeChar - put str into trie's structure, byte by byte.
                                 51                 :  *
                                 52                 :  * If node is NULL, we need to make a new node, which will be returned;
                                 53                 :  * otherwise the return value is the same as node.
                                 54                 :  */
                                 55                 : static TrieChar *
 3205 tgl                        56            8894 : placeChar(TrieChar *node, const unsigned char *str, int lenstr,
                                 57                 :           const char *replaceTo, int replacelen)
                                 58                 : {
                                 59                 :     TrieChar   *curnode;
                                 60                 : 
 4790 bruce                      61            8894 :     if (!node)
 3205 tgl                        62             126 :         node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
                                 63                 : 
                                 64            8894 :     Assert(lenstr > 0);          /* else str[0] doesn't exist */
                                 65                 : 
 4982 teodor                     66            8894 :     curnode = node + *str;
                                 67                 : 
 3205 tgl                        68            8894 :     if (lenstr <= 1)
                                 69                 :     {
 4790 bruce                      70            3300 :         if (curnode->replaceTo)
 3205 tgl                        71 UBC           0 :             ereport(WARNING,
                                 72                 :                     (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                 73                 :                      errmsg("duplicate source strings, first one will be used")));
                                 74                 :         else
                                 75                 :         {
 4982 teodor                     76 CBC        3300 :             curnode->replacelen = replacelen;
 3205 tgl                        77            3300 :             curnode->replaceTo = (char *) palloc(replacelen);
 4982 teodor                     78            3300 :             memcpy(curnode->replaceTo, replaceTo, replacelen);
                                 79                 :         }
                                 80                 :     }
                                 81                 :     else
                                 82                 :     {
 3205 tgl                        83            5594 :         curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
                                 84                 :                                       replaceTo, replacelen);
                                 85                 :     }
                                 86                 : 
 4982 teodor                     87            8894 :     return node;
                                 88                 : }
                                 89                 : 
                                 90                 : /*
                                 91                 :  * initTrie  - create trie from file.
                                 92                 :  *
                                 93                 :  * Function converts UTF8-encoded file into current encoding.
                                 94                 :  */
                                 95                 : static TrieChar *
 1986 peter_e                    96               2 : initTrie(const char *filename)
                                 97                 : {
 3602 bruce                      98               2 :     TrieChar   *volatile rootTrie = NULL;
 4982 teodor                     99               2 :     MemoryContext ccxt = CurrentMemoryContext;
                                100                 :     tsearch_readline_state trst;
                                101                 :     volatile bool skip;
                                102                 : 
                                103               2 :     filename = get_tsearch_config_filename(filename, "rules");
                                104               2 :     if (!tsearch_readline_begin(&trst, filename))
 4982 teodor                    105 UBC           0 :         ereport(ERROR,
                                106                 :                 (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                107                 :                  errmsg("could not open unaccent file \"%s\": %m",
                                108                 :                         filename)));
                                109                 : 
                                110                 :     do
                                111                 :     {
                                112                 :         /*
                                113                 :          * pg_do_encoding_conversion() (called by tsearch_readline()) will
                                114                 :          * emit exception if it finds untranslatable characters in current
                                115                 :          * locale. We just skip such lines, continuing with the next.
                                116                 :          */
 4982 teodor                    117 CBC           2 :         skip = true;
                                118                 : 
                                119               2 :         PG_TRY();
                                120                 :         {
                                121                 :             char       *line;
                                122                 : 
                                123            3302 :             while ((line = tsearch_readline(&trst)) != NULL)
                                124                 :             {
                                125                 :                 /*----------
                                126                 :                  * The format of each line must be "src" or "src trg", where
                                127                 :                  * src and trg are sequences of one or more non-whitespace
                                128                 :                  * characters, separated by whitespace.  Whitespace at start
                                129                 :                  * or end of line is ignored.  If trg is omitted, an empty
                                130                 :                  * string is used as the replacement.
                                131                 :                  *
                                132                 :                  * We use a simple state machine, with states
                                133                 :                  *  0   initial (before src)
                                134                 :                  *  1   in src
                                135                 :                  *  2   in whitespace after src
                                136                 :                  *  3   in trg
                                137                 :                  *  4   in whitespace after trg
                                138                 :                  *  -1  syntax error detected
                                139                 :                  *----------
                                140                 :                  */
                                141                 :                 int         state;
                                142                 :                 char       *ptr;
 4171 tgl                       143            3300 :                 char       *src = NULL;
                                144            3300 :                 char       *trg = NULL;
                                145                 :                 int         ptrlen;
                                146            3300 :                 int         srclen = 0;
                                147            3300 :                 int         trglen = 0;
                                148                 : 
                                149            3300 :                 state = 0;
                                150           17134 :                 for (ptr = line; *ptr; ptr += ptrlen)
                                151                 :                 {
                                152           13834 :                     ptrlen = pg_mblen(ptr);
                                153                 :                     /* ignore whitespace, but end src or trg */
                                154           13834 :                     if (t_isspace(ptr))
                                155                 :                     {
                                156            6428 :                         if (state == 1)
                                157            3300 :                             state = 2;
                                158            3128 :                         else if (state == 3)
                                159            3088 :                             state = 4;
                                160            6428 :                         continue;
                                161                 :                     }
                                162            7406 :                     switch (state)
                                163                 :                     {
                                164            3300 :                         case 0:
                                165                 :                             /* start of src */
                                166            3300 :                             src = ptr;
                                167            3300 :                             srclen = ptrlen;
                                168            3300 :                             state = 1;
                                169            3300 :                             break;
 4171 tgl                       170 UBC           0 :                         case 1:
                                171                 :                             /* continue src */
                                172               0 :                             srclen += ptrlen;
                                173               0 :                             break;
 4171 tgl                       174 CBC        3088 :                         case 2:
                                175                 :                             /* start of trg */
                                176            3088 :                             trg = ptr;
                                177            3088 :                             trglen = ptrlen;
                                178            3088 :                             state = 3;
                                179            3088 :                             break;
                                180            1018 :                         case 3:
                                181                 :                             /* continue trg */
                                182            1018 :                             trglen += ptrlen;
                                183            1018 :                             break;
 4171 tgl                       184 UBC           0 :                         default:
                                185                 :                             /* bogus line format */
                                186               0 :                             state = -1;
                                187               0 :                             break;
                                188                 :                     }
                                189                 :                 }
                                190                 : 
 3205 tgl                       191 CBC        3300 :                 if (state == 1 || state == 2)
                                192                 :                 {
                                193                 :                     /* trg was omitted, so use "" */
                                194             212 :                     trg = "";
                                195             212 :                     trglen = 0;
                                196                 :                 }
                                197                 : 
                                198            3300 :                 if (state > 0)
 3623 heikki.linnakangas        199            3300 :                     rootTrie = placeChar(rootTrie,
                                200                 :                                          (unsigned char *) src, srclen,
                                201                 :                                          trg, trglen);
 3205 tgl                       202 UBC           0 :                 else if (state < 0)
                                203               0 :                     ereport(WARNING,
                                204                 :                             (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                205                 :                              errmsg("invalid syntax: more than two strings in unaccent rule")));
                                206                 : 
 4982 teodor                    207 CBC        3300 :                 pfree(line);
                                208                 :             }
 4171 tgl                       209               2 :             skip = false;
                                210                 :         }
 4982 teodor                    211 UBC           0 :         PG_CATCH();
                                212                 :         {
                                213                 :             ErrorData  *errdata;
                                214                 :             MemoryContext ecxt;
                                215                 : 
                                216               0 :             ecxt = MemoryContextSwitchTo(ccxt);
                                217               0 :             errdata = CopyErrorData();
                                218               0 :             if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
                                219                 :             {
                                220               0 :                 FlushErrorState();
                                221                 :             }
                                222                 :             else
                                223                 :             {
                                224               0 :                 MemoryContextSwitchTo(ecxt);
                                225               0 :                 PG_RE_THROW();
                                226                 :             }
                                227                 :         }
 4982 teodor                    228 CBC           2 :         PG_END_TRY();
                                229                 :     }
 4790 bruce                     230               2 :     while (skip);
                                231                 : 
 4982 teodor                    232               2 :     tsearch_readline_end(&trst);
                                233                 : 
 3623 heikki.linnakangas        234               2 :     return rootTrie;
                                235                 : }
                                236                 : 
                                237                 : /*
                                238                 :  * findReplaceTo - find longest possible match in trie
                                239                 :  *
                                240                 :  * On success, returns pointer to ending subnode, plus length of matched
                                241                 :  * source string in *p_matchlen.  On failure, returns NULL.
                                242                 :  */
                                243                 : static TrieChar *
 3205 tgl                       244              70 : findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
                                245                 :               int *p_matchlen)
                                246                 : {
                                247              70 :     TrieChar   *result = NULL;
                                248              70 :     int         matchlen = 0;
                                249                 : 
                                250              70 :     *p_matchlen = 0;            /* prevent uninitialized-variable warnings */
                                251                 : 
                                252             199 :     while (node && matchlen < srclen)
                                253                 :     {
                                254             129 :         node = node + src[matchlen];
                                255             129 :         matchlen++;
                                256                 : 
                                257             129 :         if (node->replaceTo)
                                258                 :         {
                                259              31 :             result = node;
                                260              31 :             *p_matchlen = matchlen;
                                261                 :         }
                                262                 : 
 4982 teodor                    263             129 :         node = node->nextChar;
                                264                 :     }
                                265                 : 
 3205 tgl                       266              70 :     return result;
                                267                 : }
                                268                 : 
 4982 teodor                    269               2 : PG_FUNCTION_INFO_V1(unaccent_init);
                                270                 : Datum
                                271               2 : unaccent_init(PG_FUNCTION_ARGS)
                                272                 : {
 4790 bruce                     273               2 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
 3623 heikki.linnakangas        274               2 :     TrieChar   *rootTrie = NULL;
 4790 bruce                     275               2 :     bool        fileloaded = false;
                                276                 :     ListCell   *l;
                                277                 : 
 4982 teodor                    278               4 :     foreach(l, dictoptions)
                                279                 :     {
                                280               2 :         DefElem    *defel = (DefElem *) lfirst(l);
                                281                 : 
 1899 tgl                       282               2 :         if (strcmp(defel->defname, "rules") == 0)
                                283                 :         {
 4982 teodor                    284               2 :             if (fileloaded)
 4982 teodor                    285 UBC           0 :                 ereport(ERROR,
                                286                 :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                287                 :                          errmsg("multiple Rules parameters")));
 3623 heikki.linnakangas        288 CBC           2 :             rootTrie = initTrie(defGetString(defel));
 4790 bruce                     289               2 :             fileloaded = true;
                                290                 :         }
                                291                 :         else
                                292                 :         {
 4982 teodor                    293 UBC           0 :             ereport(ERROR,
                                294                 :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                295                 :                      errmsg("unrecognized Unaccent parameter: \"%s\"",
                                296                 :                             defel->defname)));
                                297                 :         }
                                298                 :     }
                                299                 : 
 4982 teodor                    300 CBC           2 :     if (!fileloaded)
                                301                 :     {
 4982 teodor                    302 UBC           0 :         ereport(ERROR,
                                303                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                304                 :                  errmsg("missing Rules parameter")));
                                305                 :     }
                                306                 : 
 3623 heikki.linnakangas        307 CBC           2 :     PG_RETURN_POINTER(rootTrie);
                                308                 : }
                                309                 : 
 4982 teodor                    310               2 : PG_FUNCTION_INFO_V1(unaccent_lexize);
                                311                 : Datum
                                312              22 : unaccent_lexize(PG_FUNCTION_ARGS)
                                313                 : {
 3623 heikki.linnakangas        314              22 :     TrieChar   *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
 4790 bruce                     315              22 :     char       *srcchar = (char *) PG_GETARG_POINTER(1);
 4982 teodor                    316              22 :     int32       len = PG_GETARG_INT32(2);
 3204 tgl                       317              22 :     char       *srcstart = srcchar;
                                318                 :     TSLexeme   *res;
                                319                 :     StringInfoData buf;
                                320                 : 
                                321                 :     /* we allocate storage for the buffer only if needed */
                                322              22 :     buf.data = NULL;
                                323                 : 
 3205                           324              92 :     while (len > 0)
                                325                 :     {
                                326                 :         TrieChar   *node;
                                327                 :         int         matchlen;
                                328                 : 
                                329              70 :         node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
                                330                 :                              &matchlen);
 4790 bruce                     331              70 :         if (node && node->replaceTo)
                                332                 :         {
 3204 tgl                       333              31 :             if (buf.data == NULL)
                                334                 :             {
                                335                 :                 /* initialize buffer */
                                336              19 :                 initStringInfo(&buf);
                                337                 :                 /* insert any data we already skipped over */
 4790 bruce                     338              19 :                 if (srcchar != srcstart)
 3204 tgl                       339               3 :                     appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
                                340                 :             }
                                341              31 :             appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
                                342                 :         }
                                343                 :         else
                                344                 :         {
 3205                           345              39 :             matchlen = pg_mblen(srcchar);
 3204                           346              39 :             if (buf.data != NULL)
                                347              18 :                 appendBinaryStringInfo(&buf, srcchar, matchlen);
                                348                 :         }
                                349                 : 
 3205                           350              70 :         srcchar += matchlen;
                                351              70 :         len -= matchlen;
                                352                 :     }
                                353                 : 
                                354                 :     /* return a result only if we made at least one substitution */
 3204                           355              22 :     if (buf.data != NULL)
                                356                 :     {
                                357              19 :         res = (TSLexeme *) palloc0(sizeof(TSLexeme) * 2);
                                358              19 :         res->lexeme = buf.data;
                                359              19 :         res->flags = TSL_FILTER;
                                360                 :     }
                                361                 :     else
                                362               3 :         res = NULL;
                                363                 : 
 4982 teodor                    364              22 :     PG_RETURN_POINTER(res);
                                365                 : }
                                366                 : 
                                367                 : /*
                                368                 :  * Function-like wrapper for dictionary
                                                    :  *
                                                    :  * SQL-callable entry point.  When called with one argument, that argument
                                                    :  * is the text to process and the dictionary named "unaccent" in this
                                                    :  * function's own schema is used.  Otherwise argument 0 is the OID of the
                                                    :  * dictionary to use and argument 1 is the text.
                                                    :  *
                                                    :  * The dictionary's lexize routine is invoked on the text's bytes.  If it
                                                    :  * yields no result, or a result whose lexeme is NULL, a copy of the input
                                                    :  * text is returned; otherwise the produced lexeme is returned as text.
                                                    :  *
                                                    :  * Reports ERRCODE_UNDEFINED_OBJECT if the implicitly selected "unaccent"
                                                    :  * dictionary does not exist in the function's schema.
                                369                 :  */
                                370               4 : PG_FUNCTION_INFO_V1(unaccent_dict);
                                371                 : Datum
                                372              15 : unaccent_dict(PG_FUNCTION_ARGS)
                                373                 : {
                                374                 :     text       *str;
                                375                 :     int         strArg;
                                376                 :     Oid         dictOid;
                                377                 :     TSDictionaryCacheEntry *dict;
                                378                 :     TSLexeme   *res;
                                379                 : 
                                380              15 :     if (PG_NARGS() == 1)
                                381                 :     {
                                382                 :         /*
                                383                 :          * Use the "unaccent" dictionary that is in the same schema that this
                                384                 :          * function is in.
                                385                 :          */
  1676 tgl                       386               8 :         Oid         procnspid = get_func_namespace(fcinfo->flinfo->fn_oid);
                                387               8 :         const char *dictname = "unaccent";
                                388                 : 
  1601 andres                    389               8 :         dictOid = GetSysCacheOid2(TSDICTNAMENSP, Anum_pg_ts_dict_oid,
                                390                 :                                   PointerGetDatum(dictname),
                                391                 :                                   ObjectIdGetDatum(procnspid));
  1676 tgl                       392               8 :         if (!OidIsValid(dictOid))
  1676 tgl                       393 UBC           0 :             ereport(ERROR,
                                394                 :                     (errcode(ERRCODE_UNDEFINED_OBJECT),
                                395                 :                      errmsg("text search dictionary \"%s.%s\" does not exist",
                                396                 :                             get_namespace_name(procnspid), dictname)));
  4982 teodor                    397 CBC           8 :         strArg = 0;    /* the only argument is the input text */
                                398                 :     }
                                399                 :     else
                                400                 :     {
                                401               7 :         dictOid = PG_GETARG_OID(0);    /* caller-specified dictionary */
                                402               7 :         strArg = 1;    /* input text follows the dictionary OID */
                                403                 :     }
  2219 noah                      404              15 :     str = PG_GETARG_TEXT_PP(strArg);
                                405                 : 
  4982 teodor                    406              15 :     dict = lookup_ts_dictionary_cache(dictOid);
                                407                 : 
                                408              15 :     res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
                                409                 :                                                      PointerGetDatum(dict->dictData),
                                410                 :                                                      PointerGetDatum(VARDATA_ANY(str)),
                                411                 :                                                      Int32GetDatum(VARSIZE_ANY_EXHDR(str)),
                                412                 :                                                      PointerGetDatum(NULL)));
                                413                 : 
                                414              15 :     PG_FREE_IF_COPY(str, strArg);    /* str is not used below; the argument is re-fetched when needed */
                                415                 : 
  4790 bruce                     416              15 :     if (res == NULL)
                                417                 :     {
  4982 teodor                    418               2 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));    /* no result: return a copy of the input */
                                419                 :     }
  4790 bruce                     420              13 :     else if (res->lexeme == NULL)
                                421                 :     {
  4982 teodor                    422 UBC           0 :         pfree(res);
                                423               0 :         PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));    /* result without a lexeme: return a copy of the input */
                                424                 :     }
                                425                 :     else
                                426                 :     {
  4790 bruce                     427 CBC          13 :         text       *txt = cstring_to_text(res->lexeme);    /* convert before the lexeme is freed */
                                428                 : 
  4982 teodor                    429              13 :         pfree(res->lexeme);
                                430              13 :         pfree(res);
                                431                 : 
                                432              13 :         PG_RETURN_TEXT_P(txt);
                                433                 :     }
                                434                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a