LCOV - differential code coverage report
Current view: top level - src/backend/snowball - dict_snowball.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 70.6 % 68 48 20 48
Current Date: 2023-04-08 17:13:01 Functions: 100.0 % 6 6 6
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (180,240] days: 66.7 % 3 2 1 2
Legend: Lines: hit not hit (240..) days: 70.8 % 65 46 19 46
Function coverage date bins:
(240..) days: 100.0 % 6 6 6

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * dict_snowball.c
                                  4                 :  *      Snowball dictionary
                                  5                 :  *
                                  6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                  7                 :  *
                                  8                 :  * IDENTIFICATION
                                  9                 :  *    src/backend/snowball/dict_snowball.c
                                 10                 :  *
                                 11                 :  *-------------------------------------------------------------------------
                                 12                 :  */
                                 13                 : #include "postgres.h"
                                 14                 : 
                                 15                 : #include "commands/defrem.h"
                                 16                 : #include "tsearch/ts_locale.h"
                                 17                 : #include "tsearch/ts_utils.h"
                                 18                 : 
                                 19                 : /* Some platforms define MAXINT and/or MININT, causing conflicts */
                                 20                 : #ifdef MAXINT
                                 21                 : #undef MAXINT
                                 22                 : #endif
                                 23                 : #ifdef MININT
                                 24                 : #undef MININT
                                 25                 : #endif
                                 26                 : 
                                 27                 : /* Now we can include the original Snowball header.h */
                                 28                 : #include "snowball/libstemmer/header.h"
                                 29                 : #include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
                                 30                 : #include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
                                 31                 : #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
                                 32                 : #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
                                 33                 : #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
                                 34                 : #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
                                 35                 : #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
                                 36                 : #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
                                 37                 : #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
                                 38                 : #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
                                 39                 : #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
                                 40                 : #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
                                 41                 : #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
                                 42                 : #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
                                 43                 : #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
                                 44                 : #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
                                 45                 : #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
                                 46                 : #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
                                 47                 : #include "snowball/libstemmer/stem_KOI8_R_russian.h"
                                 48                 : #include "snowball/libstemmer/stem_UTF_8_arabic.h"
                                 49                 : #include "snowball/libstemmer/stem_UTF_8_armenian.h"
                                 50                 : #include "snowball/libstemmer/stem_UTF_8_basque.h"
                                 51                 : #include "snowball/libstemmer/stem_UTF_8_catalan.h"
                                 52                 : #include "snowball/libstemmer/stem_UTF_8_danish.h"
                                 53                 : #include "snowball/libstemmer/stem_UTF_8_dutch.h"
                                 54                 : #include "snowball/libstemmer/stem_UTF_8_english.h"
                                 55                 : #include "snowball/libstemmer/stem_UTF_8_finnish.h"
                                 56                 : #include "snowball/libstemmer/stem_UTF_8_french.h"
                                 57                 : #include "snowball/libstemmer/stem_UTF_8_german.h"
                                 58                 : #include "snowball/libstemmer/stem_UTF_8_greek.h"
                                 59                 : #include "snowball/libstemmer/stem_UTF_8_hindi.h"
                                 60                 : #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
                                 61                 : #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
                                 62                 : #include "snowball/libstemmer/stem_UTF_8_irish.h"
                                 63                 : #include "snowball/libstemmer/stem_UTF_8_italian.h"
                                 64                 : #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
                                 65                 : #include "snowball/libstemmer/stem_UTF_8_nepali.h"
                                 66                 : #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
                                 67                 : #include "snowball/libstemmer/stem_UTF_8_porter.h"
                                 68                 : #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
                                 69                 : #include "snowball/libstemmer/stem_UTF_8_romanian.h"
                                 70                 : #include "snowball/libstemmer/stem_UTF_8_russian.h"
                                 71                 : #include "snowball/libstemmer/stem_UTF_8_serbian.h"
                                 72                 : #include "snowball/libstemmer/stem_UTF_8_spanish.h"
                                 73                 : #include "snowball/libstemmer/stem_UTF_8_swedish.h"
                                 74                 : #include "snowball/libstemmer/stem_UTF_8_tamil.h"
                                 75                 : #include "snowball/libstemmer/stem_UTF_8_turkish.h"
                                 76                 : #include "snowball/libstemmer/stem_UTF_8_yiddish.h"
                                 77                 : 
 5710 tgl                        78 CBC         319 : PG_MODULE_MAGIC;    /* NOTE(review): PostgreSQL loadable-module magic block; expected once per shared library - confirm against the fmgr documentation */
                                 79                 : 
                                 80             319 : PG_FUNCTION_INFO_V1(dsnowball_init);    /* declares call-convention info for dsnowball_init(), defined later in this file */
                                 81                 : 
                                 82             319 : PG_FUNCTION_INFO_V1(dsnowball_lexize);    /* same for dsnowball_lexize(), defined later in this file */
                                 83                 : 
                                 84                 : /* List of supported modules: each entry describes one stemmer (language name, encoding, Snowball entry points) */
                                 85                 : typedef struct stemmer_module
                                 86                 : {
                                 87                 :     const char *name;    /* language name; compared with pg_strcasecmp() in locate_stem_module() */
                                 88                 :     pg_enc      enc;    /* encoding this stemmer variant is registered for */
                                 89                 :     struct SN_env *(*create) (void);    /* returns the stemmer environment stored in DictSnowball.z */
                                 90                 :     void        (*close) (struct SN_env *);    /* counterpart of create; not called in the code visible here */
                                 91                 :     int         (*stem) (struct SN_env *);    /* stems the word previously loaded with SN_set_current() */
                                 92                 : } stemmer_module;
                                 93                 : 
                                 94                 : /* Args: stemmer name (also stringized into .name), PG code for encoding, Snowball's name for encoding; name and senc are pasted together to form the create_env, close_env and stem symbol names */
                                 95                 : #define STEMMER_MODULE(name,enc,senc) \
                                 96                 :     {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
                                 97                 : 
                                 98                 : static const stemmer_module stemmer_modules[] =    /* scanned front to back by locate_stem_module(); ends at the entry whose name is NULL */
                                 99                 : {
                                100                 :     /*
                                101                 :      * Stemmers list from Snowball distribution: single-byte variants (LATIN1, LATIN2, KOI8R) first, then the UTF8 variants
                                102                 :      */
                                103                 :     STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
                                104                 :     STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
                                105                 :     STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
                                106                 :     STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
                                107                 :     STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
                                108                 :     STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
                                109                 :     STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
                                110                 :     STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
                                111                 :     STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
                                112                 :     STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
                                113                 :     STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
                                114                 :     STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
                                115                 :     STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
                                116                 :     STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
                                117                 :     STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
                                118                 :     STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
                                119                 :     STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
                                120                 :     STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
                                121                 :     STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
                                122                 :     STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
                                123                 :     STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
                                124                 :     STEMMER_MODULE(basque, PG_UTF8, UTF_8),
                                125                 :     STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
                                126                 :     STEMMER_MODULE(danish, PG_UTF8, UTF_8),
                                127                 :     STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
                                128                 :     STEMMER_MODULE(english, PG_UTF8, UTF_8),
                                129                 :     STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
                                130                 :     STEMMER_MODULE(french, PG_UTF8, UTF_8),
                                131                 :     STEMMER_MODULE(german, PG_UTF8, UTF_8),
                                132                 :     STEMMER_MODULE(greek, PG_UTF8, UTF_8),
                                133                 :     STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
                                134                 :     STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
                                135                 :     STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
                                136                 :     STEMMER_MODULE(irish, PG_UTF8, UTF_8),
                                137                 :     STEMMER_MODULE(italian, PG_UTF8, UTF_8),
                                138                 :     STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
                                139                 :     STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
                                140                 :     STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
                                141                 :     STEMMER_MODULE(porter, PG_UTF8, UTF_8),
                                142                 :     STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
                                143                 :     STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
                                144                 :     STEMMER_MODULE(russian, PG_UTF8, UTF_8),
                                145                 :     STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
                                146                 :     STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
                                147                 :     STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
                                148                 :     STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
                                149                 :     STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
                                150                 :     STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
                                151                 : 
                                152                 :     /*
                                153                 :      * Stemmer with PG_SQL_ASCII encoding should be valid for any server
                                154                 :      * encoding (locate_stem_module() accepts such an entry without comparing it to the database encoding)
                                155                 :      */
                                156                 :     STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
                                157                 : 
                                158                 :     {NULL, 0, NULL, NULL, NULL} /* list end marker */
                                159                 : };
                                160                 : 
                                161                 : 
                                162                 : typedef struct DictSnowball    /* per-dictionary state: built by dsnowball_init(), read by dsnowball_lexize() */
                                163                 : {
                                164                 :     struct SN_env *z;    /* stemmer environment obtained from the selected module's create() */
                                165                 :     StopList    stoplist;    /* filled by readstoplist() when a "stopwords" option is given */
                                166                 :     bool        needrecode;     /* text must be recoded to UTF8 before, and back after, calling stem */
                                167                 :     int         (*stem) (struct SN_env *z);    /* selected stem function; dsnowball_init() treats NULL as "no language chosen yet" */
                                168                 : 
                                169                 :     /*
                                170                 :      * snowball saves allocated memory between calls, so we should run it in our
                                171                 :      * private memory context. Note, init function is executed in long lived
                                172                 :      * context, so we just remember CurrentMemoryContext
                                173                 :      */
                                174                 :     MemoryContext dictCtx;
                                175                 : } DictSnowball;
                                176                 : 
                                177                 : 
                                178                 : static void
 1986 peter_e                   179              19 : locate_stem_module(DictSnowball *d, const char *lang)    /* choose the stemmer for lang; sets d->stem, d->z and d->needrecode, or reports an error if none fits */
                                180                 : {
                                181                 :     const stemmer_module *m;
                                182                 : 
                                183                 :     /*
                                184                 :      * First, try to find exact match of stemmer module. Stemmer with
                                185                 :      * PG_SQL_ASCII encoding is treated as working with any server encoding.
                                186                 :      * A match found in this pass needs no recoding.  */
 5710 tgl                       187             494 :     for (m = stemmer_modules; m->name; m++)
                                188                 :     {
                                189             627 :         if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
                                190             133 :             pg_strcasecmp(m->name, lang) == 0)
                                191                 :         {
                                192              19 :             d->stem = m->stem;
                                193              19 :             d->z = m->create();
                                194              19 :             d->needrecode = false;
                                195              19 :             return;
                                196                 :         }
                                197                 :     }
                                198                 : 
                                199                 :     /*
                                200                 :      * Second, fall back to the UTF8 stemmer for the requested language; needrecode is set so that dsnowball_lexize() converts text to UTF8 and back.
                                201                 :      */
 5710 tgl                       202 UBC           0 :     for (m = stemmer_modules; m->name; m++)
                                203                 :     {
                                204               0 :         if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
                                205                 :         {
                                206               0 :             d->stem = m->stem;
                                207               0 :             d->z = m->create();
                                208               0 :             d->needrecode = true;
                                209               0 :             return;
                                210                 :         }
                                211                 :     }
                                212                 : 
                                213               0 :     ereport(ERROR,    /* reached only when neither pass found an entry for lang */
                                214                 :             (errcode(ERRCODE_UNDEFINED_OBJECT),
                                215                 :              errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
                                216                 :                     lang, GetDatabaseEncodingName())));
                                217                 : }
                                218                 : 
                                219                 : Datum
 5710 tgl                       220 CBC          19 : dsnowball_init(PG_FUNCTION_ARGS)    /* build a DictSnowball from the option list in argument 0; accepts "stopwords" and "language", each at most once, and requires "language" */
                                221                 : {
 5709                           222              19 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
                                223                 :     DictSnowball *d;
 5710                           224              19 :     bool        stoploaded = false;
                                225                 :     ListCell   *l;
                                226                 : 
                                227              19 :     d = (DictSnowball *) palloc0(sizeof(DictSnowball));    /* NOTE(review): relies on palloc0 zero-filling so that d->stem starts out NULL - confirm */
                                228                 : 
 5709                           229              57 :     foreach(l, dictoptions)
                                230                 :     {
                                231              38 :         DefElem    *defel = (DefElem *) lfirst(l);
                                232                 : 
 1899                           233              38 :         if (strcmp(defel->defname, "stopwords") == 0)    /* option names are compared case-sensitively */
                                234                 :         {
 5710                           235              19 :             if (stoploaded)
 5710 tgl                       236 UBC           0 :                 ereport(ERROR,
                                237                 :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                238                 :                          errmsg("multiple StopWords parameters")));
 5706 tgl                       239 CBC          19 :             readstoplist(defGetString(defel), &d->stoplist, lowerstr);    /* presumably loads the named stop-word list, passing each entry through lowerstr - verify in ts_utils */
 5710                           240              19 :             stoploaded = true;
                                241                 :         }
 1899                           242              19 :         else if (strcmp(defel->defname, "language") == 0)
                                243                 :         {
 5710                           244              19 :             if (d->stem)    /* a non-NULL stem means a language was already processed */
 5710 tgl                       245 UBC           0 :                 ereport(ERROR,
                                246                 :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                247                 :                          errmsg("multiple Language parameters")));
 5709 tgl                       248 CBC          19 :             locate_stem_module(d, defGetString(defel));
                                249                 :         }
                                250                 :         else
                                251                 :         {
 5710 tgl                       252 UBC           0 :             ereport(ERROR,
                                253                 :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                254                 :                      errmsg("unrecognized Snowball parameter: \"%s\"",
                                255                 :                             defel->defname)));
                                256                 :         }
                                257                 :     }
                                258                 : 
 5710 tgl                       259 CBC          19 :     if (!d->stem)    /* no "language" option was supplied */
 5710 tgl                       260 UBC           0 :         ereport(ERROR,
                                261                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                262                 :                  errmsg("missing Language parameter")));
                                263                 : 
 5710 tgl                       264 CBC          19 :     d->dictCtx = CurrentMemoryContext;    /* remembered so dsnowball_lexize() can run the stemmer in this context */
                                265                 : 
                                266              19 :     PG_RETURN_POINTER(d);
                                267                 : }
                                268                 : 
                                269                 : Datum
                                270            5135 : dsnowball_lexize(PG_FUNCTION_ARGS)    /* args: 0 = DictSnowball, 1 = input text, 2 = its length; returns a two-element TSLexeme array whose first lexeme is the stemmed word, or is left unset for empty input and stopwords */
                                271                 : {
                                272            5135 :     DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
                                273            5135 :     char       *in = (char *) PG_GETARG_POINTER(1);
 5624 bruce                     274            5135 :     int32       len = PG_GETARG_INT32(2);
 5710 tgl                       275            5135 :     char       *txt = lowerstr_with_len(in, len);    /* working copy; every branch below either frees it or hands it to res */
                                276            5135 :     TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);    /* only res[0] is ever filled; res[1] is left as allocated, presumably as the array terminator - confirm */
                                277                 : 
                                278                 :     /*
                                279                 :      * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
                                280                 :      * surely not words in any human language.  This restriction avoids
                                281                 :      * wasting cycles on stuff like base64-encoded data, and it protects us
                                282                 :      * against possible inefficiency or misbehavior in the stemmer.  (For
                                283                 :      * example, the Turkish stemmer has an indefinite recursion, so it can
                                284                 :      * crash on long-enough strings.)  However, Snowball dictionaries are
                                285                 :      * defined to recognize all strings, so we can't reject the string as an
                                286                 :      * unknown word.
                                287                 :      */
  221                           288            5135 :     if (len > 1000)
                                289                 :     {
                                290                 :         /* return the lexeme lowercased, but otherwise unmodified */
  221 tgl                       291 UBC           0 :         res->lexeme = txt;
                                292                 :     }
  221 tgl                       293 CBC        5135 :     else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
                                294                 :     {
                                295                 :         /* empty or stopword, so report as stopword */
 5710                           296            1734 :         pfree(txt);    /* res->lexeme is not assigned on this path */
                                297                 :     }
                                298                 :     else
                                299                 :     {
                                300                 :         MemoryContext saveCtx;
                                301                 : 
                                302                 :         /*
                                303                 :          * recode to utf8 if stemmer is utf8 and doesn't match server encoding
                                304                 :          */
                                305            3401 :         if (d->needrecode)
                                306                 :         {
                                307                 :             char       *recoded;
                                308                 : 
 3332 tgl                       309 UBC           0 :             recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
 5710                           310               0 :             if (recoded != txt)    /* a different pointer means a new buffer was returned, so the old one is released */
                                311                 :             {
                                312               0 :                 pfree(txt);
                                313               0 :                 txt = recoded;
                                314                 :             }
                                315                 :         }
                                316                 : 
                                317                 :         /* see comment about d->dictCtx */
 5710 tgl                       318 CBC        3401 :         saveCtx = MemoryContextSwitchTo(d->dictCtx);
                                319            3401 :         SN_set_current(d->z, strlen(txt), (symbol *) txt);    /* load the word into the stemmer environment */
                                320            3401 :         d->stem(d->z);    /* return value is ignored */
                                321            3401 :         MemoryContextSwitchTo(saveCtx);
                                322                 : 
                                323            3401 :         if (d->z->p && d->z->l)    /* copy back only a non-empty result; otherwise txt keeps the lowercased input */
                                324                 :         {
                                325            3401 :             txt = repalloc(txt, d->z->l + 1);
                                326            3401 :             memcpy(txt, d->z->p, d->z->l);
                                327            3401 :             txt[d->z->l] = '\0';    /* terminator is added here; only l bytes are taken from the stemmer buffer */
                                328                 :         }
                                329                 : 
                                330                 :         /* recode back to the server encoding if needed */
                                331            3401 :         if (d->needrecode)
                                332                 :         {
                                333                 :             char       *recoded;
                                334                 : 
 3332 tgl                       335 UBC           0 :             recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
 5710                           336               0 :             if (recoded != txt)
                                337                 :             {
                                338               0 :                 pfree(txt);
                                339               0 :                 txt = recoded;
                                340                 :             }
                                341                 :         }
                                342                 : 
 5710 tgl                       343 CBC        3401 :         res->lexeme = txt;
                                344                 :     }
                                345                 : 
                                346            5135 :     PG_RETURN_POINTER(res);
                                347                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a