LCOV - differential code coverage report
Current view: top level - src/backend/snowball - dict_snowball.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 70.6 % 68 48 20 48
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 6 6 6
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * dict_snowball.c
       4                 :  *      Snowball dictionary
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  *
       8                 :  * IDENTIFICATION
       9                 :  *    src/backend/snowball/dict_snowball.c
      10                 :  *
      11                 :  *-------------------------------------------------------------------------
      12                 :  */
      13                 : #include "postgres.h"
      14                 : 
      15                 : #include "commands/defrem.h"
      16                 : #include "tsearch/ts_locale.h"
      17                 : #include "tsearch/ts_utils.h"
      18                 : 
      19                 : /* Some platforms define MAXINT and/or MININT, causing conflicts */
      20                 : #ifdef MAXINT
      21                 : #undef MAXINT
      22                 : #endif
      23                 : #ifdef MININT
      24                 : #undef MININT
      25                 : #endif
      26                 : 
      27                 : /* Now we can include the original Snowball header.h */
      28                 : #include "snowball/libstemmer/header.h"
      29                 : #include "snowball/libstemmer/stem_ISO_8859_1_basque.h"
      30                 : #include "snowball/libstemmer/stem_ISO_8859_1_catalan.h"
      31                 : #include "snowball/libstemmer/stem_ISO_8859_1_danish.h"
      32                 : #include "snowball/libstemmer/stem_ISO_8859_1_dutch.h"
      33                 : #include "snowball/libstemmer/stem_ISO_8859_1_english.h"
      34                 : #include "snowball/libstemmer/stem_ISO_8859_1_finnish.h"
      35                 : #include "snowball/libstemmer/stem_ISO_8859_1_french.h"
      36                 : #include "snowball/libstemmer/stem_ISO_8859_1_german.h"
      37                 : #include "snowball/libstemmer/stem_ISO_8859_1_indonesian.h"
      38                 : #include "snowball/libstemmer/stem_ISO_8859_1_irish.h"
      39                 : #include "snowball/libstemmer/stem_ISO_8859_1_italian.h"
      40                 : #include "snowball/libstemmer/stem_ISO_8859_1_norwegian.h"
      41                 : #include "snowball/libstemmer/stem_ISO_8859_1_porter.h"
      42                 : #include "snowball/libstemmer/stem_ISO_8859_1_portuguese.h"
      43                 : #include "snowball/libstemmer/stem_ISO_8859_1_spanish.h"
      44                 : #include "snowball/libstemmer/stem_ISO_8859_1_swedish.h"
      45                 : #include "snowball/libstemmer/stem_ISO_8859_2_hungarian.h"
      46                 : #include "snowball/libstemmer/stem_ISO_8859_2_romanian.h"
      47                 : #include "snowball/libstemmer/stem_KOI8_R_russian.h"
      48                 : #include "snowball/libstemmer/stem_UTF_8_arabic.h"
      49                 : #include "snowball/libstemmer/stem_UTF_8_armenian.h"
      50                 : #include "snowball/libstemmer/stem_UTF_8_basque.h"
      51                 : #include "snowball/libstemmer/stem_UTF_8_catalan.h"
      52                 : #include "snowball/libstemmer/stem_UTF_8_danish.h"
      53                 : #include "snowball/libstemmer/stem_UTF_8_dutch.h"
      54                 : #include "snowball/libstemmer/stem_UTF_8_english.h"
      55                 : #include "snowball/libstemmer/stem_UTF_8_finnish.h"
      56                 : #include "snowball/libstemmer/stem_UTF_8_french.h"
      57                 : #include "snowball/libstemmer/stem_UTF_8_german.h"
      58                 : #include "snowball/libstemmer/stem_UTF_8_greek.h"
      59                 : #include "snowball/libstemmer/stem_UTF_8_hindi.h"
      60                 : #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
      61                 : #include "snowball/libstemmer/stem_UTF_8_indonesian.h"
      62                 : #include "snowball/libstemmer/stem_UTF_8_irish.h"
      63                 : #include "snowball/libstemmer/stem_UTF_8_italian.h"
      64                 : #include "snowball/libstemmer/stem_UTF_8_lithuanian.h"
      65                 : #include "snowball/libstemmer/stem_UTF_8_nepali.h"
      66                 : #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
      67                 : #include "snowball/libstemmer/stem_UTF_8_porter.h"
      68                 : #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
      69                 : #include "snowball/libstemmer/stem_UTF_8_romanian.h"
      70                 : #include "snowball/libstemmer/stem_UTF_8_russian.h"
      71                 : #include "snowball/libstemmer/stem_UTF_8_serbian.h"
      72                 : #include "snowball/libstemmer/stem_UTF_8_spanish.h"
      73                 : #include "snowball/libstemmer/stem_UTF_8_swedish.h"
      74                 : #include "snowball/libstemmer/stem_UTF_8_tamil.h"
      75                 : #include "snowball/libstemmer/stem_UTF_8_turkish.h"
      76                 : #include "snowball/libstemmer/stem_UTF_8_yiddish.h"
      77                 : 
      78 CBC         319 : PG_MODULE_MAGIC;
      79                 : 
      80             319 : PG_FUNCTION_INFO_V1(dsnowball_init);
      81                 : 
      82             319 : PG_FUNCTION_INFO_V1(dsnowball_lexize);
      83                 : 
      84                 : /* List of supported modules */
      85                 : typedef struct stemmer_module
      86                 : {
      87                 :     const char *name;
      88                 :     pg_enc      enc;
      89                 :     struct SN_env *(*create) (void);
      90                 :     void        (*close) (struct SN_env *);
      91                 :     int         (*stem) (struct SN_env *);
      92                 : } stemmer_module;
      93                 : 
      94                 : /* Args: stemmer name, PG code for encoding, Snowball's name for encoding */
      95                 : #define STEMMER_MODULE(name,enc,senc) \
      96                 :     {#name, enc, name##_##senc##_create_env, name##_##senc##_close_env, name##_##senc##_stem}
      97                 : 
      98                 : static const stemmer_module stemmer_modules[] =
      99                 : {
     100                 :     /*
     101                 :      * Stemmers list from Snowball distribution
     102                 :      */
     103                 :     STEMMER_MODULE(basque, PG_LATIN1, ISO_8859_1),
     104                 :     STEMMER_MODULE(catalan, PG_LATIN1, ISO_8859_1),
     105                 :     STEMMER_MODULE(danish, PG_LATIN1, ISO_8859_1),
     106                 :     STEMMER_MODULE(dutch, PG_LATIN1, ISO_8859_1),
     107                 :     STEMMER_MODULE(english, PG_LATIN1, ISO_8859_1),
     108                 :     STEMMER_MODULE(finnish, PG_LATIN1, ISO_8859_1),
     109                 :     STEMMER_MODULE(french, PG_LATIN1, ISO_8859_1),
     110                 :     STEMMER_MODULE(german, PG_LATIN1, ISO_8859_1),
     111                 :     STEMMER_MODULE(indonesian, PG_LATIN1, ISO_8859_1),
     112                 :     STEMMER_MODULE(irish, PG_LATIN1, ISO_8859_1),
     113                 :     STEMMER_MODULE(italian, PG_LATIN1, ISO_8859_1),
     114                 :     STEMMER_MODULE(norwegian, PG_LATIN1, ISO_8859_1),
     115                 :     STEMMER_MODULE(porter, PG_LATIN1, ISO_8859_1),
     116                 :     STEMMER_MODULE(portuguese, PG_LATIN1, ISO_8859_1),
     117                 :     STEMMER_MODULE(spanish, PG_LATIN1, ISO_8859_1),
     118                 :     STEMMER_MODULE(swedish, PG_LATIN1, ISO_8859_1),
     119                 :     STEMMER_MODULE(hungarian, PG_LATIN2, ISO_8859_2),
     120                 :     STEMMER_MODULE(romanian, PG_LATIN2, ISO_8859_2),
     121                 :     STEMMER_MODULE(russian, PG_KOI8R, KOI8_R),
     122                 :     STEMMER_MODULE(arabic, PG_UTF8, UTF_8),
     123                 :     STEMMER_MODULE(armenian, PG_UTF8, UTF_8),
     124                 :     STEMMER_MODULE(basque, PG_UTF8, UTF_8),
     125                 :     STEMMER_MODULE(catalan, PG_UTF8, UTF_8),
     126                 :     STEMMER_MODULE(danish, PG_UTF8, UTF_8),
     127                 :     STEMMER_MODULE(dutch, PG_UTF8, UTF_8),
     128                 :     STEMMER_MODULE(english, PG_UTF8, UTF_8),
     129                 :     STEMMER_MODULE(finnish, PG_UTF8, UTF_8),
     130                 :     STEMMER_MODULE(french, PG_UTF8, UTF_8),
     131                 :     STEMMER_MODULE(german, PG_UTF8, UTF_8),
     132                 :     STEMMER_MODULE(greek, PG_UTF8, UTF_8),
     133                 :     STEMMER_MODULE(hindi, PG_UTF8, UTF_8),
     134                 :     STEMMER_MODULE(hungarian, PG_UTF8, UTF_8),
     135                 :     STEMMER_MODULE(indonesian, PG_UTF8, UTF_8),
     136                 :     STEMMER_MODULE(irish, PG_UTF8, UTF_8),
     137                 :     STEMMER_MODULE(italian, PG_UTF8, UTF_8),
     138                 :     STEMMER_MODULE(lithuanian, PG_UTF8, UTF_8),
     139                 :     STEMMER_MODULE(nepali, PG_UTF8, UTF_8),
     140                 :     STEMMER_MODULE(norwegian, PG_UTF8, UTF_8),
     141                 :     STEMMER_MODULE(porter, PG_UTF8, UTF_8),
     142                 :     STEMMER_MODULE(portuguese, PG_UTF8, UTF_8),
     143                 :     STEMMER_MODULE(romanian, PG_UTF8, UTF_8),
     144                 :     STEMMER_MODULE(russian, PG_UTF8, UTF_8),
     145                 :     STEMMER_MODULE(serbian, PG_UTF8, UTF_8),
     146                 :     STEMMER_MODULE(spanish, PG_UTF8, UTF_8),
     147                 :     STEMMER_MODULE(swedish, PG_UTF8, UTF_8),
     148                 :     STEMMER_MODULE(tamil, PG_UTF8, UTF_8),
     149                 :     STEMMER_MODULE(turkish, PG_UTF8, UTF_8),
     150                 :     STEMMER_MODULE(yiddish, PG_UTF8, UTF_8),
     151                 : 
     152                 :     /*
     153                 :      * Stemmer with PG_SQL_ASCII encoding should be valid for any server
     154                 :      * encoding
     155                 :      */
     156                 :     STEMMER_MODULE(english, PG_SQL_ASCII, ISO_8859_1),
     157                 : 
     158                 :     {NULL, 0, NULL, NULL, NULL} /* list end marker */
     159                 : };
     160                 : 
     161                 : 
     162                 : typedef struct DictSnowball
     163                 : {
     164                 :     struct SN_env *z;
     165                 :     StopList    stoplist;
     166                 :     bool        needrecode;     /* needs recoding before/after call stem */
     167                 :     int         (*stem) (struct SN_env *z);
     168                 : 
     169                 :     /*
     170                 :      * snowball saves alloced memory between calls, so we should run it in our
     171                 :      * private memory context. Note, init function is executed in long lived
     172                 :      * context, so we just remember CurrentMemoryContext
     173                 :      */
     174                 :     MemoryContext dictCtx;
     175                 : } DictSnowball;
     176                 : 
     177                 : 
     178                 : static void
     179              19 : locate_stem_module(DictSnowball *d, const char *lang)
     180                 : {
     181                 :     const stemmer_module *m;
     182                 : 
     183                 :     /*
     184                 :      * First, try to find exact match of stemmer module. Stemmer with
     185                 :      * PG_SQL_ASCII encoding is treated as working with any server encoding
     186                 :      */
     187             494 :     for (m = stemmer_modules; m->name; m++)
     188                 :     {
     189             627 :         if ((m->enc == PG_SQL_ASCII || m->enc == GetDatabaseEncoding()) &&
     190             133 :             pg_strcasecmp(m->name, lang) == 0)
     191                 :         {
     192              19 :             d->stem = m->stem;
     193              19 :             d->z = m->create();
     194              19 :             d->needrecode = false;
     195              19 :             return;
     196                 :         }
     197                 :     }
     198                 : 
     199                 :     /*
     200                 :      * Second, try to find stemmer for needed language for UTF8 encoding.
     201                 :      */
     202 UBC           0 :     for (m = stemmer_modules; m->name; m++)
     203                 :     {
     204               0 :         if (m->enc == PG_UTF8 && pg_strcasecmp(m->name, lang) == 0)
     205                 :         {
     206               0 :             d->stem = m->stem;
     207               0 :             d->z = m->create();
     208               0 :             d->needrecode = true;
     209               0 :             return;
     210                 :         }
     211                 :     }
     212                 : 
     213               0 :     ereport(ERROR,
     214                 :             (errcode(ERRCODE_UNDEFINED_OBJECT),
     215                 :              errmsg("no Snowball stemmer available for language \"%s\" and encoding \"%s\"",
     216                 :                     lang, GetDatabaseEncodingName())));
     217                 : }
     218                 : 
     219                 : Datum
     220 CBC          19 : dsnowball_init(PG_FUNCTION_ARGS)
     221                 : {
     222              19 :     List       *dictoptions = (List *) PG_GETARG_POINTER(0);
     223                 :     DictSnowball *d;
     224              19 :     bool        stoploaded = false;
     225                 :     ListCell   *l;
     226                 : 
     227              19 :     d = (DictSnowball *) palloc0(sizeof(DictSnowball));
     228                 : 
     229              57 :     foreach(l, dictoptions)
     230                 :     {
     231              38 :         DefElem    *defel = (DefElem *) lfirst(l);
     232                 : 
     233              38 :         if (strcmp(defel->defname, "stopwords") == 0)
     234                 :         {
     235              19 :             if (stoploaded)
     236 UBC           0 :                 ereport(ERROR,
     237                 :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     238                 :                          errmsg("multiple StopWords parameters")));
     239 CBC          19 :             readstoplist(defGetString(defel), &d->stoplist, lowerstr);
     240              19 :             stoploaded = true;
     241                 :         }
     242              19 :         else if (strcmp(defel->defname, "language") == 0)
     243                 :         {
     244              19 :             if (d->stem)
     245 UBC           0 :                 ereport(ERROR,
     246                 :                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     247                 :                          errmsg("multiple Language parameters")));
     248 CBC          19 :             locate_stem_module(d, defGetString(defel));
     249                 :         }
     250                 :         else
     251                 :         {
     252 UBC           0 :             ereport(ERROR,
     253                 :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     254                 :                      errmsg("unrecognized Snowball parameter: \"%s\"",
     255                 :                             defel->defname)));
     256                 :         }
     257                 :     }
     258                 : 
     259 CBC          19 :     if (!d->stem)
     260 UBC           0 :         ereport(ERROR,
     261                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     262                 :                  errmsg("missing Language parameter")));
     263                 : 
     264 CBC          19 :     d->dictCtx = CurrentMemoryContext;
     265                 : 
     266              19 :     PG_RETURN_POINTER(d);
     267                 : }
     268                 : 
     269                 : Datum
     270            5135 : dsnowball_lexize(PG_FUNCTION_ARGS)
     271                 : {
     272            5135 :     DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
     273            5135 :     char       *in = (char *) PG_GETARG_POINTER(1);
     274            5135 :     int32       len = PG_GETARG_INT32(2);
     275            5135 :     char       *txt = lowerstr_with_len(in, len);
     276            5135 :     TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);
     277                 : 
     278                 :     /*
     279                 :      * Do not pass strings exceeding 1000 bytes to the stemmer, as they're
     280                 :      * surely not words in any human language.  This restriction avoids
     281                 :      * wasting cycles on stuff like base64-encoded data, and it protects us
     282                 :      * against possible inefficiency or misbehavior in the stemmer.  (For
     283                 :      * example, the Turkish stemmer has an indefinite recursion, so it can
     284                 :      * crash on long-enough strings.)  However, Snowball dictionaries are
     285                 :      * defined to recognize all strings, so we can't reject the string as an
     286                 :      * unknown word.
     287                 :      */
     288            5135 :     if (len > 1000)
     289                 :     {
     290                 :         /* return the lexeme lowercased, but otherwise unmodified */
     291 UBC           0 :         res->lexeme = txt;
     292                 :     }
     293 CBC        5135 :     else if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
     294                 :     {
     295                 :         /* empty or stopword, so report as stopword */
     296            1734 :         pfree(txt);
     297                 :     }
     298                 :     else
     299                 :     {
     300                 :         MemoryContext saveCtx;
     301                 : 
     302                 :         /*
     303                 :          * recode to utf8 if stemmer is utf8 and doesn't match server encoding
     304                 :          */
     305            3401 :         if (d->needrecode)
     306                 :         {
     307                 :             char       *recoded;
     308                 : 
     309 UBC           0 :             recoded = pg_server_to_any(txt, strlen(txt), PG_UTF8);
     310               0 :             if (recoded != txt)
     311                 :             {
     312               0 :                 pfree(txt);
     313               0 :                 txt = recoded;
     314                 :             }
     315                 :         }
     316                 : 
     317                 :         /* see comment about d->dictCtx */
     318 CBC        3401 :         saveCtx = MemoryContextSwitchTo(d->dictCtx);
     319            3401 :         SN_set_current(d->z, strlen(txt), (symbol *) txt);
     320            3401 :         d->stem(d->z);
     321            3401 :         MemoryContextSwitchTo(saveCtx);
     322                 : 
     323            3401 :         if (d->z->p && d->z->l)
     324                 :         {
     325            3401 :             txt = repalloc(txt, d->z->l + 1);
     326            3401 :             memcpy(txt, d->z->p, d->z->l);
     327            3401 :             txt[d->z->l] = '\0';
     328                 :         }
     329                 : 
     330                 :         /* back recode if needed */
     331            3401 :         if (d->needrecode)
     332                 :         {
     333                 :             char       *recoded;
     334                 : 
     335 UBC           0 :             recoded = pg_any_to_server(txt, strlen(txt), PG_UTF8);
     336               0 :             if (recoded != txt)
     337                 :             {
     338               0 :                 pfree(txt);
     339               0 :                 txt = recoded;
     340                 :             }
     341                 :         }
     342                 : 
     343 CBC        3401 :         res->lexeme = txt;
     344                 :     }
     345                 : 
     346            5135 :     PG_RETURN_POINTER(res);
     347                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a