LCOV - differential code coverage report
Current view: top level - src/backend/tsearch - ts_selfuncs.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 86.5 % 104 90 14 90
Current Date: 2023-04-08 17:13:01 Functions: 83.3 % 6 5 1 5
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (240..) days: 86.5 % 104 90 14 90
Legend: Lines: hit not hit Function coverage date bins:
(240..) days: 83.3 % 6 5 1 5

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * ts_selfuncs.c
                                  4                 :  *    Selectivity estimation functions for text search operators.
                                  5                 :  *
                                  6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                  7                 :  *
                                  8                 :  *
                                  9                 :  * IDENTIFICATION
                                 10                 :  *    src/backend/tsearch/ts_selfuncs.c
                                 11                 :  *
                                 12                 :  *-------------------------------------------------------------------------
                                 13                 :  */
                                 14                 : #include "postgres.h"
                                 15                 : 
                                 16                 : #include "access/htup_details.h"
                                 17                 : #include "catalog/pg_statistic.h"
                                 18                 : #include "catalog/pg_type.h"
                                 19                 : #include "miscadmin.h"
                                 20                 : #include "nodes/nodes.h"
                                 21                 : #include "tsearch/ts_type.h"
                                 22                 : #include "utils/builtins.h"
                                 23                 : #include "utils/lsyscache.h"
                                 24                 : #include "utils/selfuncs.h"
                                 25                 : #include "utils/syscache.h"
                                 26                 : 
                                 27                 : 
                                 28                 : /*
                                 29                 :  * The default text search selectivity is chosen to be small enough to
                                 30                 :  * encourage indexscans for typical table densities.  See selfuncs.h and
                                 31                 :  * DEFAULT_EQ_SEL for details.
                                 32                 :  */
                                 33                 : #define DEFAULT_TS_MATCH_SEL 0.005
                                 34                 : 
                                 35                 : /* lookup table type for binary searching through MCELEMs */
                                 36                 : typedef struct
                                 37                 : {
                                 38                 :     text       *element;
                                 39                 :     float4      frequency;
                                 40                 : } TextFreq;
                                 41                 : 
                                 42                 : /* type of keys for bsearch'ing through an array of TextFreqs */
                                 43                 : typedef struct
                                 44                 : {
                                 45                 :     char       *lexeme;
                                 46                 :     int         length;
                                 47                 : } LexemeKey;
                                 48                 : 
                                 49                 : static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
                                 50                 : static Selectivity mcelem_tsquery_selec(TSQuery query,
                                 51                 :                                         Datum *mcelem, int nmcelem,
                                 52                 :                                         float4 *numbers, int nnumbers);
                                 53                 : static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
                                 54                 :                                      TextFreq *lookup, int length, float4 minfreq);
                                 55                 : static int  compare_lexeme_textfreq(const void *e1, const void *e2);
                                 56                 : 
                                 57                 : #define tsquery_opr_selec_no_stats(query) \
                                 58                 :     tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
                                 59                 : 
                                 60                 : 
                                 61                 : /*
                                 62                 :  *  tsmatchsel -- Selectivity of "@@"
                                 63                 :  *
                                 64                 :  * restriction selectivity function for tsvector @@ tsquery and
                                 65                 :  * tsquery @@ tsvector
                                 66                 :  */
                                 67                 : Datum
 5315 tgl                        68 CBC         513 : tsmatchsel(PG_FUNCTION_ARGS)
                                 69                 : {
                                 70             513 :     PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
                                 71                 : 
                                 72                 : #ifdef NOT_USED
                                 73                 :     Oid         operator = PG_GETARG_OID(1);
                                 74                 : #endif
                                 75             513 :     List       *args = (List *) PG_GETARG_POINTER(2);
                                 76             513 :     int         varRelid = PG_GETARG_INT32(3);
                                 77                 :     VariableStatData vardata;
                                 78                 :     Node       *other;
                                 79                 :     bool        varonleft;
                                 80                 :     Selectivity selec;
                                 81                 : 
                                 82                 :     /*
                                 83                 :      * If expression is not variable = something or something = variable, then
                                 84                 :      * punt and return a default estimate.
                                 85                 :      */
                                 86             513 :     if (!get_restriction_variable(root, args, varRelid,
                                 87                 :                                   &vardata, &other, &varonleft))
 5315 tgl                        88 UBC           0 :         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
                                 89                 : 
                                 90                 :     /*
                                 91                 :      * Can't do anything useful if the something is not a constant, either.
                                 92                 :      */
 5315 tgl                        93 CBC         513 :     if (!IsA(other, Const))
                                 94                 :     {
 5315 tgl                        95 UBC           0 :         ReleaseVariableStats(vardata);
                                 96               0 :         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
                                 97                 :     }
                                 98                 : 
                                 99                 :     /*
                                100                 :      * The "@@" operator is strict, so we can cope with NULL right away
                                101                 :      */
 5315 tgl                       102 CBC         513 :     if (((Const *) other)->constisnull)
                                103                 :     {
 5315 tgl                       104 UBC           0 :         ReleaseVariableStats(vardata);
                                105               0 :         PG_RETURN_FLOAT8(0.0);
                                106                 :     }
                                107                 : 
                                108                 :     /*
                                109                 :      * OK, there's a Var and a Const we're dealing with here.  We need the
                                110                 :      * Const to be a TSQuery, else we can't do anything useful.  We have to
                                111                 :      * check this because the Var might be the TSQuery not the TSVector.
                                112                 :      */
 4635 tgl                       113 CBC         513 :     if (((Const *) other)->consttype == TSQUERYOID)
                                114                 :     {
                                115                 :         /* tsvector @@ tsquery or the other way around */
                                116             513 :         Assert(vardata.vartype == TSVECTOROID);
                                117                 : 
 5315                           118             513 :         selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
                                119                 :     }
                                120                 :     else
                                121                 :     {
                                122                 :         /* If we can't see the query structure, must punt */
 5315 tgl                       123 UBC           0 :         selec = DEFAULT_TS_MATCH_SEL;
                                124                 :     }
                                125                 : 
 5315 tgl                       126 CBC         513 :     ReleaseVariableStats(vardata);
                                127                 : 
                                128             513 :     CLAMP_PROBABILITY(selec);
                                129                 : 
                                130             513 :     PG_RETURN_FLOAT8((float8) selec);
                                131                 : }
                                132                 : 
                                133                 : 
                                134                 : /*
                                135                 :  *  tsmatchjoinsel -- join selectivity of "@@"
                                136                 :  *
                                137                 :  * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
                                138                 :  */
                                139                 : Datum
 5315 tgl                       140 UBC           0 : tsmatchjoinsel(PG_FUNCTION_ARGS)
                                141                 : {
                                142                 :     /* for the moment we just punt */
                                143               0 :     PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
                                144                 : }
                                145                 : 
                                146                 : 
                                147                 : /*
                                148                 :  * @@ selectivity for tsvector var vs tsquery constant
                                149                 :  */
                                150                 : static Selectivity
 5315 tgl                       151 CBC         513 : tsquerysel(VariableStatData *vardata, Datum constval)
                                152                 : {
                                153                 :     Selectivity selec;
                                154                 :     TSQuery     query;
                                155                 : 
                                156                 :     /* The caller made sure the const is a TSQuery, so get it now */
 5058                           157             513 :     query = DatumGetTSQuery(constval);
                                158                 : 
                                159                 :     /* Empty query matches nothing */
                                160             513 :     if (query->size == 0)
 5058 tgl                       161 UBC           0 :         return (Selectivity) 0.0;
                                162                 : 
 5315 tgl                       163 CBC         513 :     if (HeapTupleIsValid(vardata->statsTuple))
                                164                 :     {
                                165                 :         Form_pg_statistic stats;
                                166                 :         AttStatsSlot sslot;
                                167                 : 
                                168             495 :         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
                                169                 : 
                                170                 :         /* MCELEM will be an array of TEXT elements for a tsvector column */
 2157                           171             495 :         if (get_attstatsslot(&sslot, vardata->statsTuple,
                                172                 :                              STATISTIC_KIND_MCELEM, InvalidOid,
                                173                 :                              ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
                                174                 :         {
                                175                 :             /*
                                176                 :              * There is a most-common-elements slot for the tsvector Var, so
                                177                 :              * use that.
                                178                 :              */
                                179             495 :             selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
                                180                 :                                          sslot.numbers, sslot.nnumbers);
                                181             495 :             free_attstatsslot(&sslot);
                                182                 :         }
                                183                 :         else
                                184                 :         {
                                185                 :             /* No most-common-elements info, so do without */
 4635 tgl                       186 UBC           0 :             selec = tsquery_opr_selec_no_stats(query);
                                187                 :         }
                                188                 : 
                                189                 :         /*
                                190                 :          * MCE stats count only non-null rows, so adjust for null rows.
                                191                 :          */
 4434 tgl                       192 CBC         495 :         selec *= (1.0 - stats->stanullfrac);
                                193                 :     }
                                194                 :     else
                                195                 :     {
                                196                 :         /* No stats at all, so do without */
 4635                           197              18 :         selec = tsquery_opr_selec_no_stats(query);
                                198                 :         /* we assume no nulls here, so no stanullfrac correction */
                                199                 :     }
                                200                 : 
 5315                           201             513 :     return selec;
                                202                 : }
                                203                 : 
                                204                 : /*
                                205                 :  * Extract data from the pg_statistic arrays into useful format.
                                206                 :  */
                                207                 : static Selectivity
                                208             495 : mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
                                209                 :                      float4 *numbers, int nnumbers)
                                210                 : {
                                211                 :     float4      minfreq;
                                212                 :     TextFreq   *lookup;
                                213                 :     Selectivity selec;
                                214                 :     int         i;
                                215                 : 
                                216                 :     /*
                                217                 :      * There should be two more Numbers than Values, because the last two
                                218                 :      * cells are taken for minimal and maximal frequency.  Punt if not.
                                219                 :      *
                                220                 :      * (Note: the MCELEM statistics slot definition allows for a third extra
                                221                 :      * number containing the frequency of nulls, but we're not expecting that
                                222                 :      * to appear for a tsvector column.)
                                223                 :      */
                                224             495 :     if (nnumbers != nmcelem + 2)
 4635 tgl                       225 UBC           0 :         return tsquery_opr_selec_no_stats(query);
                                226                 : 
                                227                 :     /*
                                228                 :      * Transpose the data into a single array so we can use bsearch().
                                229                 :      */
 5315 tgl                       230 CBC         495 :     lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
                                231          495495 :     for (i = 0; i < nmcelem; i++)
                                232                 :     {
                                233                 :         /*
                                234                 :          * The text Datums came from an array, so it cannot be compressed or
                                235                 :          * stored out-of-line -- it's safe to use VARSIZE_ANY*.
                                236                 :          */
                                237          495000 :         Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
                                238          495000 :         lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
                                239          495000 :         lookup[i].frequency = numbers[i];
                                240                 :     }
                                241                 : 
                                242                 :     /*
                                243                 :      * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
                                244                 :      * the one before the last cell of the Numbers array. See ts_typanalyze.c
                                245                 :      */
                                246             495 :     minfreq = numbers[nnumbers - 2];
                                247                 : 
                                248             495 :     selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
                                249                 :                               nmcelem, minfreq);
                                250                 : 
                                251             495 :     pfree(lookup);
                                252                 : 
                                253             495 :     return selec;
                                254                 : }
                                255                 : 
                                256                 : /*
                                257                 :  * Traverse the tsquery in preorder, calculating selectivity as:
                                258                 :  *
                                259                 :  *   selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
                                260                 :  *
                                261                 :  *   selec(left_oper) + selec(right_oper) -
                                262                 :  *      selec(left_oper) * selec(right_oper) in OR nodes,
                                263                 :  *
                                264                 :  *   1 - select(oper) in NOT nodes
                                265                 :  *
                                266                 :  *   histogram-based estimation in prefix VAL nodes
                                267                 :  *
                                268                 :  *   freq[val] in exact VAL nodes, if the value is in MCELEM
                                269                 :  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
                                270                 :  *
                                271                 :  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
                                272                 :  * binary search for determining freq[MCELEM].
                                273                 :  *
                                274                 :  * If we don't have stats for the tsvector, we still use this logic,
                                275                 :  * except we use default estimates for VAL nodes.  This case is signaled
                                276                 :  * by lookup == NULL.
                                277                 :  */
                                278                 : static Selectivity
                                279            1539 : tsquery_opr_selec(QueryItem *item, char *operand,
                                280                 :                   TextFreq *lookup, int length, float4 minfreq)
                                281                 : {
                                282                 :     Selectivity selec;
                                283                 : 
                                284                 :     /* since this function recurses, it could be driven to stack overflow */
                                285            1539 :     check_stack_depth();
                                286                 : 
                                287            1539 :     if (item->type == QI_VAL)
                                288                 :     {
                                289             921 :         QueryOperand *oper = (QueryOperand *) item;
                                290                 :         LexemeKey   key;
                                291                 : 
                                292                 :         /*
                                293                 :          * Prepare the key for bsearch().
                                294                 :          */
                                295             921 :         key.lexeme = operand + oper->distance;
                                296             921 :         key.length = oper->length;
                                297                 : 
 4634                           298             921 :         if (oper->prefix)
                                299                 :         {
                                300                 :             /* Prefix match, ie the query item is lexeme:* */
                                301                 :             Selectivity matched,
                                302                 :                         allmces;
                                303                 :             int         i,
                                304                 :                         n_matched;
                                305                 : 
                                306                 :             /*
                                307                 :              * Our strategy is to scan through the MCELEM list and combine the
                                308                 :              * frequencies of the ones that match the prefix.  We then
                                309                 :              * extrapolate the fraction of matching MCELEMs to the remaining
                                310                 :              * rows, assuming that the MCELEMs are representative of the whole
                                311                 :              * lexeme population in this respect.  (Compare
                                312                 :              * histogram_selectivity().)  Note that these are most common
                                313                 :              * elements not most common values, so they're not mutually
                                314                 :              * exclusive.  We treat occurrences as independent events.
                                315                 :              *
                                316                 :              * This is only a good plan if we have a pretty fair number of
                                317                 :              * MCELEMs available; we set the threshold at 100.  If no stats or
                                318                 :              * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
                                319                 :              */
                                320              51 :             if (lookup == NULL || length < 100)
                                321              21 :                 return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
                                322                 : 
 3862                           323              36 :             matched = allmces = 0;
                                324              36 :             n_matched = 0;
 4634                           325           36036 :             for (i = 0; i < length; i++)
                                326                 :             {
                                327           36000 :                 TextFreq   *t = lookup + i;
                                328           36000 :                 int         tlen = VARSIZE_ANY_EXHDR(t->element);
                                329                 : 
                                330           36000 :                 if (tlen >= key.length &&
                                331           36000 :                     strncmp(key.lexeme, VARDATA_ANY(t->element),
                                332           36000 :                             key.length) == 0)
                                333                 :                 {
 3862                           334            1296 :                     matched += t->frequency - matched * t->frequency;
                                335            1296 :                     n_matched++;
                                336                 :                 }
                                337           36000 :                 allmces += t->frequency - allmces * t->frequency;
                                338                 :             }
                                339                 : 
                                340                 :             /* Clamp to ensure sanity in the face of roundoff error */
                                341              36 :             CLAMP_PROBABILITY(matched);
                                342              36 :             CLAMP_PROBABILITY(allmces);
                                343                 : 
                                344              36 :             selec = matched + (1.0 - allmces) * ((double) n_matched / length);
                                345                 : 
                                346                 :             /*
                                347                 :              * In any case, never believe that a prefix match has selectivity
                                348                 :              * less than we would assign for a non-MCELEM lexeme.  This
                                349                 :              * preserves the property that "word:*" should be estimated to
                                350                 :              * match at least as many rows as "word" would be.
                                351                 :              */
                                352              36 :             selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
                                353                 :         }
                                354                 :         else
                                355                 :         {
                                356                 :             /* Regular exact lexeme match */
                                357                 :             TextFreq   *searchres;
                                358                 : 
                                359                 :             /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
 4634                           360             870 :             if (lookup == NULL)
                                361               6 :                 return (Selectivity) DEFAULT_TS_MATCH_SEL;
                                362                 : 
                                363             864 :             searchres = (TextFreq *) bsearch(&key, lookup, length,
                                364                 :                                              sizeof(TextFreq),
                                365                 :                                              compare_lexeme_textfreq);
                                366                 : 
                                367             864 :             if (searchres)
                                368                 :             {
                                369                 :                 /*
                                370                 :                  * The element is in MCELEM.  Return precise selectivity (or
                                371                 :                  * at least as precise as ANALYZE could find out).
                                372                 :                  */
                                373             804 :                 selec = searchres->frequency;
                                374                 :             }
                                375                 :             else
                                376                 :             {
                                377                 :                 /*
                                378                 :                  * The element is not in MCELEM.  Punt, but assume that the
                                379                 :                  * selectivity cannot be more than minfreq / 2.
                                380                 :                  */
                                381              60 :                 selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
                                382                 :             }
                                383                 :         }
                                384                 :     }
                                385                 :     else
                                386                 :     {
                                387                 :         /* Current TSQuery node is an operator */
                                388                 :         Selectivity s1,
                                389                 :                     s2;
                                390                 : 
                                391             618 :         switch (item->qoperator.oper)
                                392                 :         {
                                393             210 :             case OP_NOT:
                                394             210 :                 selec = 1.0 - tsquery_opr_selec(item + 1, operand,
                                395                 :                                                 lookup, length, minfreq);
                                396             210 :                 break;
                                397                 : 
 2558 teodor                    398             285 :             case OP_PHRASE:
                                399                 :             case OP_AND:
 4634 tgl                       400             285 :                 s1 = tsquery_opr_selec(item + 1, operand,
                                401                 :                                        lookup, length, minfreq);
                                402             285 :                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
                                403                 :                                        lookup, length, minfreq);
                                404             285 :                 selec = s1 * s2;
                                405             285 :                 break;
                                406                 : 
                                407             123 :             case OP_OR:
                                408             123 :                 s1 = tsquery_opr_selec(item + 1, operand,
                                409                 :                                        lookup, length, minfreq);
                                410             123 :                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
                                411                 :                                        lookup, length, minfreq);
                                412             123 :                 selec = s1 + s2 - s1 * s2;
                                413             123 :                 break;
                                414                 : 
 4634 tgl                       415 UBC           0 :             default:
                                416               0 :                 elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
                                417                 :                 selec = 0;      /* keep compiler quiet */
                                418                 :                 break;
                                419                 :         }
                                420                 :     }
                                421                 : 
                                422                 :     /* Clamp intermediate results to stay sane despite roundoff error */
 5315 tgl                       423 CBC        1518 :     CLAMP_PROBABILITY(selec);
                                424                 : 
                                425            1518 :     return selec;
                                426                 : }
                                427                 : 
                                428                 : /*
                                429                 :  * bsearch() comparator for a lexeme (non-NULL terminated string with length)
                                430                 :  * and a TextFreq. Use length, then byte-for-byte comparison, because that's
                                431                 :  * how ANALYZE code sorted data before storing it in a statistic tuple.
                                432                 :  * See ts_typanalyze.c for details.
                                433                 :  */
                                434                 : static int
                                435            6672 : compare_lexeme_textfreq(const void *e1, const void *e2)
                                436                 : {
 5050 bruce                     437            6672 :     const LexemeKey *key = (const LexemeKey *) e1;
                                438            6672 :     const TextFreq *t = (const TextFreq *) e2;
                                439                 :     int         len1,
                                440                 :                 len2;
                                441                 : 
 5315 tgl                       442            6672 :     len1 = key->length;
                                443            6672 :     len2 = VARSIZE_ANY_EXHDR(t->element);
                                444                 : 
                                445                 :     /* Compare lengths first, possibly avoiding a strncmp call */
                                446            6672 :     if (len1 > len2)
                                447             540 :         return 1;
                                448            6132 :     else if (len1 < len2)
 5315 tgl                       449 UBC           0 :         return -1;
                                450                 : 
                                451                 :     /* Fall back on byte-for-byte comparison */
 5315 tgl                       452 CBC        6132 :     return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
                                453                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a