LCOV - differential code coverage report
Current view: top level - src/backend/tsearch - ts_selfuncs.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 86.5 % 104 90 14 90
Current Date: 2023-04-08 15:15:32 Functions: 83.3 % 6 5 1 5
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * ts_selfuncs.c
       4                 :  *    Selectivity estimation functions for text search operators.
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  *
       8                 :  *
       9                 :  * IDENTIFICATION
      10                 :  *    src/backend/tsearch/ts_selfuncs.c
      11                 :  *
      12                 :  *-------------------------------------------------------------------------
      13                 :  */
      14                 : #include "postgres.h"
      15                 : 
      16                 : #include "access/htup_details.h"
      17                 : #include "catalog/pg_statistic.h"
      18                 : #include "catalog/pg_type.h"
      19                 : #include "miscadmin.h"
      20                 : #include "nodes/nodes.h"
      21                 : #include "tsearch/ts_type.h"
      22                 : #include "utils/builtins.h"
      23                 : #include "utils/lsyscache.h"
      24                 : #include "utils/selfuncs.h"
      25                 : #include "utils/syscache.h"
      26                 : 
      27                 : 
      28                 : /*
      29                 :  * The default text search selectivity is chosen to be small enough to
      30                 :  * encourage indexscans for typical table densities.  See selfuncs.h and
      31                 :  * DEFAULT_EQ_SEL for details.
      32                 :  */
      33                 : #define DEFAULT_TS_MATCH_SEL 0.005
      34                 : 
      35                 : /* lookup table type for binary searching through MCELEMs */
      36                 : typedef struct
      37                 : {
      38                 :     text       *element;
      39                 :     float4      frequency;
      40                 : } TextFreq;
      41                 : 
      42                 : /* type of keys for bsearch'ing through an array of TextFreqs */
      43                 : typedef struct
      44                 : {
      45                 :     char       *lexeme;
      46                 :     int         length;
      47                 : } LexemeKey;
      48                 : 
      49                 : static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
      50                 : static Selectivity mcelem_tsquery_selec(TSQuery query,
      51                 :                                         Datum *mcelem, int nmcelem,
      52                 :                                         float4 *numbers, int nnumbers);
      53                 : static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
      54                 :                                      TextFreq *lookup, int length, float4 minfreq);
      55                 : static int  compare_lexeme_textfreq(const void *e1, const void *e2);
      56                 : 
      57                 : #define tsquery_opr_selec_no_stats(query) \
      58                 :     tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
      59                 : 
      60                 : 
      61                 : /*
      62                 :  *  tsmatchsel -- Selectivity of "@@"
      63                 :  *
      64                 :  * restriction selectivity function for tsvector @@ tsquery and
      65                 :  * tsquery @@ tsvector
      66                 :  */
      67                 : Datum
      68 CBC         513 : tsmatchsel(PG_FUNCTION_ARGS)
      69                 : {
      70             513 :     PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
      71                 : 
      72                 : #ifdef NOT_USED
      73                 :     Oid         operator = PG_GETARG_OID(1);
      74                 : #endif
      75             513 :     List       *args = (List *) PG_GETARG_POINTER(2);
      76             513 :     int         varRelid = PG_GETARG_INT32(3);
      77                 :     VariableStatData vardata;
      78                 :     Node       *other;
      79                 :     bool        varonleft;
      80                 :     Selectivity selec;
      81                 : 
      82                 :     /*
      83                 :      * If expression is not variable = something or something = variable, then
      84                 :      * punt and return a default estimate.
      85                 :      */
      86             513 :     if (!get_restriction_variable(root, args, varRelid,
      87                 :                                   &vardata, &other, &varonleft))
      88 UBC           0 :         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
      89                 : 
      90                 :     /*
      91                 :      * Can't do anything useful if the something is not a constant, either.
      92                 :      */
      93 CBC         513 :     if (!IsA(other, Const))
      94                 :     {
      95 UBC           0 :         ReleaseVariableStats(vardata);
      96               0 :         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
      97                 :     }
      98                 : 
      99                 :     /*
     100                 :      * The "@@" operator is strict, so we can cope with NULL right away
     101                 :      */
     102 CBC         513 :     if (((Const *) other)->constisnull)
     103                 :     {
     104 UBC           0 :         ReleaseVariableStats(vardata);
     105               0 :         PG_RETURN_FLOAT8(0.0);
     106                 :     }
     107                 : 
     108                 :     /*
     109                 :      * OK, there's a Var and a Const we're dealing with here.  We need the
     110                 :      * Const to be a TSQuery, else we can't do anything useful.  We have to
     111                 :      * check this because the Var might be the TSQuery not the TSVector.
     112                 :      */
     113 CBC         513 :     if (((Const *) other)->consttype == TSQUERYOID)
     114                 :     {
     115                 :         /* tsvector @@ tsquery or the other way around */
     116             513 :         Assert(vardata.vartype == TSVECTOROID);
     117                 : 
     118             513 :         selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
     119                 :     }
     120                 :     else
     121                 :     {
     122                 :         /* If we can't see the query structure, must punt */
     123 UBC           0 :         selec = DEFAULT_TS_MATCH_SEL;
     124                 :     }
     125                 : 
     126 CBC         513 :     ReleaseVariableStats(vardata);
     127                 : 
     128             513 :     CLAMP_PROBABILITY(selec);
     129                 : 
     130             513 :     PG_RETURN_FLOAT8((float8) selec);
     131                 : }
     132                 : 
     133                 : 
     134                 : /*
     135                 :  *  tsmatchjoinsel -- join selectivity of "@@"
     136                 :  *
     137                 :  * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
     138                 :  */
     139                 : Datum
     140 UBC           0 : tsmatchjoinsel(PG_FUNCTION_ARGS)
     141                 : {
     142                 :     /* for the moment we just punt */
     143               0 :     PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
     144                 : }
     145                 : 
     146                 : 
     147                 : /*
     148                 :  * @@ selectivity for tsvector var vs tsquery constant
     149                 :  */
     150                 : static Selectivity
     151 CBC         513 : tsquerysel(VariableStatData *vardata, Datum constval)
     152                 : {
     153                 :     Selectivity selec;
     154                 :     TSQuery     query;
     155                 : 
     156                 :     /* The caller made sure the const is a TSQuery, so get it now */
     157             513 :     query = DatumGetTSQuery(constval);
     158                 : 
     159                 :     /* Empty query matches nothing */
     160             513 :     if (query->size == 0)
     161 UBC           0 :         return (Selectivity) 0.0;
     162                 : 
     163 CBC         513 :     if (HeapTupleIsValid(vardata->statsTuple))
     164                 :     {
     165                 :         Form_pg_statistic stats;
     166                 :         AttStatsSlot sslot;
     167                 : 
     168             495 :         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
     169                 : 
     170                 :         /* MCELEM will be an array of TEXT elements for a tsvector column */
     171             495 :         if (get_attstatsslot(&sslot, vardata->statsTuple,
     172                 :                              STATISTIC_KIND_MCELEM, InvalidOid,
     173                 :                              ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS))
     174                 :         {
     175                 :             /*
     176                 :              * There is a most-common-elements slot for the tsvector Var, so
     177                 :              * use that.
     178                 :              */
     179             495 :             selec = mcelem_tsquery_selec(query, sslot.values, sslot.nvalues,
     180                 :                                          sslot.numbers, sslot.nnumbers);
     181             495 :             free_attstatsslot(&sslot);
     182                 :         }
     183                 :         else
     184                 :         {
     185                 :             /* No most-common-elements info, so do without */
     186 UBC           0 :             selec = tsquery_opr_selec_no_stats(query);
     187                 :         }
     188                 : 
     189                 :         /*
     190                 :          * MCE stats count only non-null rows, so adjust for null rows.
     191                 :          */
     192 CBC         495 :         selec *= (1.0 - stats->stanullfrac);
     193                 :     }
     194                 :     else
     195                 :     {
     196                 :         /* No stats at all, so do without */
     197              18 :         selec = tsquery_opr_selec_no_stats(query);
     198                 :         /* we assume no nulls here, so no stanullfrac correction */
     199                 :     }
     200                 : 
     201             513 :     return selec;
     202                 : }
     203                 : 
     204                 : /*
     205                 :  * Extract data from the pg_statistic arrays into useful format.
     206                 :  */
     207                 : static Selectivity
     208             495 : mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
     209                 :                      float4 *numbers, int nnumbers)
     210                 : {
     211                 :     float4      minfreq;
     212                 :     TextFreq   *lookup;
     213                 :     Selectivity selec;
     214                 :     int         i;
     215                 : 
     216                 :     /*
     217                 :      * There should be two more Numbers than Values, because the last two
     218                 :      * cells are taken for minimal and maximal frequency.  Punt if not.
     219                 :      *
     220                 :      * (Note: the MCELEM statistics slot definition allows for a third extra
     221                 :      * number containing the frequency of nulls, but we're not expecting that
     222                 :      * to appear for a tsvector column.)
     223                 :      */
     224             495 :     if (nnumbers != nmcelem + 2)
     225 UBC           0 :         return tsquery_opr_selec_no_stats(query);
     226                 : 
     227                 :     /*
     228                 :      * Transpose the data into a single array so we can use bsearch().
     229                 :      */
     230 CBC         495 :     lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
     231          495495 :     for (i = 0; i < nmcelem; i++)
     232                 :     {
     233                 :         /*
     234                 :          * The text Datums came from an array, so it cannot be compressed or
     235                 :          * stored out-of-line -- it's safe to use VARSIZE_ANY*.
     236                 :          */
     237          495000 :         Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
     238          495000 :         lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
     239          495000 :         lookup[i].frequency = numbers[i];
     240                 :     }
     241                 : 
     242                 :     /*
     243                 :      * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
     244                 :      * the one before the last cell of the Numbers array. See ts_typanalyze.c
     245                 :      */
     246             495 :     minfreq = numbers[nnumbers - 2];
     247                 : 
     248             495 :     selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
     249                 :                               nmcelem, minfreq);
     250                 : 
     251             495 :     pfree(lookup);
     252                 : 
     253             495 :     return selec;
     254                 : }
     255                 : 
     256                 : /*
     257                 :  * Traverse the tsquery in preorder, calculating selectivity as:
     258                 :  *
     259                 :  *   selec(left_oper) * selec(right_oper) in AND & PHRASE nodes,
     260                 :  *
     261                 :  *   selec(left_oper) + selec(right_oper) -
     262                 :  *      selec(left_oper) * selec(right_oper) in OR nodes,
     263                 :  *
     264                 :  *   1 - select(oper) in NOT nodes
     265                 :  *
     266                 :  *   histogram-based estimation in prefix VAL nodes
     267                 :  *
     268                 :  *   freq[val] in exact VAL nodes, if the value is in MCELEM
     269                 :  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
     270                 :  *
     271                 :  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
     272                 :  * binary search for determining freq[MCELEM].
     273                 :  *
     274                 :  * If we don't have stats for the tsvector, we still use this logic,
     275                 :  * except we use default estimates for VAL nodes.  This case is signaled
     276                 :  * by lookup == NULL.
     277                 :  */
     278                 : static Selectivity
     279            1539 : tsquery_opr_selec(QueryItem *item, char *operand,
     280                 :                   TextFreq *lookup, int length, float4 minfreq)
     281                 : {
     282                 :     Selectivity selec;
     283                 : 
     284                 :     /* since this function recurses, it could be driven to stack overflow */
     285            1539 :     check_stack_depth();
     286                 : 
     287            1539 :     if (item->type == QI_VAL)
     288                 :     {
     289             921 :         QueryOperand *oper = (QueryOperand *) item;
     290                 :         LexemeKey   key;
     291                 : 
     292                 :         /*
     293                 :          * Prepare the key for bsearch().
     294                 :          */
     295             921 :         key.lexeme = operand + oper->distance;
     296             921 :         key.length = oper->length;
     297                 : 
     298             921 :         if (oper->prefix)
     299                 :         {
     300                 :             /* Prefix match, ie the query item is lexeme:* */
     301                 :             Selectivity matched,
     302                 :                         allmces;
     303                 :             int         i,
     304                 :                         n_matched;
     305                 : 
     306                 :             /*
     307                 :              * Our strategy is to scan through the MCELEM list and combine the
     308                 :              * frequencies of the ones that match the prefix.  We then
     309                 :              * extrapolate the fraction of matching MCELEMs to the remaining
     310                 :              * rows, assuming that the MCELEMs are representative of the whole
     311                 :              * lexeme population in this respect.  (Compare
     312                 :              * histogram_selectivity().)  Note that these are most common
     313                 :              * elements not most common values, so they're not mutually
     314                 :              * exclusive.  We treat occurrences as independent events.
     315                 :              *
     316                 :              * This is only a good plan if we have a pretty fair number of
     317                 :              * MCELEMs available; we set the threshold at 100.  If no stats or
     318                 :              * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
     319                 :              */
     320              51 :             if (lookup == NULL || length < 100)
     321              21 :                 return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
     322                 : 
     323              36 :             matched = allmces = 0;
     324              36 :             n_matched = 0;
     325           36036 :             for (i = 0; i < length; i++)
     326                 :             {
     327           36000 :                 TextFreq   *t = lookup + i;
     328           36000 :                 int         tlen = VARSIZE_ANY_EXHDR(t->element);
     329                 : 
     330           36000 :                 if (tlen >= key.length &&
     331           36000 :                     strncmp(key.lexeme, VARDATA_ANY(t->element),
     332           36000 :                             key.length) == 0)
     333                 :                 {
     334            1296 :                     matched += t->frequency - matched * t->frequency;
     335            1296 :                     n_matched++;
     336                 :                 }
     337           36000 :                 allmces += t->frequency - allmces * t->frequency;
     338                 :             }
     339                 : 
     340                 :             /* Clamp to ensure sanity in the face of roundoff error */
     341              36 :             CLAMP_PROBABILITY(matched);
     342              36 :             CLAMP_PROBABILITY(allmces);
     343                 : 
     344              36 :             selec = matched + (1.0 - allmces) * ((double) n_matched / length);
     345                 : 
     346                 :             /*
     347                 :              * In any case, never believe that a prefix match has selectivity
     348                 :              * less than we would assign for a non-MCELEM lexeme.  This
     349                 :              * preserves the property that "word:*" should be estimated to
     350                 :              * match at least as many rows as "word" would be.
     351                 :              */
     352              36 :             selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
     353                 :         }
     354                 :         else
     355                 :         {
     356                 :             /* Regular exact lexeme match */
     357                 :             TextFreq   *searchres;
     358                 : 
     359                 :             /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
     360             870 :             if (lookup == NULL)
     361               6 :                 return (Selectivity) DEFAULT_TS_MATCH_SEL;
     362                 : 
     363             864 :             searchres = (TextFreq *) bsearch(&key, lookup, length,
     364                 :                                              sizeof(TextFreq),
     365                 :                                              compare_lexeme_textfreq);
     366                 : 
     367             864 :             if (searchres)
     368                 :             {
     369                 :                 /*
     370                 :                  * The element is in MCELEM.  Return precise selectivity (or
     371                 :                  * at least as precise as ANALYZE could find out).
     372                 :                  */
     373             804 :                 selec = searchres->frequency;
     374                 :             }
     375                 :             else
     376                 :             {
     377                 :                 /*
     378                 :                  * The element is not in MCELEM.  Punt, but assume that the
     379                 :                  * selectivity cannot be more than minfreq / 2.
     380                 :                  */
     381              60 :                 selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
     382                 :             }
     383                 :         }
     384                 :     }
     385                 :     else
     386                 :     {
     387                 :         /* Current TSQuery node is an operator */
     388                 :         Selectivity s1,
     389                 :                     s2;
     390                 : 
     391             618 :         switch (item->qoperator.oper)
     392                 :         {
     393             210 :             case OP_NOT:
     394             210 :                 selec = 1.0 - tsquery_opr_selec(item + 1, operand,
     395                 :                                                 lookup, length, minfreq);
     396             210 :                 break;
     397                 : 
     398             285 :             case OP_PHRASE:
     399                 :             case OP_AND:
     400             285 :                 s1 = tsquery_opr_selec(item + 1, operand,
     401                 :                                        lookup, length, minfreq);
     402             285 :                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
     403                 :                                        lookup, length, minfreq);
     404             285 :                 selec = s1 * s2;
     405             285 :                 break;
     406                 : 
     407             123 :             case OP_OR:
     408             123 :                 s1 = tsquery_opr_selec(item + 1, operand,
     409                 :                                        lookup, length, minfreq);
     410             123 :                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
     411                 :                                        lookup, length, minfreq);
     412             123 :                 selec = s1 + s2 - s1 * s2;
     413             123 :                 break;
     414                 : 
     415 UBC           0 :             default:
     416               0 :                 elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
     417                 :                 selec = 0;      /* keep compiler quiet */
     418                 :                 break;
     419                 :         }
     420                 :     }
     421                 : 
     422                 :     /* Clamp intermediate results to stay sane despite roundoff error */
     423 CBC        1518 :     CLAMP_PROBABILITY(selec);
     424                 : 
     425            1518 :     return selec;
     426                 : }
     427                 : 
     428                 : /*
     429                 :  * bsearch() comparator for a lexeme (non-NULL terminated string with length)
     430                 :  * and a TextFreq. Use length, then byte-for-byte comparison, because that's
     431                 :  * how ANALYZE code sorted data before storing it in a statistic tuple.
     432                 :  * See ts_typanalyze.c for details.
     433                 :  */
     434                 : static int
     435            6672 : compare_lexeme_textfreq(const void *e1, const void *e2)
     436                 : {
     437            6672 :     const LexemeKey *key = (const LexemeKey *) e1;
     438            6672 :     const TextFreq *t = (const TextFreq *) e2;
     439                 :     int         len1,
     440                 :                 len2;
     441                 : 
     442            6672 :     len1 = key->length;
     443            6672 :     len2 = VARSIZE_ANY_EXHDR(t->element);
     444                 : 
     445                 :     /* Compare lengths first, possibly avoiding a strncmp call */
     446            6672 :     if (len1 > len2)
     447             540 :         return 1;
     448            6132 :     else if (len1 < len2)
     449 UBC           0 :         return -1;
     450                 : 
     451                 :     /* Fall back on byte-for-byte comparison */
     452 CBC        6132 :     return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
     453                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a