LCOV - differential code coverage report
Current view: top level - src/backend/access/nbtree - nbtutils.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DUB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 78.7 % 779 613 7 24 135 4 133 4 472 26 127 1 7
Current Date: 2023-04-08 15:15:32 Functions: 93.8 % 32 30 2 13 3 14 2 13 1
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * nbtutils.c
       4                 :  *    Utility code for Postgres btree implementation.
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/access/nbtree/nbtutils.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : 
      16                 : #include "postgres.h"
      17                 : 
      18                 : #include <time.h>
      19                 : 
      20                 : #include "access/nbtree.h"
      21                 : #include "access/reloptions.h"
      22                 : #include "access/relscan.h"
      23                 : #include "catalog/catalog.h"
      24                 : #include "commands/progress.h"
      25                 : #include "lib/qunique.h"
      26                 : #include "miscadmin.h"
      27                 : #include "utils/array.h"
      28                 : #include "utils/datum.h"
      29                 : #include "utils/lsyscache.h"
      30                 : #include "utils/memutils.h"
      31                 : #include "utils/rel.h"
      32                 : 
      33                 : 
      34                 : typedef struct BTSortArrayContext
      35                 : {
      36                 :     FmgrInfo    flinfo;
      37                 :     Oid         collation;
      38                 :     bool        reverse;
      39                 : } BTSortArrayContext;
      40                 : 
      41                 : static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
      42                 :                                       StrategyNumber strat,
      43                 :                                       Datum *elems, int nelems);
      44                 : static int  _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
      45                 :                                     bool reverse,
      46                 :                                     Datum *elems, int nelems);
      47                 : static int  _bt_compare_array_elements(const void *a, const void *b, void *arg);
      48                 : static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
      49                 :                                      ScanKey leftarg, ScanKey rightarg,
      50                 :                                      bool *result);
      51                 : static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
      52                 : static void _bt_mark_scankey_required(ScanKey skey);
      53                 : static bool _bt_check_rowcompare(ScanKey skey,
      54                 :                                  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
      55                 :                                  ScanDirection dir, bool *continuescan);
      56                 : static int  _bt_keep_natts(Relation rel, IndexTuple lastleft,
      57                 :                            IndexTuple firstright, BTScanInsert itup_key);
      58                 : 
      59                 : 
      60                 : /*
      61                 :  * _bt_mkscankey
      62                 :  *      Build an insertion scan key that contains comparison data from itup
      63                 :  *      as well as comparator routines appropriate to the key datatypes.
      64                 :  *
      65                 :  *      When itup is a non-pivot tuple, the returned insertion scan key is
      66                 :  *      suitable for finding a place for it to go on the leaf level.  Pivot
      67                 :  *      tuples can be used to re-find leaf page with matching high key, but
      68                 :  *      then caller needs to set scan key's pivotsearch field to true.  This
      69                 :  *      allows caller to search for a leaf page with a matching high key,
      70                 :  *      which is usually to the left of the first leaf page a non-pivot match
      71                 :  *      might appear on.
      72                 :  *
      73                 :  *      The result is intended for use with _bt_compare() and _bt_truncate().
      74                 :  *      Callers that don't need to fill out the insertion scankey arguments
      75                 :  *      (e.g. they use an ad-hoc comparison routine, or only need a scankey
      76                 :  *      for _bt_truncate()) can pass a NULL index tuple.  The scankey will
      77                 :  *      be initialized as if an "all truncated" pivot tuple was passed
      78                 :  *      instead.
      79                 :  *
      80                 :  *      Note that we may occasionally have to share lock the metapage to
      81                 :  *      determine whether or not the keys in the index are expected to be
      82                 :  *      unique (i.e. if this is a "heapkeyspace" index).  We assume a
      83                 :  *      heapkeyspace index when caller passes a NULL tuple, allowing index
      84                 :  *      build callers to avoid accessing the non-existent metapage.  We
      85                 :  *      also assume that the index is _not_ allequalimage when a NULL tuple
      86                 :  *      is passed; CREATE INDEX callers call _bt_allequalimage() to set the
      87                 :  *      field themselves.
      88                 :  */
      89                 : BTScanInsert
      90 GNC     9135483 : _bt_mkscankey(Relation rel, Relation heaprel, IndexTuple itup)
      91                 : {
      92                 :     BTScanInsert key;
      93                 :     ScanKey     skey;
      94                 :     TupleDesc   itupdesc;
      95                 :     int         indnkeyatts;
      96                 :     int16      *indoption;
      97                 :     int         tupnatts;
      98                 :     int         i;
      99                 : 
     100 CBC     9135483 :     itupdesc = RelationGetDescr(rel);
     101         9135483 :     indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     102         9135483 :     indoption = rel->rd_indoption;
     103         9135483 :     tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0;
     104                 : 
     105         9135483 :     Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel));
     106                 : 
     107                 :     /*
     108                 :      * We'll execute search using scan key constructed on key columns.
     109                 :      * Truncated attributes and non-key attributes are omitted from the final
     110                 :      * scan key.
     111                 :      */
     112         9135483 :     key = palloc(offsetof(BTScanInsertData, scankeys) +
     113         9135483 :                  sizeof(ScanKeyData) * indnkeyatts);
     114         9135483 :     if (itup)
     115 GNC     8950316 :         _bt_metaversion(rel, heaprel, &key->heapkeyspace, &key->allequalimage);
     116                 :     else
     117                 :     {
     118                 :         /* Utility statement callers can set these fields themselves */
     119 CBC      185167 :         key->heapkeyspace = true;
     120          185167 :         key->allequalimage = false;
     121                 :     }
     122         9135483 :     key->anynullkeys = false;    /* initial assumption */
     123         9135483 :     key->nextkey = false;
     124         9135483 :     key->pivotsearch = false;
     125         9135483 :     key->keysz = Min(indnkeyatts, tupnatts);
     126         9135483 :     key->scantid = key->heapkeyspace && itup ?
     127        18270966 :         BTreeTupleGetHeapTID(itup) : NULL;
     128         9135483 :     skey = key->scankeys;
     129        27120508 :     for (i = 0; i < indnkeyatts; i++)
     130                 :     {
     131                 :         FmgrInfo   *procinfo;
     132                 :         Datum       arg;
     133                 :         bool        null;
     134                 :         int         flags;
     135                 : 
     136                 :         /*
     137                 :          * We can use the cached (default) support procs since no cross-type
     138                 :          * comparison can be needed.
     139                 :          */
     140        17985025 :         procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
     141                 : 
     142                 :         /*
     143                 :          * Key arguments built from truncated attributes (or when caller
     144                 :          * provides no tuple) are defensively represented as NULL values. They
     145                 :          * should never be used.
     146                 :          */
     147        17985025 :         if (i < tupnatts)
     148        17659875 :             arg = index_getattr(itup, i + 1, itupdesc, &null);
     149                 :         else
     150                 :         {
     151          325150 :             arg = (Datum) 0;
     152          325150 :             null = true;
     153                 :         }
     154        17985025 :         flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
     155        17985025 :         ScanKeyEntryInitializeWithInfo(&skey[i],
     156                 :                                        flags,
     157        17985025 :                                        (AttrNumber) (i + 1),
     158                 :                                        InvalidStrategy,
     159                 :                                        InvalidOid,
     160        17985025 :                                        rel->rd_indcollation[i],
     161                 :                                        procinfo,
     162                 :                                        arg);
     163                 :         /* Record if any key attribute is NULL (or truncated) */
     164        17985025 :         if (null)
     165          325247 :             key->anynullkeys = true;
     166                 :     }
     167                 : 
     168                 :     /*
     169                 :      * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so
     170                 :      * that full uniqueness check is done.
     171                 :      */
     172         9135483 :     if (rel->rd_index->indnullsnotdistinct)
     173              84 :         key->anynullkeys = false;
     174                 : 
     175         9135483 :     return key;
     176                 : }
     177                 : 
     178                 : /*
     179                 :  * free a retracement stack made by _bt_search.
     180                 :  */
     181                 : void
     182        14919966 : _bt_freestack(BTStack stack)
     183                 : {
     184                 :     BTStack     ostack;
     185                 : 
     186        27190253 :     while (stack != NULL)
     187                 :     {
     188        12270287 :         ostack = stack;
     189        12270287 :         stack = stack->bts_parent;
     190        12270287 :         pfree(ostack);
     191                 :     }
     192        14919966 : }
     193                 : 
     194                 : 
     195                 : /*
     196                 :  *  _bt_preprocess_array_keys() -- Preprocess SK_SEARCHARRAY scan keys
     197                 :  *
     198                 :  * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
     199                 :  * set up BTArrayKeyInfo info for each one that is an equality-type key.
     200                 :  * Prepare modified scan keys in so->arrayKeyData, which will hold the current
     201                 :  * array elements during each primitive indexscan operation.  For inequality
     202                 :  * array keys, it's sufficient to find the extreme element value and replace
     203                 :  * the whole array with that scalar value.
     204                 :  *
     205                 :  * Note: the reason we need so->arrayKeyData, rather than just scribbling
     206                 :  * on scan->keyData, is that callers are permitted to call btrescan without
     207                 :  * supplying a new set of scankey data.
     208                 :  */
     209                 : void
     210         9088299 : _bt_preprocess_array_keys(IndexScanDesc scan)
     211                 : {
     212         9088299 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     213         9088299 :     int         numberOfKeys = scan->numberOfKeys;
     214         9088299 :     int16      *indoption = scan->indexRelation->rd_indoption;
     215                 :     int         numArrayKeys;
     216                 :     ScanKey     cur;
     217                 :     int         i;
     218                 :     MemoryContext oldContext;
     219                 : 
     220                 :     /* Quick check to see if there are any array keys */
     221         9088299 :     numArrayKeys = 0;
     222        24998875 :     for (i = 0; i < numberOfKeys; i++)
     223                 :     {
     224        15910576 :         cur = &scan->keyData[i];
     225        15910576 :         if (cur->sk_flags & SK_SEARCHARRAY)
     226                 :         {
     227             353 :             numArrayKeys++;
     228             353 :             Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
     229                 :             /* If any arrays are null as a whole, we can quit right now. */
     230             353 :             if (cur->sk_flags & SK_ISNULL)
     231                 :             {
     232 UBC           0 :                 so->numArrayKeys = -1;
     233               0 :                 so->arrayKeyData = NULL;
     234               0 :                 return;
     235                 :             }
     236                 :         }
     237                 :     }
     238                 : 
     239                 :     /* Quit if nothing to do. */
     240 CBC     9088299 :     if (numArrayKeys == 0)
     241                 :     {
     242         9087985 :         so->numArrayKeys = 0;
     243         9087985 :         so->arrayKeyData = NULL;
     244         9087985 :         return;
     245                 :     }
     246                 : 
     247                 :     /*
     248                 :      * Make a scan-lifespan context to hold array-associated data, or reset it
     249                 :      * if we already have one from a previous rescan cycle.
     250                 :      */
     251             314 :     if (so->arrayContext == NULL)
     252             314 :         so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
     253                 :                                                  "BTree array context",
     254                 :                                                  ALLOCSET_SMALL_SIZES);
     255                 :     else
     256 UBC           0 :         MemoryContextReset(so->arrayContext);
     257                 : 
     258 CBC         314 :     oldContext = MemoryContextSwitchTo(so->arrayContext);
     259                 : 
     260                 :     /* Create modifiable copy of scan->keyData in the workspace context */
     261             314 :     so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
     262             314 :     memcpy(so->arrayKeyData,
     263             314 :            scan->keyData,
     264             314 :            scan->numberOfKeys * sizeof(ScanKeyData));
     265                 : 
     266                 :     /* Allocate space for per-array data in the workspace context */
     267             314 :     so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));
     268                 : 
     269                 :     /* Now process each array key */
     270             314 :     numArrayKeys = 0;
     271             763 :     for (i = 0; i < numberOfKeys; i++)
     272                 :     {
     273                 :         ArrayType  *arrayval;
     274                 :         int16       elmlen;
     275                 :         bool        elmbyval;
     276                 :         char        elmalign;
     277                 :         int         num_elems;
     278                 :         Datum      *elem_values;
     279                 :         bool       *elem_nulls;
     280                 :         int         num_nonnulls;
     281                 :         int         j;
     282                 : 
     283             449 :         cur = &so->arrayKeyData[i];
     284             449 :         if (!(cur->sk_flags & SK_SEARCHARRAY))
     285              99 :             continue;
     286                 : 
     287                 :         /*
     288                 :          * First, deconstruct the array into elements.  Anything allocated
     289                 :          * here (including a possibly detoasted array value) is in the
     290                 :          * workspace context.
     291                 :          */
     292             353 :         arrayval = DatumGetArrayTypeP(cur->sk_argument);
     293                 :         /* We could cache this data, but not clear it's worth it */
     294             353 :         get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
     295                 :                              &elmlen, &elmbyval, &elmalign);
     296             353 :         deconstruct_array(arrayval,
     297                 :                           ARR_ELEMTYPE(arrayval),
     298                 :                           elmlen, elmbyval, elmalign,
     299                 :                           &elem_values, &elem_nulls, &num_elems);
     300                 : 
     301                 :         /*
     302                 :          * Compress out any null elements.  We can ignore them since we assume
     303                 :          * all btree operators are strict.
     304                 :          */
     305             353 :         num_nonnulls = 0;
     306            1926 :         for (j = 0; j < num_elems; j++)
     307                 :         {
     308            1573 :             if (!elem_nulls[j])
     309            1573 :                 elem_values[num_nonnulls++] = elem_values[j];
     310                 :         }
     311                 : 
     312                 :         /* We could pfree(elem_nulls) now, but not worth the cycles */
     313                 : 
     314                 :         /* If there's no non-nulls, the scan qual is unsatisfiable */
     315             353 :         if (num_nonnulls == 0)
     316                 :         {
     317 UBC           0 :             numArrayKeys = -1;
     318               0 :             break;
     319                 :         }
     320                 : 
     321                 :         /*
     322                 :          * If the comparison operator is not equality, then the array qual
     323                 :          * degenerates to a simple comparison against the smallest or largest
     324                 :          * non-null array element, as appropriate.
     325                 :          */
     326 CBC         353 :         switch (cur->sk_strategy)
     327                 :         {
     328 UBC           0 :             case BTLessStrategyNumber:
     329                 :             case BTLessEqualStrategyNumber:
     330               0 :                 cur->sk_argument =
     331               0 :                     _bt_find_extreme_element(scan, cur,
     332                 :                                              BTGreaterStrategyNumber,
     333                 :                                              elem_values, num_nonnulls);
     334               0 :                 continue;
     335 CBC         350 :             case BTEqualStrategyNumber:
     336                 :                 /* proceed with rest of loop */
     337             350 :                 break;
     338               3 :             case BTGreaterEqualStrategyNumber:
     339                 :             case BTGreaterStrategyNumber:
     340               3 :                 cur->sk_argument =
     341               3 :                     _bt_find_extreme_element(scan, cur,
     342                 :                                              BTLessStrategyNumber,
     343                 :                                              elem_values, num_nonnulls);
     344               3 :                 continue;
     345 UBC           0 :             default:
     346               0 :                 elog(ERROR, "unrecognized StrategyNumber: %d",
     347                 :                      (int) cur->sk_strategy);
     348                 :                 break;
     349                 :         }
     350                 : 
     351                 :         /*
     352                 :          * Sort the non-null elements and eliminate any duplicates.  We must
     353                 :          * sort in the same ordering used by the index column, so that the
     354                 :          * successive primitive indexscans produce data in index order.
     355                 :          */
     356 CBC         700 :         num_elems = _bt_sort_array_elements(scan, cur,
     357             350 :                                             (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
     358                 :                                             elem_values, num_nonnulls);
     359                 : 
     360                 :         /*
     361                 :          * And set up the BTArrayKeyInfo data.
     362                 :          */
     363             350 :         so->arrayKeys[numArrayKeys].scan_key = i;
     364             350 :         so->arrayKeys[numArrayKeys].num_elems = num_elems;
     365             350 :         so->arrayKeys[numArrayKeys].elem_values = elem_values;
     366             350 :         numArrayKeys++;
     367                 :     }
     368                 : 
     369             314 :     so->numArrayKeys = numArrayKeys;
     370                 : 
     371             314 :     MemoryContextSwitchTo(oldContext);
     372                 : }
     373                 : 
     374                 : /*
     375                 :  * _bt_find_extreme_element() -- get least or greatest array element
     376                 :  *
     377                 :  * scan and skey identify the index column, whose opfamily determines the
     378                 :  * comparison semantics.  strat should be BTLessStrategyNumber to get the
     379                 :  * least element, or BTGreaterStrategyNumber to get the greatest.
     380                 :  */
     381                 : static Datum
     382               3 : _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
     383                 :                          StrategyNumber strat,
     384                 :                          Datum *elems, int nelems)
     385                 : {
     386               3 :     Relation    rel = scan->indexRelation;
     387                 :     Oid         elemtype,
     388                 :                 cmp_op;
     389                 :     RegProcedure cmp_proc;
     390                 :     FmgrInfo    flinfo;
     391                 :     Datum       result;
     392                 :     int         i;
     393                 : 
     394                 :     /*
     395                 :      * Determine the nominal datatype of the array elements.  We have to
     396                 :      * support the convention that sk_subtype == InvalidOid means the opclass
     397                 :      * input type; this is a hack to simplify life for ScanKeyInit().
     398                 :      */
     399               3 :     elemtype = skey->sk_subtype;
     400               3 :     if (elemtype == InvalidOid)
     401 UBC           0 :         elemtype = rel->rd_opcintype[skey->sk_attno - 1];
     402                 : 
     403                 :     /*
     404                 :      * Look up the appropriate comparison operator in the opfamily.
     405                 :      *
     406                 :      * Note: it's possible that this would fail, if the opfamily is
     407                 :      * incomplete, but it seems quite unlikely that an opfamily would omit
     408                 :      * non-cross-type comparison operators for any datatype that it supports
     409                 :      * at all.
     410                 :      */
     411 CBC           3 :     cmp_op = get_opfamily_member(rel->rd_opfamily[skey->sk_attno - 1],
     412                 :                                  elemtype,
     413                 :                                  elemtype,
     414                 :                                  strat);
     415               3 :     if (!OidIsValid(cmp_op))
     416 UBC           0 :         elog(ERROR, "missing operator %d(%u,%u) in opfamily %u",
     417                 :              strat, elemtype, elemtype,
     418                 :              rel->rd_opfamily[skey->sk_attno - 1]);
     419 CBC           3 :     cmp_proc = get_opcode(cmp_op);
     420               3 :     if (!RegProcedureIsValid(cmp_proc))
     421 UBC           0 :         elog(ERROR, "missing oprcode for operator %u", cmp_op);
     422                 : 
     423 CBC           3 :     fmgr_info(cmp_proc, &flinfo);
     424                 : 
     425               3 :     Assert(nelems > 0);
     426               3 :     result = elems[0];
     427               6 :     for (i = 1; i < nelems; i++)
     428                 :     {
     429               3 :         if (DatumGetBool(FunctionCall2Coll(&flinfo,
     430                 :                                            skey->sk_collation,
     431               3 :                                            elems[i],
     432                 :                                            result)))
     433 UBC           0 :             result = elems[i];
     434                 :     }
     435                 : 
     436 CBC           3 :     return result;
     437                 : }
     438                 : 
     439                 : /*
     440                 :  * _bt_sort_array_elements() -- sort and de-dup array elements
     441                 :  *
     442                 :  * The array elements are sorted in-place, and the new number of elements
     443                 :  * after duplicate removal is returned.
     444                 :  *
     445                 :  * scan and skey identify the index column, whose opfamily determines the
     446                 :  * comparison semantics.  If reverse is true, we sort in descending order.
     447                 :  */
     448                 : static int
     449             350 : _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
     450                 :                         bool reverse,
     451                 :                         Datum *elems, int nelems)
     452                 : {
     453             350 :     Relation    rel = scan->indexRelation;
     454                 :     Oid         elemtype;
     455                 :     RegProcedure cmp_proc;
     456                 :     BTSortArrayContext cxt;
     457                 : 
     458             350 :     if (nelems <= 1)
     459               6 :         return nelems;          /* no work to do */
     460                 : 
     461                 :     /*
     462                 :      * Determine the nominal datatype of the array elements.  We have to
     463                 :      * support the convention that sk_subtype == InvalidOid means the opclass
     464                 :      * input type; this is a hack to simplify life for ScanKeyInit().
     465                 :      */
     466             344 :     elemtype = skey->sk_subtype;
     467             344 :     if (elemtype == InvalidOid)
     468 UBC           0 :         elemtype = rel->rd_opcintype[skey->sk_attno - 1];
     469                 : 
     470                 :     /*
     471                 :      * Look up the appropriate comparison function in the opfamily.
     472                 :      *
     473                 :      * Note: it's possible that this would fail, if the opfamily is
     474                 :      * incomplete, but it seems quite unlikely that an opfamily would omit
     475                 :      * non-cross-type support functions for any datatype that it supports at
     476                 :      * all.
     477                 :      */
     478 CBC         344 :     cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1],
     479                 :                                  elemtype,
     480                 :                                  elemtype,
     481                 :                                  BTORDER_PROC);
     482             344 :     if (!RegProcedureIsValid(cmp_proc))
     483 UBC           0 :         elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
     484                 :              BTORDER_PROC, elemtype, elemtype,
     485                 :              rel->rd_opfamily[skey->sk_attno - 1]);
     486                 : 
     487                 :     /* Sort the array elements */
     488 CBC         344 :     fmgr_info(cmp_proc, &cxt.flinfo);
     489             344 :     cxt.collation = skey->sk_collation;
     490             344 :     cxt.reverse = reverse;
     491 GNC         344 :     qsort_arg(elems, nelems, sizeof(Datum),
     492                 :               _bt_compare_array_elements, &cxt);
     493                 : 
     494                 :     /* Now scan the sorted elements and remove duplicates */
     495 CBC         344 :     return qunique_arg(elems, nelems, sizeof(Datum),
     496                 :                        _bt_compare_array_elements, &cxt);
     497                 : }
     498                 : 
     499                 : /*
     500                 :  * qsort_arg comparator for sorting array elements
     501                 :  */
     502                 : static int
     503            3475 : _bt_compare_array_elements(const void *a, const void *b, void *arg)
     504                 : {
     505            3475 :     Datum       da = *((const Datum *) a);
     506            3475 :     Datum       db = *((const Datum *) b);
     507            3475 :     BTSortArrayContext *cxt = (BTSortArrayContext *) arg;
     508                 :     int32       compare;
     509                 : 
     510            3475 :     compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo,
     511                 :                                               cxt->collation,
     512                 :                                               da, db));
     513            3475 :     if (cxt->reverse)
     514 UBC           0 :         INVERT_COMPARE_RESULT(compare);
     515 CBC        3475 :     return compare;
     516                 : }
     517                 : 
     518                 : /*
     519                 :  * _bt_start_array_keys() -- Initialize array keys at start of a scan
     520                 :  *
     521                 :  * Set up the cur_elem counters and fill in the first sk_argument value for
     522                 :  * each array scankey.  We can't do this until we know the scan direction.
     523                 :  */
     524                 : void
     525             310 : _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
     526                 : {
     527             310 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     528                 :     int         i;
     529                 : 
     530             659 :     for (i = 0; i < so->numArrayKeys; i++)
     531                 :     {
     532             349 :         BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
     533             349 :         ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
     534                 : 
     535             349 :         Assert(curArrayKey->num_elems > 0);
     536             349 :         if (ScanDirectionIsBackward(dir))
     537 UBC           0 :             curArrayKey->cur_elem = curArrayKey->num_elems - 1;
     538                 :         else
     539 CBC         349 :             curArrayKey->cur_elem = 0;
     540             349 :         skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
     541                 :     }
     542             310 : }
     543                 : 
     544                 : /*
     545                 :  * _bt_advance_array_keys() -- Advance to next set of array elements
     546                 :  *
     547                 :  * Returns true if there is another set of values to consider, false if not.
     548                 :  * On true result, the scankeys are initialized with the next set of values.
     549                 :  */
     550                 : bool
     551            1895 : _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir)
     552                 : {
     553            1895 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     554            1895 :     bool        found = false;
     555                 :     int         i;
     556                 : 
     557                 :     /*
     558                 :      * We must advance the last array key most quickly, since it will
     559                 :      * correspond to the lowest-order index column among the available
     560                 :      * qualifications. This is necessary to ensure correct ordering of output
     561                 :      * when there are multiple array keys.
     562                 :      */
     563            2550 :     for (i = so->numArrayKeys - 1; i >= 0; i--)
     564                 :     {
     565            2243 :         BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
     566            2243 :         ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
     567            2243 :         int         cur_elem = curArrayKey->cur_elem;
     568            2243 :         int         num_elems = curArrayKey->num_elems;
     569                 : 
     570            2243 :         if (ScanDirectionIsBackward(dir))
     571                 :         {
     572 UBC           0 :             if (--cur_elem < 0)
     573                 :             {
     574               0 :                 cur_elem = num_elems - 1;
     575               0 :                 found = false;  /* need to advance next array key */
     576                 :             }
     577                 :             else
     578               0 :                 found = true;
     579                 :         }
     580                 :         else
     581                 :         {
     582 CBC        2243 :             if (++cur_elem >= num_elems)
     583                 :             {
     584             655 :                 cur_elem = 0;
     585             655 :                 found = false;  /* need to advance next array key */
     586                 :             }
     587                 :             else
     588            1588 :                 found = true;
     589                 :         }
     590                 : 
     591            2243 :         curArrayKey->cur_elem = cur_elem;
     592            2243 :         skey->sk_argument = curArrayKey->elem_values[cur_elem];
     593            2243 :         if (found)
     594            1588 :             break;
     595                 :     }
     596                 : 
     597                 :     /* advance parallel scan */
     598            1895 :     if (scan->parallel_scan != NULL)
     599 UBC           0 :         _bt_parallel_advance_array_keys(scan);
     600                 : 
     601 CBC        1895 :     return found;
     602                 : }
     603                 : 
     604                 : /*
     605                 :  * _bt_mark_array_keys() -- Handle array keys during btmarkpos
     606                 :  *
     607                 :  * Save the current state of the array keys as the "mark" position.
     608                 :  */
     609                 : void
     610               3 : _bt_mark_array_keys(IndexScanDesc scan)
     611                 : {
     612               3 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     613                 :     int         i;
     614                 : 
     615               6 :     for (i = 0; i < so->numArrayKeys; i++)
     616                 :     {
     617               3 :         BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
     618                 : 
     619               3 :         curArrayKey->mark_elem = curArrayKey->cur_elem;
     620                 :     }
     621               3 : }
     622                 : 
     623                 : /*
     624                 :  * _bt_restore_array_keys() -- Handle array keys during btrestrpos
     625                 :  *
     626                 :  * Restore the array keys to where they were when the mark was set.
     627                 :  */
     628                 : void
     629               3 : _bt_restore_array_keys(IndexScanDesc scan)
     630                 : {
     631               3 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     632               3 :     bool        changed = false;
     633                 :     int         i;
     634                 : 
     635                 :     /* Restore each array key to its position when the mark was set */
     636               6 :     for (i = 0; i < so->numArrayKeys; i++)
     637                 :     {
     638               3 :         BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
     639               3 :         ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
     640               3 :         int         mark_elem = curArrayKey->mark_elem;
     641                 : 
     642               3 :         if (curArrayKey->cur_elem != mark_elem)
     643                 :         {
     644 UBC           0 :             curArrayKey->cur_elem = mark_elem;
     645               0 :             skey->sk_argument = curArrayKey->elem_values[mark_elem];
     646               0 :             changed = true;
     647                 :         }
     648                 :     }
     649                 : 
     650                 :     /*
     651                 :      * If we changed any keys, we must redo _bt_preprocess_keys.  That might
     652                 :      * sound like overkill, but in cases with multiple keys per index column
     653                 :      * it seems necessary to do the full set of pushups.
     654                 :      */
     655 CBC           3 :     if (changed)
     656                 :     {
     657 UBC           0 :         _bt_preprocess_keys(scan);
     658                 :         /* The mark should have been set on a consistent set of keys... */
     659               0 :         Assert(so->qual_ok);
     660                 :     }
     661 CBC           3 : }
     662                 : 
     663                 : 
     664                 : /*
     665                 :  *  _bt_preprocess_keys() -- Preprocess scan keys
     666                 :  *
     667                 :  * The given search-type keys (in scan->keyData[] or so->arrayKeyData[])
     668                 :  * are copied to so->keyData[] with possible transformation.
     669                 :  * scan->numberOfKeys is the number of input keys, so->numberOfKeys gets
     670                 :  * the number of output keys (possibly less, never greater).
     671                 :  *
     672                 :  * The output keys are marked with additional sk_flags bits beyond the
     673                 :  * system-standard bits supplied by the caller.  The DESC and NULLS_FIRST
     674                 :  * indoption bits for the relevant index attribute are copied into the flags.
     675                 :  * Also, for a DESC column, we commute (flip) all the sk_strategy numbers
     676                 :  * so that the index sorts in the desired direction.
     677                 :  *
     678                 :  * One key purpose of this routine is to discover which scan keys must be
     679                 :  * satisfied to continue the scan.  It also attempts to eliminate redundant
     680                 :  * keys and detect contradictory keys.  (If the index opfamily provides
     681                 :  * incomplete sets of cross-type operators, we may fail to detect redundant
     682                 :  * or contradictory keys, but we can survive that.)
     683                 :  *
     684                 :  * The output keys must be sorted by index attribute.  Presently we expect
     685                 :  * (but verify) that the input keys are already so sorted --- this is done
     686                 :  * by match_clauses_to_index() in indxpath.c.  Some reordering of the keys
     687                 :  * within each attribute may be done as a byproduct of the processing here,
     688                 :  * but no other code depends on that.
     689                 :  *
     690                 :  * The output keys are marked with flags SK_BT_REQFWD and/or SK_BT_REQBKWD
     691                 :  * if they must be satisfied in order to continue the scan forward or backward
     692                 :  * respectively.  _bt_checkkeys uses these flags.  For example, if the quals
     693                 :  * are "x = 1 AND y < 4 AND z < 5", then _bt_checkkeys will reject a tuple
     694                 :  * (1,2,7), but we must continue the scan in case there are tuples (1,3,z).
     695                 :  * But once we reach tuples like (1,4,z) we can stop scanning because no
     696                 :  * later tuples could match.  This is reflected by marking the x and y keys,
     697                 :  * but not the z key, with SK_BT_REQFWD.  In general, the keys for leading
     698                 :  * attributes with "=" keys are marked both SK_BT_REQFWD and SK_BT_REQBKWD.
     699                 :  * For the first attribute without an "=" key, any "<" and "<=" keys are
     700                 :  * marked SK_BT_REQFWD while any ">" and ">=" keys are marked SK_BT_REQBKWD.
     701                 :  * This can be seen to be correct by considering the above example.  Note
     702                 :  * in particular that if there are no keys for a given attribute, the keys for
     703                 :  * subsequent attributes can never be required; for instance "WHERE y = 4"
     704                 :  * requires a full-index scan.
     705                 :  *
     706                 :  * If possible, redundant keys are eliminated: we keep only the tightest
     707                 :  * >/>= bound and the tightest </<= bound, and if there's an = key then
     708                 :  * that's the only one returned.  (So, we return either a single = key,
     709                 :  * or one or two boundary-condition keys for each attr.)  However, if we
     710                 :  * cannot compare two keys for lack of a suitable cross-type operator,
     711                 :  * we cannot eliminate either.  If there are two such keys of the same
     712                 :  * operator strategy, the second one is just pushed into the output array
     713                 :  * without further processing here.  We may also emit both >/>= or both
     714                 :  * </<= keys if we can't compare them.  The logic about required keys still
     715                 :  * works if we don't eliminate redundant keys.
     716                 :  *
     717                 :  * Note that one reason we need direction-sensitive required-key flags is
     718                 :  * precisely that we may not be able to eliminate redundant keys.  Suppose
     719                 :  * we have "x > 4::int AND x > 10::bigint", and we are unable to determine
     720                 :  * which key is more restrictive for lack of a suitable cross-type operator.
     721                 :  * _bt_first will arbitrarily pick one of the keys to do the initial
     722                 :  * positioning with.  If it picks x > 4, then the x > 10 condition will fail
     723                 :  * until we reach index entries > 10; but we can't stop the scan just because
     724                 :  * x > 10 is failing.  On the other hand, if we are scanning backwards, then
     725                 :  * failure of either key is indeed enough to stop the scan.  (In general, when
     726                 :  * inequality keys are present, the initial-positioning code only promises to
     727                 :  * position before the first possible match, not exactly at the first match,
     728                 :  * for a forward scan; or after the last match for a backward scan.)
     729                 :  *
     730                 :  * As a byproduct of this work, we can detect contradictory quals such
     731                 :  * as "x = 1 AND x > 2".  If we see that, we return so->qual_ok = false,
     732                 :  * indicating the scan need not be run at all since no tuples can match.
     733                 :  * (In this case we do not bother completing the output key array!)
     734                 :  * Again, missing cross-type operators might cause us to fail to prove the
     735                 :  * quals contradictory when they really are, but the scan will work correctly.
     736                 :  *
     737                 :  * Row comparison keys are currently also treated without any smarts:
     738                 :  * we just transfer them into the preprocessed array without any
     739                 :  * editorialization.  We can treat them the same as an ordinary inequality
     740                 :  * comparison on the row's first index column, for the purposes of the logic
     741                 :  * about required keys.
     742                 :  *
     743                 :  * Note: the reason we have to copy the preprocessed scan keys into private
     744                 :  * storage is that we are modifying the array based on comparisons of the
     745                 :  * key argument values, which could change on a rescan or after moving to
     746                 :  * new elements of array keys.  Therefore we can't overwrite the source data.
     747                 :  */
     748                 : void
     749         9088140 : _bt_preprocess_keys(IndexScanDesc scan)
     750                 : {
     751         9088140 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     752         9088140 :     int         numberOfKeys = scan->numberOfKeys;
     753         9088140 :     int16      *indoption = scan->indexRelation->rd_indoption;
     754                 :     int         new_numberOfKeys;
     755                 :     int         numberOfEqualCols;
     756                 :     ScanKey     inkeys;
     757                 :     ScanKey     outkeys;
     758                 :     ScanKey     cur;
     759                 :     ScanKey     xform[BTMaxStrategyNumber];
     760                 :     bool        test_result;
     761                 :     int         i,
     762                 :                 j;
     763                 :     AttrNumber  attno;
     764                 : 
     765                 :     /* initialize result variables */
     766         9088140 :     so->qual_ok = true;
     767         9088140 :     so->numberOfKeys = 0;
     768                 : 
     769         9088140 :     if (numberOfKeys < 1)
     770         4167301 :         return;                 /* done if qual-less scan */
     771                 : 
     772                 :     /*
     773                 :      * Read so->arrayKeyData if array keys are present, else scan->keyData
     774                 :      */
     775         9083232 :     if (so->arrayKeyData != NULL)
     776            1901 :         inkeys = so->arrayKeyData;
     777                 :     else
     778         9081331 :         inkeys = scan->keyData;
     779                 : 
     780         9083232 :     outkeys = so->keyData;
     781         9083232 :     cur = &inkeys[0];
     782                 :     /* we check that input keys are correctly ordered */
     783         9083232 :     if (cur->sk_attno < 1)
     784 UBC           0 :         elog(ERROR, "btree index keys must be ordered by attribute");
     785                 : 
     786                 :     /* We can short-circuit most of the work if there's just one key */
     787 CBC     9083232 :     if (numberOfKeys == 1)
     788                 :     {
     789                 :         /* Apply indoption to scankey (might change sk_strategy!) */
     790         4162381 :         if (!_bt_fix_scankey_strategy(cur, indoption))
     791            1274 :             so->qual_ok = false;
     792         4162381 :         memcpy(outkeys, cur, sizeof(ScanKeyData));
     793         4162381 :         so->numberOfKeys = 1;
     794                 :         /* We can mark the qual as required if it's for first index col */
     795         4162381 :         if (cur->sk_attno == 1)
     796         4161125 :             _bt_mark_scankey_required(outkeys);
     797         4162381 :         return;
     798                 :     }
     799                 : 
     800                 :     /*
     801                 :      * Otherwise, do the full set of pushups.
     802                 :      */
     803         4920851 :     new_numberOfKeys = 0;
     804         4920851 :     numberOfEqualCols = 0;
     805                 : 
     806                 :     /*
     807                 :      * Initialize for processing of keys for attr 1.
     808                 :      *
     809                 :      * xform[i] points to the currently best scan key of strategy type i+1; it
     810                 :      * is NULL if we haven't yet found such a key for this attr.
     811                 :      */
     812         4920851 :     attno = 1;
     813         4920851 :     memset(xform, 0, sizeof(xform));
     814                 : 
     815                 :     /*
     816                 :      * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
     817                 :      * handle after-last-key processing.  Actual exit from the loop is at the
     818                 :      * "break" statement below.
     819                 :      */
     820        16670534 :     for (i = 0;; cur++, i++)
     821                 :     {
     822        16670534 :         if (i < numberOfKeys)
     823                 :         {
     824                 :             /* Apply indoption to scankey (might change sk_strategy!) */
     825        11749683 :             if (!_bt_fix_scankey_strategy(cur, indoption))
     826                 :             {
     827                 :                 /* NULL can't be matched, so give up */
     828 UBC           0 :                 so->qual_ok = false;
     829               0 :                 return;
     830                 :             }
     831                 :         }
     832                 : 
     833                 :         /*
     834                 :          * If we are at the end of the keys for a particular attr, finish up
     835                 :          * processing and emit the cleaned-up keys.
     836                 :          */
     837 CBC    16670534 :         if (i == numberOfKeys || cur->sk_attno != attno)
     838                 :         {
     839        11748683 :             int         priorNumberOfEqualCols = numberOfEqualCols;
     840                 : 
     841                 :             /* check input keys are correctly ordered */
     842        11748683 :             if (i < numberOfKeys && cur->sk_attno < attno)
     843 UBC           0 :                 elog(ERROR, "btree index keys must be ordered by attribute");
     844                 : 
     845                 :             /*
     846                 :              * If = has been specified, all other keys can be eliminated as
     847                 :              * redundant.  If we have a case like key = 1 AND key > 2, we can
     848                 :              * set qual_ok to false and abandon further processing.
     849                 :              *
     850                 :              * We also have to deal with the case of "key IS NULL", which is
     851                 :              * unsatisfiable in combination with any other index condition. By
     852                 :              * the time we get here, that's been classified as an equality
     853                 :              * check, and we've rejected any combination of it with a regular
     854                 :              * equality condition; but not with other types of conditions.
     855                 :              */
     856 CBC    11748683 :             if (xform[BTEqualStrategyNumber - 1])
     857                 :             {
     858        11061315 :                 ScanKey     eq = xform[BTEqualStrategyNumber - 1];
     859                 : 
     860        66367830 :                 for (j = BTMaxStrategyNumber; --j >= 0;)
     861                 :                 {
     862        55306527 :                     ScanKey     chk = xform[j];
     863                 : 
     864        55306527 :                     if (!chk || j == (BTEqualStrategyNumber - 1))
     865        55306450 :                         continue;
     866                 : 
     867              77 :                     if (eq->sk_flags & SK_SEARCHNULL)
     868                 :                     {
     869                 :                         /* IS NULL is contradictory to anything else */
     870              12 :                         so->qual_ok = false;
     871              12 :                         return;
     872                 :                     }
     873                 : 
     874              65 :                     if (_bt_compare_scankey_args(scan, chk, eq, chk,
     875                 :                                                  &test_result))
     876                 :                     {
     877              65 :                         if (!test_result)
     878                 :                         {
     879                 :                             /* keys proven mutually contradictory */
     880 UBC           0 :                             so->qual_ok = false;
     881               0 :                             return;
     882                 :                         }
     883                 :                         /* else discard the redundant non-equality key */
     884 CBC          65 :                         xform[j] = NULL;
     885                 :                     }
     886                 :                     /* else, cannot determine redundancy, keep both keys */
     887                 :                 }
     888                 :                 /* track number of attrs for which we have "=" keys */
     889        11061303 :                 numberOfEqualCols++;
     890                 :             }
     891                 : 
     892                 :             /* try to keep only one of <, <= */
     893        11748671 :             if (xform[BTLessStrategyNumber - 1]
     894             983 :                 && xform[BTLessEqualStrategyNumber - 1])
     895                 :             {
     896 UBC           0 :                 ScanKey     lt = xform[BTLessStrategyNumber - 1];
     897               0 :                 ScanKey     le = xform[BTLessEqualStrategyNumber - 1];
     898                 : 
     899               0 :                 if (_bt_compare_scankey_args(scan, le, lt, le,
     900                 :                                              &test_result))
     901                 :                 {
     902               0 :                     if (test_result)
     903               0 :                         xform[BTLessEqualStrategyNumber - 1] = NULL;
     904                 :                     else
     905               0 :                         xform[BTLessStrategyNumber - 1] = NULL;
     906                 :                 }
     907                 :             }
     908                 : 
     909                 :             /* try to keep only one of >, >= */
     910 CBC    11748671 :             if (xform[BTGreaterStrategyNumber - 1]
     911          685214 :                 && xform[BTGreaterEqualStrategyNumber - 1])
     912                 :             {
     913 UBC           0 :                 ScanKey     gt = xform[BTGreaterStrategyNumber - 1];
     914               0 :                 ScanKey     ge = xform[BTGreaterEqualStrategyNumber - 1];
     915                 : 
     916               0 :                 if (_bt_compare_scankey_args(scan, ge, gt, ge,
     917                 :                                              &test_result))
     918                 :                 {
     919               0 :                     if (test_result)
     920               0 :                         xform[BTGreaterEqualStrategyNumber - 1] = NULL;
     921                 :                     else
     922               0 :                         xform[BTGreaterStrategyNumber - 1] = NULL;
     923                 :                 }
     924                 :             }
     925                 : 
     926                 :             /*
     927                 :              * Emit the cleaned-up keys into the outkeys[] array, and then
     928                 :              * mark them if they are required.  They are required (possibly
     929                 :              * only in one direction) if all attrs before this one had "=".
     930                 :              */
     931 CBC    70492026 :             for (j = BTMaxStrategyNumber; --j >= 0;)
     932                 :             {
     933        58743355 :                 if (xform[j])
     934                 :                 {
     935        11749561 :                     ScanKey     outkey = &outkeys[new_numberOfKeys++];
     936                 : 
     937        11749561 :                     memcpy(outkey, xform[j], sizeof(ScanKeyData));
     938        11749561 :                     if (priorNumberOfEqualCols == attno - 1)
     939        11748725 :                         _bt_mark_scankey_required(outkey);
     940                 :                 }
     941                 :             }
     942                 : 
     943                 :             /*
     944                 :              * Exit loop here if done.
     945                 :              */
     946        11748671 :             if (i == numberOfKeys)
     947         4920839 :                 break;
     948                 : 
     949                 :             /* Re-initialize for new attno */
     950         6827832 :             attno = cur->sk_attno;
     951         6827832 :             memset(xform, 0, sizeof(xform));
     952                 :         }
     953                 : 
     954                 :         /* check strategy this key's operator corresponds to */
     955        11749683 :         j = cur->sk_strategy - 1;
     956                 : 
     957                 :         /* if row comparison, push it directly to the output array */
     958        11749683 :         if (cur->sk_flags & SK_ROW_HEADER)
     959 UBC           0 :         {
     960               0 :             ScanKey     outkey = &outkeys[new_numberOfKeys++];
     961                 : 
     962               0 :             memcpy(outkey, cur, sizeof(ScanKeyData));
     963               0 :             if (numberOfEqualCols == attno - 1)
     964               0 :                 _bt_mark_scankey_required(outkey);
     965                 : 
     966                 :             /*
     967                 :              * We don't support RowCompare using equality; such a qual would
     968                 :              * mess up the numberOfEqualCols tracking.
     969                 :              */
     970               0 :             Assert(j != (BTEqualStrategyNumber - 1));
     971               0 :             continue;
     972                 :         }
     973                 : 
     974                 :         /* have we seen one of these before? */
     975 CBC    11749683 :         if (xform[j] == NULL)
     976                 :         {
     977                 :             /* nope, so remember this scankey */
     978        11749650 :             xform[j] = cur;
     979                 :         }
     980                 :         else
     981                 :         {
     982                 :             /* yup, keep only the more restrictive key */
     983              33 :             if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
     984                 :                                          &test_result))
     985                 :             {
     986              33 :                 if (test_result)
     987              30 :                     xform[j] = cur;
     988               3 :                 else if (j == (BTEqualStrategyNumber - 1))
     989                 :                 {
     990                 :                     /* key == a && key == b, but a != b */
     991 UBC           0 :                     so->qual_ok = false;
     992               0 :                     return;
     993                 :                 }
     994                 :                 /* else old key is more restrictive, keep it */
     995                 :             }
     996                 :             else
     997                 :             {
     998                 :                 /*
     999                 :                  * We can't determine which key is more restrictive.  Keep the
    1000                 :                  * previous one in xform[j] and push this one directly to the
    1001                 :                  * output array.
    1002                 :                  */
    1003               0 :                 ScanKey     outkey = &outkeys[new_numberOfKeys++];
    1004                 : 
    1005               0 :                 memcpy(outkey, cur, sizeof(ScanKeyData));
    1006               0 :                 if (numberOfEqualCols == attno - 1)
    1007               0 :                     _bt_mark_scankey_required(outkey);
    1008                 :             }
    1009                 :         }
    1010                 :     }
    1011                 : 
    1012 CBC     4920839 :     so->numberOfKeys = new_numberOfKeys;
    1013                 : }
    1014                 : 
    1015                 : /*
    1016                 :  * Compare two scankey values using a specified operator.
    1017                 :  *
    1018                 :  * The test we want to perform is logically "leftarg op rightarg", where
    1019                 :  * leftarg and rightarg are the sk_argument values in those ScanKeys, and
    1020                 :  * the comparison operator is the one in the op ScanKey.  However, in
    1021                 :  * cross-data-type situations we may need to look up the correct operator in
    1022                 :  * the index's opfamily: it is the one having amopstrategy = op->sk_strategy
    1023                 :  * and amoplefttype/amoprighttype equal to the two argument datatypes.
    1024                 :  *
    1025                 :  * If the opfamily doesn't supply a complete set of cross-type operators we
    1026                 :  * may not be able to make the comparison.  If we can make the comparison
    1027                 :  * we store the operator result in *result and return true.  We return false
    1028                 :  * if the comparison could not be made.
    1029                 :  *
    1030                 :  * Note: op always points at the same ScanKey as either leftarg or rightarg.
    1031                 :  * Since we don't scribble on the scankeys, this aliasing should cause no
    1032                 :  * trouble.
    1033                 :  *
    1034                 :  * Note: this routine needs to be insensitive to any DESC option applied
    1035                 :  * to the index column.  For example, "x < 4" is a tighter constraint than
    1036                 :  * "x < 5" regardless of which way the index is sorted.
    1037                 :  */
    1038                 : static bool
    1039              98 : _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
    1040                 :                          ScanKey leftarg, ScanKey rightarg,
    1041                 :                          bool *result)
    1042                 : {
    1043              98 :     Relation    rel = scan->indexRelation;
    1044                 :     Oid         lefttype,
    1045                 :                 righttype,
    1046                 :                 optype,
    1047                 :                 opcintype,
    1048                 :                 cmp_op;
    1049                 :     StrategyNumber strat;
    1050                 : 
    1051                 :     /*
    1052                 :      * First, deal with cases where one or both args are NULL.  This should
    1053                 :      * only happen when the scankeys represent IS NULL/NOT NULL conditions.
    1054                 :      */
    1055              98 :     if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL)
    1056                 :     {
    1057                 :         bool        leftnull,
    1058                 :                     rightnull;
    1059                 : 
    1060              55 :         if (leftarg->sk_flags & SK_ISNULL)
    1061                 :         {
    1062               3 :             Assert(leftarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
    1063               3 :             leftnull = true;
    1064                 :         }
    1065                 :         else
    1066              52 :             leftnull = false;
    1067              55 :         if (rightarg->sk_flags & SK_ISNULL)
    1068                 :         {
    1069              52 :             Assert(rightarg->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL));
    1070              52 :             rightnull = true;
    1071                 :         }
    1072                 :         else
    1073               3 :             rightnull = false;
    1074                 : 
    1075                 :         /*
    1076                 :          * We treat NULL as either greater than or less than all other values.
    1077                 :          * Since true > false, the tests below work correctly for NULLS LAST
    1078                 :          * logic.  If the index is NULLS FIRST, we need to flip the strategy.
    1079                 :          */
    1080              55 :         strat = op->sk_strategy;
    1081              55 :         if (op->sk_flags & SK_BT_NULLS_FIRST)
    1082 UBC           0 :             strat = BTCommuteStrategyNumber(strat);
    1083                 : 
    1084 CBC          55 :         switch (strat)
    1085                 :         {
    1086              55 :             case BTLessStrategyNumber:
    1087              55 :                 *result = (leftnull < rightnull);
    1088              55 :                 break;
    1089 UBC           0 :             case BTLessEqualStrategyNumber:
    1090               0 :                 *result = (leftnull <= rightnull);
    1091               0 :                 break;
    1092               0 :             case BTEqualStrategyNumber:
    1093               0 :                 *result = (leftnull == rightnull);
    1094               0 :                 break;
    1095               0 :             case BTGreaterEqualStrategyNumber:
    1096               0 :                 *result = (leftnull >= rightnull);
    1097               0 :                 break;
    1098               0 :             case BTGreaterStrategyNumber:
    1099               0 :                 *result = (leftnull > rightnull);
    1100               0 :                 break;
    1101               0 :             default:
    1102               0 :                 elog(ERROR, "unrecognized StrategyNumber: %d", (int) strat);
    1103                 :                 *result = false;    /* keep compiler quiet */
    1104                 :                 break;
    1105                 :         }
    1106 CBC          55 :         return true;
    1107                 :     }
    1108                 : 
    1109                 :     /*
    1110                 :      * The opfamily we need to worry about is identified by the index column.
    1111                 :      */
    1112              43 :     Assert(leftarg->sk_attno == rightarg->sk_attno);
    1113                 : 
    1114              43 :     opcintype = rel->rd_opcintype[leftarg->sk_attno - 1];
    1115                 : 
    1116                 :     /*
    1117                 :      * Determine the actual datatypes of the ScanKey arguments.  We have to
    1118                 :      * support the convention that sk_subtype == InvalidOid means the opclass
    1119                 :      * input type; this is a hack to simplify life for ScanKeyInit().
    1120                 :      */
    1121              43 :     lefttype = leftarg->sk_subtype;
    1122              43 :     if (lefttype == InvalidOid)
    1123 UBC           0 :         lefttype = opcintype;
    1124 CBC          43 :     righttype = rightarg->sk_subtype;
    1125              43 :     if (righttype == InvalidOid)
    1126 UBC           0 :         righttype = opcintype;
    1127 CBC          43 :     optype = op->sk_subtype;
    1128              43 :     if (optype == InvalidOid)
    1129 UBC           0 :         optype = opcintype;
    1130                 : 
    1131                 :     /*
    1132                 :      * If leftarg and rightarg match the types expected for the "op" scankey,
    1133                 :      * we can use its already-looked-up comparison function.
    1134                 :      */
    1135 CBC          43 :     if (lefttype == opcintype && righttype == optype)
    1136                 :     {
    1137              43 :         *result = DatumGetBool(FunctionCall2Coll(&op->sk_func,
    1138                 :                                                  op->sk_collation,
    1139                 :                                                  leftarg->sk_argument,
    1140                 :                                                  rightarg->sk_argument));
    1141              43 :         return true;
    1142                 :     }
    1143                 : 
    1144                 :     /*
    1145                 :      * Otherwise, we need to go to the syscache to find the appropriate
    1146                 :      * operator.  (This cannot result in infinite recursion, since no
    1147                 :      * indexscan initiated by syscache lookup will use cross-data-type
    1148                 :      * operators.)
    1149                 :      *
    1150                 :      * If the sk_strategy was flipped by _bt_fix_scankey_strategy, we have to
    1151                 :      * un-flip it to get the correct opfamily member.
    1152                 :      */
    1153 UBC           0 :     strat = op->sk_strategy;
    1154               0 :     if (op->sk_flags & SK_BT_DESC)
    1155               0 :         strat = BTCommuteStrategyNumber(strat);
    1156                 : 
    1157               0 :     cmp_op = get_opfamily_member(rel->rd_opfamily[leftarg->sk_attno - 1],
    1158                 :                                  lefttype,
    1159                 :                                  righttype,
    1160                 :                                  strat);
    1161               0 :     if (OidIsValid(cmp_op))
    1162                 :     {
    1163               0 :         RegProcedure cmp_proc = get_opcode(cmp_op);
    1164                 : 
    1165               0 :         if (RegProcedureIsValid(cmp_proc))
    1166                 :         {
    1167               0 :             *result = DatumGetBool(OidFunctionCall2Coll(cmp_proc,
    1168                 :                                                         op->sk_collation,
    1169                 :                                                         leftarg->sk_argument,
    1170                 :                                                         rightarg->sk_argument));
    1171               0 :             return true;
    1172                 :         }
    1173                 :     }
    1174                 : 
    1175                 :     /* Can't make the comparison */
    1176               0 :     *result = false;            /* suppress compiler warnings */
    1177               0 :     return false;
    1178                 : }
    1179                 : 
    1180                 : /*
    1181                 :  * Adjust a scankey's strategy and flags setting as needed for indoptions.
    1182                 :  *
    1183                 :  * We copy the appropriate indoption value into the scankey sk_flags
    1184                 :  * (shifting to avoid clobbering system-defined flag bits).  Also, if
    1185                 :  * the DESC option is set, commute (flip) the operator strategy number.
    1186                 :  *
    1187                 :  * A secondary purpose is to check for IS NULL/NOT NULL scankeys and set up
    1188                 :  * the strategy field correctly for them.
    1189                 :  *
    1190                 :  * Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a
    1191                 :  * NULL comparison value.  Since all btree operators are assumed strict,
    1192                 :  * a NULL means that the qual cannot be satisfied.  We return true if the
    1193                 :  * comparison value isn't NULL, or false if the scan should be abandoned.
    1194                 :  *
    1195                 :  * This function is applied to the *input* scankey structure; therefore
    1196                 :  * on a rescan we will be looking at already-processed scankeys.  Hence
    1197                 :  * we have to be careful not to re-commute the strategy if we already did it.
    1198                 :  * It's a bit ugly to modify the caller's copy of the scankey but in practice
    1199                 :  * there shouldn't be any problem, since the index's indoptions are certainly
    1200                 :  * not going to change while the scankey survives.
    1201                 :  */
    1202                 : static bool
    1203 CBC    15912064 : _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
    1204                 : {
    1205                 :     int         addflags;
    1206                 : 
    1207        15912064 :     addflags = indoption[skey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
    1208                 : 
    1209                 :     /*
    1210                 :      * We treat all btree operators as strict (even if they're not so marked
    1211                 :      * in pg_proc). This means that it is impossible for an operator condition
    1212                 :      * with a NULL comparison constant to succeed, and we can reject it right
    1213                 :      * away.
    1214                 :      *
    1215                 :      * However, we now also support "x IS NULL" clauses as search conditions,
    1216                 :      * so in that case keep going. The planner has not filled in any
    1217                 :      * particular strategy in this case, so set it to BTEqualStrategyNumber
    1218                 :      * --- we can treat IS NULL as an equality operator for purposes of search
    1219                 :      * strategy.
    1220                 :      *
    1221                 :      * Likewise, "x IS NOT NULL" is supported.  We treat that as either "less
    1222                 :      * than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
    1223                 :      * FIRST index.
    1224                 :      *
    1225                 :      * Note: someday we might have to fill in sk_collation from the index
    1226                 :      * column's collation.  At the moment this is a non-issue because we'll
    1227                 :      * never actually call the comparison operator on a NULL.
    1228                 :      */
    1229        15912064 :     if (skey->sk_flags & SK_ISNULL)
    1230                 :     {
    1231                 :         /* SK_ISNULL shouldn't be set in a row header scankey */
    1232           48245 :         Assert(!(skey->sk_flags & SK_ROW_HEADER));
    1233                 : 
    1234                 :         /* Set indoption flags in scankey (might be done already) */
    1235           48245 :         skey->sk_flags |= addflags;
    1236                 : 
    1237                 :         /* Set correct strategy for IS NULL or NOT NULL search */
    1238           48245 :         if (skey->sk_flags & SK_SEARCHNULL)
    1239                 :         {
    1240              67 :             skey->sk_strategy = BTEqualStrategyNumber;
    1241              67 :             skey->sk_subtype = InvalidOid;
    1242              67 :             skey->sk_collation = InvalidOid;
    1243                 :         }
    1244           48178 :         else if (skey->sk_flags & SK_SEARCHNOTNULL)
    1245                 :         {
    1246           46904 :             if (skey->sk_flags & SK_BT_NULLS_FIRST)
    1247              18 :                 skey->sk_strategy = BTGreaterStrategyNumber;
    1248                 :             else
    1249           46886 :                 skey->sk_strategy = BTLessStrategyNumber;
    1250           46904 :             skey->sk_subtype = InvalidOid;
    1251           46904 :             skey->sk_collation = InvalidOid;
    1252                 :         }
    1253                 :         else
    1254                 :         {
    1255                 :             /* regular qual, so it cannot be satisfied */
    1256            1274 :             return false;
    1257                 :         }
    1258                 : 
    1259                 :         /* Needn't do the rest */
    1260           46971 :         return true;
    1261                 :     }
    1262                 : 
    1263                 :     /* Adjust strategy for DESC, if we didn't already */
    1264        15863819 :     if ((addflags & SK_BT_DESC) && !(skey->sk_flags & SK_BT_DESC))
    1265 UBC           0 :         skey->sk_strategy = BTCommuteStrategyNumber(skey->sk_strategy);
    1266 CBC    15863819 :     skey->sk_flags |= addflags;
    1267                 : 
    1268                 :     /* If it's a row header, fix row member flags and strategies similarly */
    1269        15863819 :     if (skey->sk_flags & SK_ROW_HEADER)
    1270                 :     {
    1271              18 :         ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
    1272                 : 
    1273                 :         for (;;)
    1274                 :         {
    1275              36 :             Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1276              36 :             addflags = indoption[subkey->sk_attno - 1] << SK_BT_INDOPTION_SHIFT;
    1277              36 :             if ((addflags & SK_BT_DESC) && !(subkey->sk_flags & SK_BT_DESC))
    1278 UBC           0 :                 subkey->sk_strategy = BTCommuteStrategyNumber(subkey->sk_strategy);
    1279 CBC          36 :             subkey->sk_flags |= addflags;
    1280              36 :             if (subkey->sk_flags & SK_ROW_END)
    1281              18 :                 break;
    1282              18 :             subkey++;
    1283                 :         }
    1284                 :     }
    1285                 : 
    1286        15863819 :     return true;
    1287                 : }
    1288                 : 
    1289                 : /*
    1290                 :  * Mark a scankey as "required to continue the scan".
    1291                 :  *
    1292                 :  * Depending on the operator type, the key may be required for both scan
    1293                 :  * directions or just one.  Also, if the key is a row comparison header,
    1294                 :  * we have to mark its first subsidiary ScanKey as required.  (Subsequent
    1295                 :  * subsidiary ScanKeys are normally for lower-order columns, and thus
    1296                 :  * cannot be required, since they're after the first non-equality scankey.)
    1297                 :  *
    1298                 :  * Note: when we set required-key flag bits in a subsidiary scankey, we are
    1299                 :  * scribbling on a data structure belonging to the index AM's caller, not on
    1300                 :  * our private copy.  This should be OK because the marking will not change
    1301                 :  * from scan to scan within a query, and so we'd just re-mark the same way
    1302                 :  * anyway on a rescan.  Something to keep an eye on though.
    1303                 :  */
    1304                 : static void
    1305        15909850 : _bt_mark_scankey_required(ScanKey skey)
    1306                 : {
    1307                 :     int         addflags;
    1308                 : 
    1309        15909850 :     switch (skey->sk_strategy)
    1310                 :     {
    1311           48298 :         case BTLessStrategyNumber:
    1312                 :         case BTLessEqualStrategyNumber:
    1313           48298 :             addflags = SK_BT_REQFWD;
    1314           48298 :             break;
    1315        15173879 :         case BTEqualStrategyNumber:
    1316        15173879 :             addflags = SK_BT_REQFWD | SK_BT_REQBKWD;
    1317        15173879 :             break;
    1318          687673 :         case BTGreaterEqualStrategyNumber:
    1319                 :         case BTGreaterStrategyNumber:
    1320          687673 :             addflags = SK_BT_REQBKWD;
    1321          687673 :             break;
    1322 UBC           0 :         default:
    1323               0 :             elog(ERROR, "unrecognized StrategyNumber: %d",
    1324                 :                  (int) skey->sk_strategy);
    1325                 :             addflags = 0;       /* keep compiler quiet */
    1326                 :             break;
    1327                 :     }
    1328                 : 
    1329 CBC    15909850 :     skey->sk_flags |= addflags;
    1330                 : 
    1331        15909850 :     if (skey->sk_flags & SK_ROW_HEADER)
    1332                 :     {
    1333              18 :         ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
    1334                 : 
    1335                 :         /* First subkey should be same column/operator as the header */
    1336              18 :         Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1337              18 :         Assert(subkey->sk_attno == skey->sk_attno);
    1338              18 :         Assert(subkey->sk_strategy == skey->sk_strategy);
    1339              18 :         subkey->sk_flags |= addflags;
    1340                 :     }
    1341        15909850 : }
    1342                 : 
    1343                 : /*
    1344                 :  * Test whether an indextuple satisfies all the scankey conditions.
    1345                 :  *
    1346                 :  * Return true if so, false if not.  If the tuple fails to pass the qual,
    1347                 :  * we also determine whether there's any need to continue the scan beyond
    1348                 :  * this tuple, and set *continuescan accordingly.  See comments for
    1349                 :  * _bt_preprocess_keys(), above, about how this is done.
    1350                 :  *
    1351                 :  * Forward scan callers can pass a high key tuple in the hopes of having
    1352                 :  * us set *continuescan to false, and avoiding an unnecessary visit to
    1353                 :  * the page to the right.
    1354                 :  *
    1355                 :  * scan: index scan descriptor (containing a search-type scankey)
    1356                 :  * tuple: index tuple to test
    1357                 :  * tupnatts: number of attributes in tupnatts (high key may be truncated)
    1358                 :  * dir: direction we are scanning in
    1359                 :  * continuescan: output parameter (will be set correctly in all cases)
    1360                 :  */
    1361                 : bool
    1362        30729387 : _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts,
    1363                 :               ScanDirection dir, bool *continuescan)
    1364                 : {
    1365                 :     TupleDesc   tupdesc;
    1366                 :     BTScanOpaque so;
    1367                 :     int         keysz;
    1368                 :     int         ikey;
    1369                 :     ScanKey     key;
    1370                 : 
    1371        30729387 :     Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts);
    1372                 : 
    1373        30729387 :     *continuescan = true;       /* default assumption */
    1374                 : 
    1375        30729387 :     tupdesc = RelationGetDescr(scan->indexRelation);
    1376        30729387 :     so = (BTScanOpaque) scan->opaque;
    1377        30729387 :     keysz = so->numberOfKeys;
    1378                 : 
    1379        60105367 :     for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
    1380                 :     {
    1381                 :         Datum       datum;
    1382                 :         bool        isNull;
    1383                 :         Datum       test;
    1384                 : 
    1385        35695871 :         if (key->sk_attno > tupnatts)
    1386                 :         {
    1387                 :             /*
    1388                 :              * This attribute is truncated (must be high key).  The value for
    1389                 :              * this attribute in the first non-pivot tuple on the page to the
    1390                 :              * right could be any possible value.  Assume that truncated
    1391                 :              * attribute passes the qual.
    1392                 :              */
    1393             949 :             Assert(ScanDirectionIsForward(dir));
    1394             949 :             Assert(BTreeTupleIsPivot(tuple));
    1395         8580342 :             continue;
    1396                 :         }
    1397                 : 
    1398                 :         /* row-comparison keys need special processing */
    1399        35694922 :         if (key->sk_flags & SK_ROW_HEADER)
    1400                 :         {
    1401             990 :             if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
    1402                 :                                      continuescan))
    1403             963 :                 continue;
    1404         6319891 :             return false;
    1405                 :         }
    1406                 : 
    1407        35693932 :         datum = index_getattr(tuple,
    1408        35693932 :                               key->sk_attno,
    1409                 :                               tupdesc,
    1410                 :                               &isNull);
    1411                 : 
    1412        35693932 :         if (key->sk_flags & SK_ISNULL)
    1413                 :         {
    1414                 :             /* Handle IS NULL/NOT NULL tests */
    1415         8602466 :             if (key->sk_flags & SK_SEARCHNULL)
    1416                 :             {
    1417           24073 :                 if (isNull)
    1418              67 :                     continue;   /* tuple satisfies this qual */
    1419                 :             }
    1420                 :             else
    1421                 :             {
    1422         8578393 :                 Assert(key->sk_flags & SK_SEARCHNOTNULL);
    1423         8578393 :                 if (!isNull)
    1424         8578363 :                     continue;   /* tuple satisfies this qual */
    1425                 :             }
    1426                 : 
    1427                 :             /*
    1428                 :              * Tuple fails this qual.  If it's a required qual for the current
    1429                 :              * scan direction, then we can conclude no further tuples will
    1430                 :              * pass, either.
    1431                 :              */
    1432           24036 :             if ((key->sk_flags & SK_BT_REQFWD) &&
    1433                 :                 ScanDirectionIsForward(dir))
    1434              12 :                 *continuescan = false;
    1435           24024 :             else if ((key->sk_flags & SK_BT_REQBKWD) &&
    1436                 :                      ScanDirectionIsBackward(dir))
    1437 UBC           0 :                 *continuescan = false;
    1438                 : 
    1439                 :             /*
    1440                 :              * In any case, this indextuple doesn't match the qual.
    1441                 :              */
    1442 CBC       24036 :             return false;
    1443                 :         }
    1444                 : 
    1445        27091466 :         if (isNull)
    1446                 :         {
    1447              66 :             if (key->sk_flags & SK_BT_NULLS_FIRST)
    1448                 :             {
    1449                 :                 /*
    1450                 :                  * Since NULLs are sorted before non-NULLs, we know we have
    1451                 :                  * reached the lower limit of the range of values for this
    1452                 :                  * index attr.  On a backward scan, we can stop if this qual
    1453                 :                  * is one of the "must match" subset.  We can stop regardless
    1454                 :                  * of whether the qual is > or <, so long as it's required,
    1455                 :                  * because it's not possible for any future tuples to pass. On
    1456                 :                  * a forward scan, however, we must keep going, because we may
    1457                 :                  * have initially positioned to the start of the index.
    1458                 :                  */
    1459 UBC           0 :                 if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
    1460                 :                     ScanDirectionIsBackward(dir))
    1461               0 :                     *continuescan = false;
    1462                 :             }
    1463                 :             else
    1464                 :             {
    1465                 :                 /*
    1466                 :                  * Since NULLs are sorted after non-NULLs, we know we have
    1467                 :                  * reached the upper limit of the range of values for this
    1468                 :                  * index attr.  On a forward scan, we can stop if this qual is
    1469                 :                  * one of the "must match" subset.  We can stop regardless of
    1470                 :                  * whether the qual is > or <, so long as it's required,
    1471                 :                  * because it's not possible for any future tuples to pass. On
    1472                 :                  * a backward scan, however, we must keep going, because we
    1473                 :                  * may have initially positioned to the end of the index.
    1474                 :                  */
    1475 CBC          66 :                 if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
    1476                 :                     ScanDirectionIsForward(dir))
    1477              42 :                     *continuescan = false;
    1478                 :             }
    1479                 : 
    1480                 :             /*
    1481                 :              * In any case, this indextuple doesn't match the qual.
    1482                 :              */
    1483              66 :             return false;
    1484                 :         }
    1485                 : 
    1486        27091400 :         test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
    1487                 :                                  datum, key->sk_argument);
    1488                 : 
    1489        27091400 :         if (!DatumGetBool(test))
    1490                 :         {
    1491                 :             /*
    1492                 :              * Tuple fails this qual.  If it's a required qual for the current
    1493                 :              * scan direction, then we can conclude no further tuples will
    1494                 :              * pass, either.
    1495                 :              *
    1496                 :              * Note: because we stop the scan as soon as any required equality
    1497                 :              * qual fails, it is critical that equality quals be used for the
    1498                 :              * initial positioning in _bt_first() when they are available. See
    1499                 :              * comments in _bt_first().
    1500                 :              */
    1501         6295762 :             if ((key->sk_flags & SK_BT_REQFWD) &&
    1502                 :                 ScanDirectionIsForward(dir))
    1503         6147307 :                 *continuescan = false;
    1504          148455 :             else if ((key->sk_flags & SK_BT_REQBKWD) &&
    1505                 :                      ScanDirectionIsBackward(dir))
    1506              42 :                 *continuescan = false;
    1507                 : 
    1508                 :             /*
    1509                 :              * In any case, this indextuple doesn't match the qual.
    1510                 :              */
    1511         6295762 :             return false;
    1512                 :         }
    1513                 :     }
    1514                 : 
    1515                 :     /* If we get here, the tuple passes all index quals. */
    1516        24409496 :     return true;
    1517                 : }
    1518                 : 
    1519                 : /*
    1520                 :  * Test whether an indextuple satisfies a row-comparison scan condition.
    1521                 :  *
    1522                 :  * Return true if so, false if not.  If not, also clear *continuescan if
    1523                 :  * it's not possible for any future tuples in the current scan direction
    1524                 :  * to pass the qual.
    1525                 :  *
    1526                 :  * This is a subroutine for _bt_checkkeys, which see for more info.
    1527                 :  */
    1528                 : static bool
    1529             990 : _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
    1530                 :                      TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
    1531                 : {
    1532             990 :     ScanKey     subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
    1533             990 :     int32       cmpresult = 0;
    1534                 :     bool        result;
    1535                 : 
    1536                 :     /* First subkey should be same as the header says */
    1537             990 :     Assert(subkey->sk_attno == skey->sk_attno);
    1538                 : 
    1539                 :     /* Loop over columns of the row condition */
    1540                 :     for (;;)
    1541              78 :     {
    1542                 :         Datum       datum;
    1543                 :         bool        isNull;
    1544                 : 
    1545            1068 :         Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1546                 : 
    1547            1068 :         if (subkey->sk_attno > tupnatts)
    1548                 :         {
    1549                 :             /*
    1550                 :              * This attribute is truncated (must be high key).  The value for
    1551                 :              * this attribute in the first non-pivot tuple on the page to the
    1552                 :              * right could be any possible value.  Assume that truncated
    1553                 :              * attribute passes the qual.
    1554                 :              */
    1555               3 :             Assert(ScanDirectionIsForward(dir));
    1556               3 :             Assert(BTreeTupleIsPivot(tuple));
    1557               3 :             cmpresult = 0;
    1558               3 :             if (subkey->sk_flags & SK_ROW_END)
    1559               3 :                 break;
    1560 UBC           0 :             subkey++;
    1561               0 :             continue;
    1562                 :         }
    1563                 : 
    1564 CBC        1065 :         datum = index_getattr(tuple,
    1565            1065 :                               subkey->sk_attno,
    1566                 :                               tupdesc,
    1567                 :                               &isNull);
    1568                 : 
    1569            1065 :         if (isNull)
    1570                 :         {
    1571              24 :             if (subkey->sk_flags & SK_BT_NULLS_FIRST)
    1572                 :             {
    1573                 :                 /*
    1574                 :                  * Since NULLs are sorted before non-NULLs, we know we have
    1575                 :                  * reached the lower limit of the range of values for this
    1576                 :                  * index attr.  On a backward scan, we can stop if this qual
    1577                 :                  * is one of the "must match" subset.  We can stop regardless
    1578                 :                  * of whether the qual is > or <, so long as it's required,
    1579                 :                  * because it's not possible for any future tuples to pass. On
    1580                 :                  * a forward scan, however, we must keep going, because we may
    1581                 :                  * have initially positioned to the start of the index.
    1582                 :                  */
    1583 UBC           0 :                 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
    1584                 :                     ScanDirectionIsBackward(dir))
    1585               0 :                     *continuescan = false;
    1586                 :             }
    1587                 :             else
    1588                 :             {
    1589                 :                 /*
    1590                 :                  * Since NULLs are sorted after non-NULLs, we know we have
    1591                 :                  * reached the upper limit of the range of values for this
    1592                 :                  * index attr.  On a forward scan, we can stop if this qual is
    1593                 :                  * one of the "must match" subset.  We can stop regardless of
    1594                 :                  * whether the qual is > or <, so long as it's required,
    1595                 :                  * because it's not possible for any future tuples to pass. On
    1596                 :                  * a backward scan, however, we must keep going, because we
    1597                 :                  * may have initially positioned to the end of the index.
    1598                 :                  */
    1599 CBC          24 :                 if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
    1600                 :                     ScanDirectionIsForward(dir))
    1601 UBC           0 :                     *continuescan = false;
    1602                 :             }
    1603                 : 
    1604                 :             /*
    1605                 :              * In any case, this indextuple doesn't match the qual.
    1606                 :              */
    1607 CBC          24 :             return false;
    1608                 :         }
    1609                 : 
    1610            1041 :         if (subkey->sk_flags & SK_ISNULL)
    1611                 :         {
    1612                 :             /*
    1613                 :              * Unlike the simple-scankey case, this isn't a disallowed case.
    1614                 :              * But it can never match.  If all the earlier row comparison
    1615                 :              * columns are required for the scan direction, we can stop the
    1616                 :              * scan, because there can't be another tuple that will succeed.
    1617                 :              */
    1618 UBC           0 :             if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument))
    1619               0 :                 subkey--;
    1620               0 :             if ((subkey->sk_flags & SK_BT_REQFWD) &&
    1621                 :                 ScanDirectionIsForward(dir))
    1622               0 :                 *continuescan = false;
    1623               0 :             else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
    1624                 :                      ScanDirectionIsBackward(dir))
    1625               0 :                 *continuescan = false;
    1626               0 :             return false;
    1627                 :         }
    1628                 : 
    1629                 :         /* Perform the test --- three-way comparison not bool operator */
    1630 CBC        1041 :         cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func,
    1631                 :                                                     subkey->sk_collation,
    1632                 :                                                     datum,
    1633                 :                                                     subkey->sk_argument));
    1634                 : 
    1635            1041 :         if (subkey->sk_flags & SK_BT_DESC)
    1636 UBC           0 :             INVERT_COMPARE_RESULT(cmpresult);
    1637                 : 
    1638                 :         /* Done comparing if unequal, else advance to next column */
    1639 CBC        1041 :         if (cmpresult != 0)
    1640             963 :             break;
    1641                 : 
    1642              78 :         if (subkey->sk_flags & SK_ROW_END)
    1643 UBC           0 :             break;
    1644 CBC          78 :         subkey++;
    1645                 :     }
    1646                 : 
    1647                 :     /*
    1648                 :      * At this point cmpresult indicates the overall result of the row
    1649                 :      * comparison, and subkey points to the deciding column (or the last
    1650                 :      * column if the result is "=").
    1651                 :      */
    1652             966 :     switch (subkey->sk_strategy)
    1653                 :     {
    1654                 :             /* EQ and NE cases aren't allowed here */
    1655 UBC           0 :         case BTLessStrategyNumber:
    1656               0 :             result = (cmpresult < 0);
    1657               0 :             break;
    1658 CBC         795 :         case BTLessEqualStrategyNumber:
    1659             795 :             result = (cmpresult <= 0);
    1660             795 :             break;
    1661             120 :         case BTGreaterEqualStrategyNumber:
    1662             120 :             result = (cmpresult >= 0);
    1663             120 :             break;
    1664              51 :         case BTGreaterStrategyNumber:
    1665              51 :             result = (cmpresult > 0);
    1666              51 :             break;
    1667 UBC           0 :         default:
    1668               0 :             elog(ERROR, "unrecognized RowCompareType: %d",
    1669                 :                  (int) subkey->sk_strategy);
    1670                 :             result = 0;         /* keep compiler quiet */
    1671                 :             break;
    1672                 :     }
    1673                 : 
    1674 CBC         966 :     if (!result)
    1675                 :     {
    1676                 :         /*
    1677                 :          * Tuple fails this qual.  If it's a required qual for the current
    1678                 :          * scan direction, then we can conclude no further tuples will pass,
    1679                 :          * either.  Note we have to look at the deciding column, not
    1680                 :          * necessarily the first or last column of the row condition.
    1681                 :          */
    1682               3 :         if ((subkey->sk_flags & SK_BT_REQFWD) &&
    1683                 :             ScanDirectionIsForward(dir))
    1684               3 :             *continuescan = false;
    1685 UBC           0 :         else if ((subkey->sk_flags & SK_BT_REQBKWD) &&
    1686                 :                  ScanDirectionIsBackward(dir))
    1687               0 :             *continuescan = false;
    1688                 :     }
    1689                 : 
    1690 CBC         966 :     return result;
    1691                 : }
    1692                 : 
    1693                 : /*
    1694                 :  * _bt_killitems - set LP_DEAD state for items an indexscan caller has
    1695                 :  * told us were killed
    1696                 :  *
    1697                 :  * scan->opaque, referenced locally through so, contains information about the
    1698                 :  * current page and killed tuples thereon (generally, this should only be
    1699                 :  * called if so->numKilled > 0).
    1700                 :  *
    1701                 :  * The caller does not have a lock on the page and may or may not have the
    1702                 :  * page pinned in a buffer.  Note that read-lock is sufficient for setting
    1703                 :  * LP_DEAD status (which is only a hint).
    1704                 :  *
    1705                 :  * We match items by heap TID before assuming they are the right ones to
    1706                 :  * delete.  We cope with cases where items have moved right due to insertions.
    1707                 :  * If an item has moved off the current page due to a split, we'll fail to
    1708                 :  * find it and do nothing (this is not an error case --- we assume the item
    1709                 :  * will eventually get marked in a future indexscan).
    1710                 :  *
    1711                 :  * Note that if we hold a pin on the target page continuously from initially
    1712                 :  * reading the items until applying this function, VACUUM cannot have deleted
    1713                 :  * any items from the page, and so there is no need to search left from the
    1714                 :  * recorded offset.  (This observation also guarantees that the item is still
    1715                 :  * the right one to delete, which might otherwise be questionable since heap
    1716                 :  * TIDs can get recycled.)  This holds true even if the page has been modified
    1717                 :  * by inserts and page splits, so there is no need to consult the LSN.
    1718                 :  *
    1719                 :  * If the pin was released after reading the page, then we re-read it.  If it
    1720                 :  * has been modified since we read it (as determined by the LSN), we dare not
    1721                 :  * flag any entries because it is possible that the old entry was vacuumed
    1722                 :  * away and the TID was re-used by a completely different heap tuple.
    1723                 :  */
    1724                 : void
    1725          118090 : _bt_killitems(IndexScanDesc scan)
    1726                 : {
    1727          118090 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1728                 :     Page        page;
    1729                 :     BTPageOpaque opaque;
    1730                 :     OffsetNumber minoff;
    1731                 :     OffsetNumber maxoff;
    1732                 :     int         i;
    1733          118090 :     int         numKilled = so->numKilled;
    1734          118090 :     bool        killedsomething = false;
    1735                 :     bool        droppedpin PG_USED_FOR_ASSERTS_ONLY;
    1736                 : 
    1737          118090 :     Assert(BTScanPosIsValid(so->currPos));
    1738                 : 
    1739                 :     /*
    1740                 :      * Always reset the scan state, so we don't look for same items on other
    1741                 :      * pages.
    1742                 :      */
    1743          118090 :     so->numKilled = 0;
    1744                 : 
    1745          118090 :     if (BTScanPosIsPinned(so->currPos))
    1746                 :     {
    1747                 :         /*
    1748                 :          * We have held the pin on this page since we read the index tuples,
    1749                 :          * so all we need to do is lock it.  The pin will have prevented
    1750                 :          * re-use of any TID on the page, so there is no need to check the
    1751                 :          * LSN.
    1752                 :          */
    1753           17408 :         droppedpin = false;
    1754           17408 :         _bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
    1755                 : 
    1756           17408 :         page = BufferGetPage(so->currPos.buf);
    1757                 :     }
    1758                 :     else
    1759                 :     {
    1760                 :         Buffer      buf;
    1761                 : 
    1762          100682 :         droppedpin = true;
    1763                 :         /* Attempt to re-read the buffer, getting pin and lock. */
    1764 GNC      100682 :         buf = _bt_getbuf(scan->indexRelation, scan->heapRelation,
    1765                 :                          so->currPos.currPage, BT_READ);
    1766                 : 
    1767 GIC      100682 :         page = BufferGetPage(buf);
    1768 CBC      100682 :         if (BufferGetLSNAtomic(buf) == so->currPos.lsn)
    1769          100648 :             so->currPos.buf = buf;
    1770 ECB             :         else
    1771                 :         {
    1772                 :             /* Modified while not pinned means hinting is not safe. */
    1773 GIC          34 :             _bt_relbuf(scan->indexRelation, buf);
    1774 CBC          34 :             return;
    1775 ECB             :         }
    1776                 :     }
    1777                 : 
    1778 GIC      118056 :     opaque = BTPageGetOpaque(page);
    1779 CBC      118056 :     minoff = P_FIRSTDATAKEY(opaque);
    1780          118056 :     maxoff = PageGetMaxOffsetNumber(page);
    1781 ECB             : 
    1782 GIC      380939 :     for (i = 0; i < numKilled; i++)
    1783 ECB             :     {
    1784 GIC      262883 :         int         itemIndex = so->killedItems[i];
    1785 CBC      262883 :         BTScanPosItem *kitem = &so->currPos.items[itemIndex];
    1786          262883 :         OffsetNumber offnum = kitem->indexOffset;
    1787 ECB             : 
    1788 GIC      262883 :         Assert(itemIndex >= so->currPos.firstItem &&
    1789 ECB             :                itemIndex <= so->currPos.lastItem);
    1790 GIC      262883 :         if (offnum < minoff)
    1791 LBC           0 :             continue;           /* pure paranoia */
    1792 GBC    10774381 :         while (offnum <= maxoff)
    1793 ECB             :         {
    1794 GIC    10707753 :             ItemId      iid = PageGetItemId(page, offnum);
    1795 CBC    10707753 :             IndexTuple  ituple = (IndexTuple) PageGetItem(page, iid);
    1796        10707753 :             bool        killtuple = false;
    1797 ECB             : 
    1798 GIC    10707753 :             if (BTreeTupleIsPosting(ituple))
    1799 ECB             :             {
    1800 GIC     3297359 :                 int         pi = i + 1;
    1801 CBC     3297359 :                 int         nposting = BTreeTupleGetNPosting(ituple);
    1802 ECB             :                 int         j;
    1803                 : 
    1804                 :                 /*
    1805                 :                  * We rely on the convention that heap TIDs in the scanpos
    1806                 :                  * items array are stored in ascending heap TID order for a
    1807                 :                  * group of TIDs that originally came from a posting list
    1808                 :                  * tuple.  This convention even applies during backwards
    1809                 :                  * scans, where returning the TIDs in descending order might
    1810                 :                  * seem more natural.  This is about effectiveness, not
    1811                 :                  * correctness.
    1812                 :                  *
    1813                 :                  * Note that the page may have been modified in almost any way
    1814                 :                  * since we first read it (in the !droppedpin case), so it's
    1815                 :                  * possible that this posting list tuple wasn't a posting list
    1816                 :                  * tuple when we first encountered its heap TIDs.
    1817                 :                  */
    1818 GIC     3365637 :                 for (j = 0; j < nposting; j++)
    1819 ECB             :                 {
    1820 GIC     3364038 :                     ItemPointer item = BTreeTupleGetPostingN(ituple, j);
    1821 ECB             : 
    1822 GIC     3364038 :                     if (!ItemPointerEquals(item, &kitem->heapTid))
    1823 CBC     3295760 :                         break;  /* out of posting list loop */
    1824 ECB             : 
    1825                 :                     /*
    1826                 :                      * kitem must have matching offnum when heap TIDs match,
    1827                 :                      * though only in the common case where the page can't
    1828                 :                      * have been concurrently modified
    1829                 :                      */
    1830 GIC       68278 :                     Assert(kitem->indexOffset == offnum || !droppedpin);
    1831 ECB             : 
    1832                 :                     /*
    1833                 :                      * Read-ahead to later kitems here.
    1834                 :                      *
    1835                 :                      * We rely on the assumption that not advancing kitem here
    1836                 :                      * will prevent us from considering the posting list tuple
    1837                 :                      * fully dead by not matching its next heap TID in next
    1838                 :                      * loop iteration.
    1839                 :                      *
    1840                 :                      * If, on the other hand, this is the final heap TID in
    1841                 :                      * the posting list tuple, then tuple gets killed
    1842                 :                      * regardless (i.e. we handle the case where the last
    1843                 :                      * kitem is also the last heap TID in the last index tuple
    1844                 :                      * correctly -- posting tuple still gets killed).
    1845                 :                      */
    1846 GIC       68278 :                     if (pi < numKilled)
    1847 CBC       22354 :                         kitem = &so->currPos.items[so->killedItems[pi++]];
    1848 ECB             :                 }
    1849                 : 
    1850                 :                 /*
    1851                 :                  * Don't bother advancing the outermost loop's int iterator to
    1852                 :                  * avoid processing killed items that relate to the same
    1853                 :                  * offnum/posting list tuple.  This micro-optimization hardly
    1854                 :                  * seems worth it.  (Further iterations of the outermost loop
    1855                 :                  * will fail to match on this same posting list's first heap
    1856                 :                  * TID instead, so we'll advance to the next offnum/index
    1857                 :                  * tuple pretty quickly.)
    1858                 :                  */
    1859 GIC     3297359 :                 if (j == nposting)
    1860 CBC        1599 :                     killtuple = true;
    1861 ECB             :             }
    1862 GIC     7410394 :             else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
    1863 CBC      194890 :                 killtuple = true;
    1864 ECB             : 
    1865                 :             /*
    1866                 :              * Mark index item as dead, if it isn't already.  Since this
    1867                 :              * happens while holding a buffer lock possibly in shared mode,
    1868                 :              * it's possible that multiple processes attempt to do this
    1869                 :              * simultaneously, leading to multiple full-page images being sent
    1870                 :              * to WAL (if wal_log_hints or data checksums are enabled), which
    1871                 :              * is undesirable.
    1872                 :              */
    1873 GIC    10707753 :             if (killtuple && !ItemIdIsDead(iid))
    1874 ECB             :             {
    1875                 :                 /* found the item/all posting list items */
    1876 GIC      196255 :                 ItemIdMarkDead(iid);
    1877 CBC      196255 :                 killedsomething = true;
    1878          196255 :                 break;          /* out of inner search loop */
    1879 ECB             :             }
    1880 GIC    10511498 :             offnum = OffsetNumberNext(offnum);
    1881 ECB             :         }
    1882                 :     }
    1883                 : 
    1884                 :     /*
    1885                 :      * Since this can be redone later if needed, mark as dirty hint.
    1886                 :      *
    1887                 :      * Whenever we mark anything LP_DEAD, we also set the page's
    1888                 :      * BTP_HAS_GARBAGE flag, which is likewise just a hint.  (Note that we
    1889                 :      * only rely on the page-level flag in !heapkeyspace indexes.)
    1890                 :      */
    1891 GIC      118056 :     if (killedsomething)
    1892 ECB             :     {
    1893 GIC       69968 :         opaque->btpo_flags |= BTP_HAS_GARBAGE;
    1894 CBC       69968 :         MarkBufferDirtyHint(so->currPos.buf, true);
    1895 ECB             :     }
    1896                 : 
    1897 GIC      118056 :     _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
    1898 ECB             : }
    1899                 : 
    1900                 : 
    1901                 : /*
    1902                 :  * The following routines manage a shared-memory area in which we track
    1903                 :  * assignment of "vacuum cycle IDs" to currently-active btree vacuuming
    1904                 :  * operations.  There is a single counter which increments each time we
    1905                 :  * start a vacuum to assign it a cycle ID.  Since multiple vacuums could
    1906                 :  * be active concurrently, we have to track the cycle ID for each active
    1907                 :  * vacuum; this requires at most MaxBackends entries (usually far fewer).
    1908                 :  * We assume at most one vacuum can be active for a given index.
    1909                 :  *
    1910                 :  * Access to the shared memory area is controlled by BtreeVacuumLock.
    1911                 :  * In principle we could use a separate lmgr locktag for each index,
    1912                 :  * but a single LWLock is much cheaper, and given the short time that
    1913                 :  * the lock is ever held, the concurrency hit should be minimal.
    1914                 :  */
    1915                 : 
    1916                 : typedef struct BTOneVacInfo
    1917                 : {
    1918                 :     LockRelId   relid;          /* global identifier of an index */
    1919                 :     BTCycleId   cycleid;        /* cycle ID for its active VACUUM */
    1920                 : } BTOneVacInfo;
    1921                 : 
    1922                 : typedef struct BTVacInfo
    1923                 : {
    1924                 :     BTCycleId   cycle_ctr;      /* cycle ID most recently assigned */
    1925                 :     int         num_vacuums;    /* number of currently active VACUUMs */
    1926                 :     int         max_vacuums;    /* allocated length of vacuums[] array */
    1927                 :     BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER];
    1928                 : } BTVacInfo;
    1929                 : 
    1930                 : static BTVacInfo *btvacinfo;
    1931                 : 
    1932                 : 
    1933                 : /*
    1934                 :  * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index,
    1935                 :  *      or zero if there is no active VACUUM
    1936                 :  *
    1937                 :  * Note: for correct interlocking, the caller must already hold pin and
    1938                 :  * exclusive lock on each buffer it will store the cycle ID into.  This
    1939                 :  * ensures that even if a VACUUM starts immediately afterwards, it cannot
    1940                 :  * process those pages until the page split is complete.
    1941                 :  */
    1942                 : BTCycleId
    1943 GIC       23479 : _bt_vacuum_cycleid(Relation rel)
    1944 ECB             : {
    1945 GIC       23479 :     BTCycleId   result = 0;
    1946 ECB             :     int         i;
    1947                 : 
    1948                 :     /* Share lock is enough since this is a read-only operation */
    1949 GIC       23479 :     LWLockAcquire(BtreeVacuumLock, LW_SHARED);
    1950 ECB             : 
    1951 GIC       23479 :     for (i = 0; i < btvacinfo->num_vacuums; i++)
    1952 ECB             :     {
    1953 UIC           0 :         BTOneVacInfo *vac = &btvacinfo->vacuums[i];
    1954 EUB             : 
    1955 UIC           0 :         if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
    1956 UBC           0 :             vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
    1957 EUB             :         {
    1958 UIC           0 :             result = vac->cycleid;
    1959 UBC           0 :             break;
    1960 EUB             :         }
    1961                 :     }
    1962                 : 
    1963 GIC       23479 :     LWLockRelease(BtreeVacuumLock);
    1964 CBC       23479 :     return result;
    1965 ECB             : }
    1966                 : 
    1967                 : /*
    1968                 :  * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation
    1969                 :  *
    1970                 :  * Note: the caller must guarantee that it will eventually call
    1971                 :  * _bt_end_vacuum, else we'll permanently leak an array slot.  To ensure
    1972                 :  * that this happens even in elog(FATAL) scenarios, the appropriate coding
    1973                 :  * is not just a PG_TRY, but
    1974                 :  *      PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel))
    1975                 :  */
    1976                 : BTCycleId
    1977 GIC        4011 : _bt_start_vacuum(Relation rel)
    1978 ECB             : {
    1979                 :     BTCycleId   result;
    1980                 :     int         i;
    1981                 :     BTOneVacInfo *vac;
    1982                 : 
    1983 GIC        4011 :     LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
    1984 ECB             : 
    1985                 :     /*
    1986                 :      * Assign the next cycle ID, being careful to avoid zero as well as the
    1987                 :      * reserved high values.
    1988                 :      */
    1989 GIC        4011 :     result = ++(btvacinfo->cycle_ctr);
    1990 CBC        4011 :     if (result == 0 || result > MAX_BT_CYCLE_ID)
    1991 LBC           0 :         result = btvacinfo->cycle_ctr = 1;
    1992 EUB             : 
    1993                 :     /* Let's just make sure there's no entry already for this index */
    1994 GIC        4012 :     for (i = 0; i < btvacinfo->num_vacuums; i++)
    1995 ECB             :     {
    1996 GIC           1 :         vac = &btvacinfo->vacuums[i];
    1997 CBC           1 :         if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
    1998 LBC           0 :             vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
    1999 EUB             :         {
    2000                 :             /*
    2001                 :              * Unlike most places in the backend, we have to explicitly
    2002                 :              * release our LWLock before throwing an error.  This is because
    2003                 :              * we expect _bt_end_vacuum() to be called before transaction
    2004                 :              * abort cleanup can run to release LWLocks.
    2005                 :              */
    2006 UIC           0 :             LWLockRelease(BtreeVacuumLock);
    2007 UBC           0 :             elog(ERROR, "multiple active vacuums for index \"%s\"",
    2008 EUB             :                  RelationGetRelationName(rel));
    2009                 :         }
    2010                 :     }
    2011                 : 
    2012                 :     /* OK, add an entry */
    2013 GIC        4011 :     if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
    2014 ECB             :     {
    2015 UIC           0 :         LWLockRelease(BtreeVacuumLock);
    2016 UBC           0 :         elog(ERROR, "out of btvacinfo slots");
    2017 EUB             :     }
    2018 GIC        4011 :     vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
    2019 CBC        4011 :     vac->relid = rel->rd_lockInfo.lockRelId;
    2020            4011 :     vac->cycleid = result;
    2021            4011 :     btvacinfo->num_vacuums++;
    2022 ECB             : 
    2023 GIC        4011 :     LWLockRelease(BtreeVacuumLock);
    2024 CBC        4011 :     return result;
    2025 ECB             : }
    2026                 : 
    2027                 : /*
    2028                 :  * _bt_end_vacuum --- mark a btree VACUUM operation as done
    2029                 :  *
    2030                 :  * Note: this is deliberately coded not to complain if no entry is found;
    2031                 :  * this allows the caller to put PG_TRY around the start_vacuum operation.
    2032                 :  */
    2033                 : void
    2034 GIC        4011 : _bt_end_vacuum(Relation rel)
    2035 ECB             : {
    2036                 :     int         i;
    2037                 : 
    2038 GIC        4011 :     LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);
    2039 ECB             : 
    2040                 :     /* Find the array entry */
    2041 GIC        4011 :     for (i = 0; i < btvacinfo->num_vacuums; i++)
    2042 ECB             :     {
    2043 GIC        4011 :         BTOneVacInfo *vac = &btvacinfo->vacuums[i];
    2044 ECB             : 
    2045 GIC        4011 :         if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
    2046 CBC        4011 :             vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
    2047 ECB             :         {
    2048                 :             /* Remove it by shifting down the last entry */
    2049 GIC        4011 :             *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
    2050 CBC        4011 :             btvacinfo->num_vacuums--;
    2051            4011 :             break;
    2052 ECB             :         }
    2053                 :     }
    2054                 : 
    2055 GIC        4011 :     LWLockRelease(BtreeVacuumLock);
    2056 CBC        4011 : }
    2057 ECB             : 
    2058                 : /*
    2059                 :  * _bt_end_vacuum wrapped as an on_shmem_exit callback function
    2060                 :  */
    2061                 : void
    2062 UIC           0 : _bt_end_vacuum_callback(int code, Datum arg)
    2063 EUB             : {
    2064 UIC           0 :     _bt_end_vacuum((Relation) DatumGetPointer(arg));
    2065 UBC           0 : }
    2066 EUB             : 
    2067                 : /*
    2068                 :  * BTreeShmemSize --- report amount of shared memory space needed
    2069                 :  */
    2070                 : Size
    2071 GIC        4564 : BTreeShmemSize(void)
    2072 ECB             : {
    2073                 :     Size        size;
    2074                 : 
    2075 GIC        4564 :     size = offsetof(BTVacInfo, vacuums);
    2076 CBC        4564 :     size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
    2077            4564 :     return size;
    2078 ECB             : }
    2079                 : 
    2080                 : /*
    2081                 :  * BTreeShmemInit --- initialize this module's shared memory
    2082                 :  */
    2083                 : void
    2084 GIC        1826 : BTreeShmemInit(void)
    2085 ECB             : {
    2086                 :     bool        found;
    2087                 : 
    2088 GIC        1826 :     btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
    2089 ECB             :                                               BTreeShmemSize(),
    2090                 :                                               &found);
    2091                 : 
    2092 GIC        1826 :     if (!IsUnderPostmaster)
    2093 ECB             :     {
    2094                 :         /* Initialize shared memory area */
    2095 GIC        1826 :         Assert(!found);
    2096 ECB             : 
    2097                 :         /*
    2098                 :          * It doesn't really matter what the cycle counter starts at, but
    2099                 :          * having it always start the same doesn't seem good.  Seed with
    2100                 :          * low-order bits of time() instead.
    2101                 :          */
    2102 GIC        1826 :         btvacinfo->cycle_ctr = (BTCycleId) time(NULL);
    2103 ECB             : 
    2104 GIC        1826 :         btvacinfo->num_vacuums = 0;
    2105 CBC        1826 :         btvacinfo->max_vacuums = MaxBackends;
    2106 ECB             :     }
    2107                 :     else
    2108 UIC           0 :         Assert(found);
    2109 GBC        1826 : }
    2110 ECB             : 
    2111                 : bytea *
    2112 GIC         115 : btoptions(Datum reloptions, bool validate)
    2113 ECB             : {
    2114                 :     static const relopt_parse_elt tab[] = {
    2115                 :         {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)},
    2116                 :         {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
    2117                 :         offsetof(BTOptions, vacuum_cleanup_index_scale_factor)},
    2118                 :         {"deduplicate_items", RELOPT_TYPE_BOOL,
    2119                 :         offsetof(BTOptions, deduplicate_items)}
    2120                 :     };
    2121                 : 
    2122 GIC         115 :     return (bytea *) build_reloptions(reloptions, validate,
    2123 ECB             :                                       RELOPT_KIND_BTREE,
    2124                 :                                       sizeof(BTOptions),
    2125                 :                                       tab, lengthof(tab));
    2126                 : }
    2127                 : 
    2128                 : /*
    2129                 :  *  btproperty() -- Check boolean properties of indexes.
    2130                 :  *
    2131                 :  * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel
    2132                 :  * to call btcanreturn.
    2133                 :  */
    2134                 : bool
    2135 GIC         378 : btproperty(Oid index_oid, int attno,
    2136 ECB             :            IndexAMProperty prop, const char *propname,
    2137                 :            bool *res, bool *isnull)
    2138                 : {
    2139 GIC         378 :     switch (prop)
    2140 ECB             :     {
    2141 GIC          21 :         case AMPROP_RETURNABLE:
    2142 ECB             :             /* answer only for columns, not AM or whole index */
    2143 GIC          21 :             if (attno == 0)
    2144 CBC           6 :                 return false;
    2145 ECB             :             /* otherwise, btree can always return data */
    2146 GIC          15 :             *res = true;
    2147 CBC          15 :             return true;
    2148 ECB             : 
    2149 GIC         357 :         default:
    2150 CBC         357 :             return false;       /* punt to generic code */
    2151 ECB             :     }
    2152                 : }
    2153                 : 
    2154                 : /*
    2155                 :  *  btbuildphasename() -- Return name of index build phase.
    2156                 :  */
    2157                 : char *
    2158 UIC           0 : btbuildphasename(int64 phasenum)
    2159 EUB             : {
    2160 UIC           0 :     switch (phasenum)
    2161 EUB             :     {
    2162 UIC           0 :         case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE:
    2163 UBC           0 :             return "initializing";
    2164               0 :         case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN:
    2165               0 :             return "scanning table";
    2166               0 :         case PROGRESS_BTREE_PHASE_PERFORMSORT_1:
    2167               0 :             return "sorting live tuples";
    2168               0 :         case PROGRESS_BTREE_PHASE_PERFORMSORT_2:
    2169               0 :             return "sorting dead tuples";
    2170               0 :         case PROGRESS_BTREE_PHASE_LEAF_LOAD:
    2171               0 :             return "loading tuples in tree";
    2172               0 :         default:
    2173               0 :             return NULL;
    2174 EUB             :     }
    2175                 : }
    2176                 : 
    2177                 : /*
    2178                 :  *  _bt_truncate() -- create tuple without unneeded suffix attributes.
    2179                 :  *
    2180                 :  * Returns truncated pivot index tuple allocated in caller's memory context,
    2181                 :  * with key attributes copied from caller's firstright argument.  If rel is
    2182                 :  * an INCLUDE index, non-key attributes will definitely be truncated away,
    2183                 :  * since they're not part of the key space.  More aggressive suffix
    2184                 :  * truncation can take place when it's clear that the returned tuple does not
    2185                 :  * need one or more suffix key attributes.  We only need to keep firstright
    2186                 :  * attributes up to and including the first non-lastleft-equal attribute.
    2187                 :  * Caller's insertion scankey is used to compare the tuples; the scankey's
    2188                 :  * argument values are not considered here.
    2189                 :  *
    2190                 :  * Note that returned tuple's t_tid offset will hold the number of attributes
    2191                 :  * present, so the original item pointer offset is not represented.  Caller
    2192                 :  * should only change truncated tuple's downlink.  Note also that truncated
    2193                 :  * key attributes are treated as containing "minus infinity" values by
    2194                 :  * _bt_compare().
    2195                 :  *
    2196                 :  * In the worst case (when a heap TID must be appended to distinguish lastleft
    2197                 :  * from firstright), the size of the returned tuple is the size of firstright
    2198                 :  * plus the size of an additional MAXALIGN()'d item pointer.  This guarantee
    2199                 :  * is important, since callers need to stay under the 1/3 of a page
    2200                 :  * restriction on tuple size.  If this routine is ever taught to truncate
    2201                 :  * within an attribute/datum, it will need to avoid returning an enlarged
    2202                 :  * tuple to caller when truncation + TOAST compression ends up enlarging the
    2203                 :  * final datum.
    2204                 :  */
    2205                 : IndexTuple
    2206 GIC       62102 : _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
    2207 ECB             :              BTScanInsert itup_key)
    2208                 : {
    2209 GIC       62102 :     TupleDesc   itupdesc = RelationGetDescr(rel);
    2210 CBC       62102 :     int16       nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
    2211 ECB             :     int         keepnatts;
    2212                 :     IndexTuple  pivot;
    2213                 :     IndexTuple  tidpivot;
    2214                 :     ItemPointer pivotheaptid;
    2215                 :     Size        newsize;
    2216                 : 
    2217                 :     /*
    2218                 :      * We should only ever truncate non-pivot tuples from leaf pages.  It's
    2219                 :      * never okay to truncate when splitting an internal page.
    2220                 :      */
    2221 GIC       62102 :     Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright));
    2222 ECB             : 
    2223                 :     /* Determine how many attributes must be kept in truncated tuple */
    2224 GIC       62102 :     keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key);
    2225 ECB             : 
    2226                 : #ifdef DEBUG_NO_TRUNCATE
    2227                 :     /* Force truncation to be ineffective for testing purposes */
    2228                 :     keepnatts = nkeyatts + 1;
    2229                 : #endif
    2230                 : 
    2231 GIC       62102 :     pivot = index_truncate_tuple(itupdesc, firstright,
    2232 ECB             :                                  Min(keepnatts, nkeyatts));
    2233                 : 
    2234 GIC       62102 :     if (BTreeTupleIsPosting(pivot))
    2235 ECB             :     {
    2236                 :         /*
    2237                 :          * index_truncate_tuple() just returns a straight copy of firstright
    2238                 :          * when it has no attributes to truncate.  When that happens, we may
    2239                 :          * need to truncate away a posting list here instead.
    2240                 :          */
    2241 GIC         880 :         Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1);
    2242 CBC         880 :         Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts);
    2243             880 :         pivot->t_info &= ~INDEX_SIZE_MASK;
    2244             880 :         pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright));
    2245 ECB             :     }
    2246                 : 
    2247                 :     /*
    2248                 :      * If there is a distinguishing key attribute within pivot tuple, we're
    2249                 :      * done
    2250                 :      */
    2251 GIC       62102 :     if (keepnatts <= nkeyatts)
    2252 ECB             :     {
    2253 GIC       61586 :         BTreeTupleSetNAtts(pivot, keepnatts, false);
    2254 CBC       61586 :         return pivot;
    2255 ECB             :     }
    2256                 : 
    2257                 :     /*
    2258                 :      * We have to store a heap TID in the new pivot tuple, since no non-TID
    2259                 :      * key attribute value in firstright distinguishes the right side of the
    2260                 :      * split from the left side.  nbtree conceptualizes this case as an
    2261                 :      * inability to truncate away any key attributes, since heap TID is
    2262                 :      * treated as just another key attribute (despite lacking a pg_attribute
    2263                 :      * entry).
    2264                 :      *
    2265                 :      * Use enlarged space that holds a copy of pivot.  We need the extra space
    2266                 :      * to store a heap TID at the end (using the special pivot tuple
    2267                 :      * representation).  Note that the original pivot already has firstright's
    2268                 :      * possible posting list/non-key attribute values removed at this point.
    2269                 :      */
    2270 GIC         516 :     newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData));
    2271 CBC         516 :     tidpivot = palloc0(newsize);
    2272             516 :     memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot)));
    2273 ECB             :     /* Cannot leak memory here */
    2274 GIC         516 :     pfree(pivot);
    2275 ECB             : 
    2276                 :     /*
    2277                 :      * Store all of firstright's key attribute values plus a tiebreaker heap
    2278                 :      * TID value in enlarged pivot tuple
    2279                 :      */
    2280 GIC         516 :     tidpivot->t_info &= ~INDEX_SIZE_MASK;
    2281 CBC         516 :     tidpivot->t_info |= newsize;
    2282             516 :     BTreeTupleSetNAtts(tidpivot, nkeyatts, true);
    2283             516 :     pivotheaptid = BTreeTupleGetHeapTID(tidpivot);
    2284 ECB             : 
    2285                 :     /*
    2286                 :      * Lehman & Yao use lastleft as the leaf high key in all cases, but don't
    2287                 :      * consider suffix truncation.  It seems like a good idea to follow that
    2288                 :      * example in cases where no truncation takes place -- use lastleft's heap
    2289                 :      * TID.  (This is also the closest value to negative infinity that's
    2290                 :      * legally usable.)
    2291                 :      */
    2292 GIC         516 :     ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid);
    2293 ECB             : 
    2294                 :     /*
    2295                 :      * We're done.  Assert() that heap TID invariants hold before returning.
    2296                 :      *
    2297                 :      * Lehman and Yao require that the downlink to the right page, which is to
    2298                 :      * be inserted into the parent page in the second phase of a page split be
    2299                 :      * a strict lower bound on items on the right page, and a non-strict upper
    2300                 :      * bound for items on the left page.  Assert that heap TIDs follow these
    2301                 :      * invariants, since a heap TID value is apparently needed as a
    2302                 :      * tiebreaker.
    2303                 :      */
    2304                 : #ifndef DEBUG_NO_TRUNCATE
    2305 GIC         516 :     Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft),
    2306 ECB             :                               BTreeTupleGetHeapTID(firstright)) < 0);
    2307 GIC         516 :     Assert(ItemPointerCompare(pivotheaptid,
    2308 ECB             :                               BTreeTupleGetHeapTID(lastleft)) >= 0);
    2309 GIC         516 :     Assert(ItemPointerCompare(pivotheaptid,
    2310 ECB             :                               BTreeTupleGetHeapTID(firstright)) < 0);
    2311                 : #else
    2312                 : 
    2313                 :     /*
    2314                 :      * Those invariants aren't guaranteed to hold for lastleft + firstright
    2315                 :      * heap TID attribute values when they're considered here only because
    2316                 :      * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually
    2317                 :      * needed as a tiebreaker).  DEBUG_NO_TRUNCATE must therefore use a heap
    2318                 :      * TID value that always works as a strict lower bound for items to the
    2319                 :      * right.  In particular, it must avoid using firstright's leading key
    2320                 :      * attribute values along with lastleft's heap TID value when lastleft's
    2321                 :      * TID happens to be greater than firstright's TID.
    2322                 :      */
    2323                 :     ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
    2324                 : 
    2325                 :     /*
    2326                 :      * Pivot heap TID should never be fully equal to firstright.  Note that
    2327                 :      * the pivot heap TID will still end up equal to lastleft's heap TID when
    2328                 :      * that's the only usable value.
    2329                 :      */
    2330                 :     ItemPointerSetOffsetNumber(pivotheaptid,
    2331                 :                                OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
    2332                 :     Assert(ItemPointerCompare(pivotheaptid,
    2333                 :                               BTreeTupleGetHeapTID(firstright)) < 0);
    2334                 : #endif
    2335                 : 
    2336 GIC         516 :     return tidpivot;
    2337 ECB             : }
    2338                 : 
    2339                 : /*
    2340                 :  * _bt_keep_natts - how many key attributes to keep when truncating.
    2341                 :  *
    2342                 :  * Caller provides two tuples that enclose a split point.  Caller's insertion
    2343                 :  * scankey is used to compare the tuples; the scankey's argument values are
    2344                 :  * not considered here.
    2345                 :  *
    2346                 :  * This can return a number of attributes that is one greater than the
    2347                 :  * number of key attributes for the index relation.  This indicates that the
    2348                 :  * caller must use a heap TID as a unique-ifier in new pivot tuple.
    2349                 :  */
    2350                 : static int
    2351 GIC       62102 : _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
    2352 ECB             :                BTScanInsert itup_key)
    2353                 : {
    2354 GIC       62102 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
    2355 CBC       62102 :     TupleDesc   itupdesc = RelationGetDescr(rel);
    2356 ECB             :     int         keepnatts;
    2357                 :     ScanKey     scankey;
    2358                 : 
    2359                 :     /*
    2360                 :      * _bt_compare() treats truncated key attributes as having the value minus
    2361                 :      * infinity, which would break searches within !heapkeyspace indexes.  We
    2362                 :      * must still truncate away non-key attribute values, though.
    2363                 :      */
    2364 GIC       62102 :     if (!itup_key->heapkeyspace)
    2365 LBC           0 :         return nkeyatts;
    2366 EUB             : 
    2367 GIC       62102 :     scankey = itup_key->scankeys;
    2368 CBC       62102 :     keepnatts = 1;
    2369           78658 :     for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++)
    2370 ECB             :     {
    2371                 :         Datum       datum1,
    2372                 :                     datum2;
    2373                 :         bool        isNull1,
    2374                 :                     isNull2;
    2375                 : 
    2376 GIC       78142 :         datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
    2377 CBC       78142 :         datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
    2378 ECB             : 
    2379 GIC       78142 :         if (isNull1 != isNull2)
    2380 CBC       61586 :             break;
    2381 ECB             : 
    2382 GIC      156284 :         if (!isNull1 &&
    2383 CBC       78142 :             DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
    2384 ECB             :                                             scankey->sk_collation,
    2385                 :                                             datum1,
    2386                 :                                             datum2)) != 0)
    2387 GIC       61586 :             break;
    2388 ECB             : 
    2389 GIC       16556 :         keepnatts++;
    2390 ECB             :     }
    2391                 : 
    2392                 :     /*
    2393                 :      * Assert that _bt_keep_natts_fast() agrees with us in passing.  This is
    2394                 :      * expected in an allequalimage index.
    2395                 :      */
    2396 GIC       62102 :     Assert(!itup_key->allequalimage ||
    2397 ECB             :            keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright));
    2398                 : 
    2399 GIC       62102 :     return keepnatts;
    2400 ECB             : }
    2401                 : 
    2402                 : /*
    2403                 :  * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts.
    2404                 :  *
    2405                 :  * This is exported so that a candidate split point can have its effect on
    2406                 :  * suffix truncation inexpensively evaluated ahead of time when finding a
    2407                 :  * split location.  A naive bitwise approach to datum comparisons is used to
    2408                 :  * save cycles.
    2409                 :  *
    2410                 :  * The approach taken here usually provides the same answer as _bt_keep_natts
    2411                 :  * will (for the same pair of tuples from a heapkeyspace index), since the
    2412                 :  * majority of btree opclasses can never indicate that two datums are equal
    2413                 :  * unless they're bitwise equal after detoasting.  When an index only has
    2414                 :  * "equal image" columns, routine is guaranteed to give the same result as
    2415                 :  * _bt_keep_natts would.
    2416                 :  *
    2417                 :  * Callers can rely on the fact that attributes considered equal here are
    2418                 :  * definitely also equal according to _bt_keep_natts, even when the index uses
    2419                 :  * an opclass or collation that is not "allequalimage"/deduplication-safe.
    2420                 :  * This weaker guarantee is good enough for nbtsplitloc.c caller, since false
    2421                 :  * negatives generally only have the effect of making leaf page splits use a
    2422                 :  * more balanced split point.
    2423                 :  */
    2424                 : int
    2425 GIC    10694770 : _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
    2426 ECB             : {
    2427 GIC    10694770 :     TupleDesc   itupdesc = RelationGetDescr(rel);
    2428 CBC    10694770 :     int         keysz = IndexRelationGetNumberOfKeyAttributes(rel);
    2429 ECB             :     int         keepnatts;
    2430                 : 
    2431 GIC    10694770 :     keepnatts = 1;
    2432 CBC    19967380 :     for (int attnum = 1; attnum <= keysz; attnum++)
    2433 ECB             :     {
    2434                 :         Datum       datum1,
    2435                 :                     datum2;
    2436                 :         bool        isNull1,
    2437                 :                     isNull2;
    2438                 :         Form_pg_attribute att;
    2439                 : 
    2440 GIC    18295223 :         datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1);
    2441 CBC    18295223 :         datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2);
    2442        18295223 :         att = TupleDescAttr(itupdesc, attnum - 1);
    2443 ECB             : 
    2444 GIC    18295223 :         if (isNull1 != isNull2)
    2445 CBC     9022613 :             break;
    2446 ECB             : 
    2447 GIC    18295151 :         if (!isNull1 &&
    2448 CBC    18292025 :             !datum_image_eq(datum1, datum2, att->attbyval, att->attlen))
    2449         9022541 :             break;
    2450 ECB             : 
    2451 GIC     9272610 :         keepnatts++;
    2452 ECB             :     }
    2453                 : 
    2454 GIC    10694770 :     return keepnatts;
    2455 ECB             : }
    2456                 : 
    2457                 : /*
    2458                 :  *  _bt_check_natts() -- Verify tuple has expected number of attributes.
    2459                 :  *
    2460                 :  * Returns value indicating if the expected number of attributes were found
    2461                 :  * for a particular offset on page.  This can be used as a general purpose
    2462                 :  * sanity check.
    2463                 :  *
    2464                 :  * Testing a tuple directly with BTreeTupleGetNAtts() should generally be
    2465                 :  * preferred to calling here.  That's usually more convenient, and is always
    2466                 :  * more explicit.  Call here instead when offnum's tuple may be a negative
    2467                 :  * infinity tuple that uses the pre-v11 on-disk representation, or when a low
    2468                 :  * context check is appropriate.  This routine is as strict as possible about
    2469                 :  * what is expected on each version of btree.
    2470                 :  */
    2471                 : bool
    2472 GIC   179023210 : _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
    2473 ECB             : {
    2474 GIC   179023210 :     int16       natts = IndexRelationGetNumberOfAttributes(rel);
    2475 CBC   179023210 :     int16       nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
    2476       179023210 :     BTPageOpaque opaque = BTPageGetOpaque(page);
    2477 ECB             :     IndexTuple  itup;
    2478                 :     int         tupnatts;
    2479                 : 
    2480                 :     /*
    2481                 :      * We cannot reliably test a deleted or half-dead page, since they have
    2482                 :      * dummy high keys
    2483                 :      */
    2484 GIC   179023210 :     if (P_IGNORE(opaque))
    2485 LBC           0 :         return true;
    2486 EUB             : 
    2487 GIC   179023210 :     Assert(offnum >= FirstOffsetNumber &&
    2488 ECB             :            offnum <= PageGetMaxOffsetNumber(page));
    2489                 : 
    2490 GIC   179023210 :     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    2491       179023210 :     tupnatts = BTreeTupleGetNAtts(itup, rel);
    2492 ECB             : 
    2493                 :     /* !heapkeyspace indexes do not support deduplication */
    2494 GIC   179023210 :     if (!heapkeyspace && BTreeTupleIsPosting(itup))
    2495 UBC           0 :         return false;
    2496                 : 
    2497                 :     /* Posting list tuples should never have "pivot heap TID" bit set */
    2498 CBC   179023210 :     if (BTreeTupleIsPosting(itup) &&
    2499 GBC     2465578 :         (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
    2500                 :          BT_PIVOT_HEAP_TID_ATTR) != 0)
    2501 LBC           0 :         return false;
    2502                 : 
    2503 ECB             :     /* INCLUDE indexes do not support deduplication */
    2504 GIC   179023210 :     if (natts != nkeyatts && BTreeTupleIsPosting(itup))
    2505 UIC           0 :         return false;
    2506                 : 
    2507 GIC   179023210 :     if (P_ISLEAF(opaque))
    2508                 :     {
    2509 CBC   136971736 :         if (offnum >= P_FIRSTDATAKEY(opaque))
    2510 EUB             :         {
    2511                 :             /*
    2512                 :              * Non-pivot tuple should never be explicitly marked as a pivot
    2513                 :              * tuple
    2514                 :              */
    2515 GIC   127086211 :             if (BTreeTupleIsPivot(itup))
    2516 UIC           0 :                 return false;
    2517                 : 
    2518 ECB             :             /*
    2519                 :              * Leaf tuples that are not the page high key (non-pivot tuples)
    2520                 :              * should never be truncated.  (Note that tupnatts must have been
    2521                 :              * inferred, even with a posting list tuple, because only pivot
    2522                 :              * tuples store tupnatts directly.)
    2523                 :              */
    2524 GIC   127086211 :             return tupnatts == natts;
    2525                 :         }
    2526 ECB             :         else
    2527                 :         {
    2528                 :             /*
    2529                 :              * Rightmost page doesn't contain a page high key, so tuple was
    2530                 :              * checked above as ordinary leaf tuple
    2531                 :              */
    2532 GIC     9885525 :             Assert(!P_RIGHTMOST(opaque));
    2533 ECB             : 
    2534 EUB             :             /*
    2535                 :              * !heapkeyspace high key tuple contains only key attributes. Note
    2536                 :              * that tupnatts will only have been explicitly represented in
    2537                 :              * !heapkeyspace indexes that happen to have non-key attributes.
    2538                 :              */
    2539 GIC     9885525 :             if (!heapkeyspace)
    2540 UIC           0 :                 return tupnatts == nkeyatts;
    2541 ECB             : 
    2542                 :             /* Use generic heapkeyspace pivot tuple handling */
    2543                 :         }
    2544                 :     }
    2545                 :     else                        /* !P_ISLEAF(opaque) */
    2546                 :     {
    2547 GIC    42051474 :         if (offnum == P_FIRSTDATAKEY(opaque))
    2548                 :         {
    2549 ECB             :             /*
    2550                 :              * The first tuple on any internal page (possibly the first after
    2551                 :              * its high key) is its negative infinity tuple.  Negative
    2552                 :              * infinity tuples are always truncated to zero attributes.  They
    2553                 :              * are a particular kind of pivot tuple.
    2554                 :              */
    2555 GIC     1890395 :             if (heapkeyspace)
    2556         1890395 :                 return tupnatts == 0;
    2557                 : 
    2558                 :             /*
    2559                 :              * The number of attributes won't be explicitly represented if the
    2560                 :              * negative infinity tuple was generated during a page split that
    2561                 :              * occurred with a version of Postgres before v11.  There must be
    2562                 :              * a problem when there is an explicit representation that is
    2563                 :              * non-zero, or when there is no explicit representation and the
    2564 EUB             :              * tuple is evidently not a pre-pg_upgrade tuple.
    2565                 :              *
    2566                 :              * Prior to v11, downlinks always had P_HIKEY as their offset.
    2567                 :              * Accept that as an alternative indication of a valid
    2568                 :              * !heapkeyspace negative infinity tuple.
    2569                 :              */
    2570 UIC           0 :             return tupnatts == 0 ||
    2571               0 :                 ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY;
    2572                 :         }
    2573                 :         else
    2574                 :         {
    2575 ECB             :             /*
    2576 EUB             :              * !heapkeyspace downlink tuple with separator key contains only
    2577                 :              * key attributes.  Note that tupnatts will only have been
    2578                 :              * explicitly represented in !heapkeyspace indexes that happen to
    2579                 :              * have non-key attributes.
    2580                 :              */
    2581 GIC    40161079 :             if (!heapkeyspace)
    2582 UIC           0 :                 return tupnatts == nkeyatts;
    2583 ECB             : 
    2584                 :             /* Use generic heapkeyspace pivot tuple handling */
    2585                 :         }
    2586                 :     }
    2587                 : 
    2588                 :     /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */
    2589 GIC    50046604 :     Assert(heapkeyspace);
    2590 ECB             : 
    2591 EUB             :     /*
    2592                 :      * Explicit representation of the number of attributes is mandatory with
    2593                 :      * heapkeyspace index pivot tuples, regardless of whether or not there are
    2594 ECB             :      * non-key attributes.
    2595 EUB             :      */
    2596 GIC    50046604 :     if (!BTreeTupleIsPivot(itup))
    2597 UIC           0 :         return false;
    2598                 : 
    2599                 :     /* Pivot tuple should not use posting list representation (redundant) */
    2600 GIC    50046604 :     if (BTreeTupleIsPosting(itup))
    2601 LBC           0 :         return false;
    2602 EUB             : 
    2603                 :     /*
    2604                 :      * Heap TID is a tiebreaker key attribute, so it cannot be untruncated
    2605                 :      * when any other key attribute is truncated
    2606                 :      */
    2607 GIC    50046604 :     if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts)
    2608 UIC           0 :         return false;
    2609                 : 
    2610 ECB             :     /*
    2611                 :      * Pivot tuple must have at least one untruncated key attribute (minus
    2612                 :      * infinity pivot tuples are the only exception).  Pivot tuples can never
    2613                 :      * represent that there is a value present for a key attribute that
    2614                 :      * exceeds pg_index.indnkeyatts for the index.
    2615                 :      */
    2616 GIC    50046604 :     return tupnatts > 0 && tupnatts <= nkeyatts;
    2617                 : }
    2618                 : 
    2619                 : /*
    2620                 :  *
    2621                 :  *  _bt_check_third_page() -- check whether tuple fits on a btree page at all.
    2622                 :  *
    2623                 :  * We actually need to be able to fit three items on every page, so restrict
    2624                 :  * any one item to 1/3 the per-page available space.  Note that itemsz should
    2625                 :  * not include the ItemId overhead.
    2626 ECB             :  *
    2627                 :  * It might be useful to apply TOAST methods rather than throw an error here.
    2628                 :  * Using out of line storage would break assumptions made by suffix truncation
    2629                 :  * and by contrib/amcheck, though.
    2630                 :  */
    2631                 : void
    2632 CBC         132 : _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
    2633                 :                      Page page, IndexTuple newtup)
    2634                 : {
    2635 ECB             :     Size        itemsz;
    2636 EUB             :     BTPageOpaque opaque;
    2637                 : 
    2638 GIC         132 :     itemsz = MAXALIGN(IndexTupleSize(newtup));
    2639                 : 
    2640                 :     /* Double check item size against limit */
    2641             132 :     if (itemsz <= BTMaxItemSize(page))
    2642 UIC           0 :         return;
    2643 ECB             : 
    2644                 :     /*
    2645                 :      * Tuple is probably too large to fit on page, but it's possible that the
    2646                 :      * index uses version 2 or version 3, or that page is an internal page, in
    2647                 :      * which case a slightly higher limit applies.
    2648                 :      */
    2649 GIC         132 :     if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
    2650 GBC         132 :         return;
    2651 EUB             : 
    2652                 :     /*
    2653                 :      * Internal page insertions cannot fail here, because that would mean that
    2654                 :      * an earlier leaf level insertion that should have failed didn't
    2655                 :      */
    2656 UIC           0 :     opaque = BTPageGetOpaque(page);
    2657               0 :     if (!P_ISLEAF(opaque))
    2658               0 :         elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"",
    2659                 :              itemsz, RelationGetRelationName(rel));
    2660                 : 
    2661               0 :     ereport(ERROR,
    2662                 :             (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
    2663                 :              errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"",
    2664                 :                     itemsz,
    2665                 :                     needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION,
    2666                 :                     needheaptidspace ? BTMaxItemSize(page) :
    2667                 :                     BTMaxItemSizeNoHeapTid(page),
    2668                 :                     RelationGetRelationName(rel)),
    2669                 :              errdetail("Index row references tuple (%u,%u) in relation \"%s\".",
    2670                 :                        ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)),
    2671                 :                        ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)),
    2672                 :                        RelationGetRelationName(heap)),
    2673                 :              errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
    2674                 :                      "Consider a function index of an MD5 hash of the value, "
    2675                 :                      "or use full text indexing."),
    2676                 :              errtableconstraint(heap, RelationGetRelationName(rel))));
    2677                 : }
    2678                 : 
    2679                 : /*
    2680                 :  * Are all attributes in rel "equality is image equality" attributes?
    2681                 :  *
    2682                 :  * We use each attribute's BTEQUALIMAGE_PROC opclass procedure.  If any
    2683                 :  * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we
    2684 ECB             :  * return false; otherwise we return true.
    2685                 :  *
    2686                 :  * Returned boolean value is stored in index metapage during index builds.
    2687                 :  * Deduplication can only be used when we return true.
    2688                 :  */
    2689                 : bool
    2690 CBC       67018 : _bt_allequalimage(Relation rel, bool debugmessage)
    2691 ECB             : {
    2692 GIC       67018 :     bool        allequalimage = true;
    2693 ECB             : 
    2694                 :     /* INCLUDE indexes can never support deduplication */
    2695 CBC       67018 :     if (IndexRelationGetNumberOfAttributes(rel) !=
    2696           67018 :         IndexRelationGetNumberOfKeyAttributes(rel))
    2697             133 :         return false;
    2698                 : 
    2699 GIC      180400 :     for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++)
    2700 ECB             :     {
    2701 GIC      113957 :         Oid         opfamily = rel->rd_opfamily[i];
    2702          113957 :         Oid         opcintype = rel->rd_opcintype[i];
    2703          113957 :         Oid         collation = rel->rd_indcollation[i];
    2704                 :         Oid         equalimageproc;
    2705                 : 
    2706          113957 :         equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype,
    2707 ECB             :                                            BTEQUALIMAGE_PROC);
    2708                 : 
    2709                 :         /*
    2710                 :          * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to
    2711                 :          * be unsafe.  Otherwise, actually call proc and see what it says.
    2712                 :          */
    2713 GIC      113957 :         if (!OidIsValid(equalimageproc) ||
    2714          113525 :             !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation,
    2715                 :                                                ObjectIdGetDatum(opcintype))))
    2716 ECB             :         {
    2717 GIC         442 :             allequalimage = false;
    2718 CBC         442 :             break;
    2719 ECB             :         }
    2720                 :     }
    2721                 : 
    2722 CBC       66885 :     if (debugmessage)
    2723                 :     {
    2724 GIC       63980 :         if (allequalimage)
    2725           63538 :             elog(DEBUG1, "index \"%s\" can safely use deduplication",
    2726 ECB             :                  RelationGetRelationName(rel));
    2727                 :         else
    2728 GIC         442 :             elog(DEBUG1, "index \"%s\" cannot use deduplication",
    2729                 :                  RelationGetRelationName(rel));
    2730                 :     }
    2731                 : 
    2732           66885 :     return allequalimage;
    2733                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a