LCOV - differential code coverage report
Current view: top level - src/backend/access/nbtree - nbtree.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DUB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 91.9 % 445 409 9 19 8 6 224 7 172 21 226 1 2
Current Date: 2023-04-08 17:13:01 Functions: 95.7 % 23 22 1 21 1 1 21
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 [..60] days: 100.0 % 6 6 6
Legend: Lines: hit not hit (240..) days: 91.8 % 439 403 9 19 8 6 224 1 172 21 224
Function coverage date bins:
(240..) days: 50.0 % 44 22 1 21 1 1 20

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * nbtree.c
                                  4                 :  *    Implementation of Lehman and Yao's btree management algorithm for
                                  5                 :  *    Postgres.
                                  6                 :  *
                                  7                 :  * NOTES
                                  8                 :  *    This file contains only the public interface routines.
                                  9                 :  *
                                 10                 :  *
                                 11                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                 12                 :  * Portions Copyright (c) 1994, Regents of the University of California
                                 13                 :  *
                                 14                 :  * IDENTIFICATION
                                 15                 :  *    src/backend/access/nbtree/nbtree.c
                                 16                 :  *
                                 17                 :  *-------------------------------------------------------------------------
                                 18                 :  */
                                 19                 : #include "postgres.h"
                                 20                 : 
                                 21                 : #include "access/nbtree.h"
                                 22                 : #include "access/nbtxlog.h"
                                 23                 : #include "access/relscan.h"
                                 24                 : #include "access/xlog.h"
                                 25                 : #include "access/xloginsert.h"
                                 26                 : #include "commands/progress.h"
                                 27                 : #include "commands/vacuum.h"
                                 28                 : #include "miscadmin.h"
                                 29                 : #include "nodes/execnodes.h"
                                 30                 : #include "pgstat.h"
                                 31                 : #include "postmaster/autovacuum.h"
                                 32                 : #include "storage/condition_variable.h"
                                 33                 : #include "storage/indexfsm.h"
                                 34                 : #include "storage/ipc.h"
                                 35                 : #include "storage/lmgr.h"
                                 36                 : #include "storage/smgr.h"
                                 37                 : #include "utils/builtins.h"
                                 38                 : #include "utils/index_selfuncs.h"
                                 39                 : #include "utils/memutils.h"
                                 40                 : 
                                 41                 : 
                                 42                 : /*
                                 43                 :  * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
                                 44                 :  *
                                 45                 :  * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
                                 46                 :  * a new page; others must wait.
                                 47                 :  *
                                 48                 :  * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
                                 49                 :  * to a new page; some process can start doing that.
                                 50                 :  *
                                 51                 :  * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
                                 52                 :  * We reach this state once for every distinct combination of array keys.
                                 53                 :  */
                                 54                 : typedef enum
                                 55                 : {
                                 56                 :     BTPARALLEL_NOT_INITIALIZED,
                                 57                 :     BTPARALLEL_ADVANCING,
                                 58                 :     BTPARALLEL_IDLE,
                                 59                 :     BTPARALLEL_DONE
                                 60                 : } BTPS_State;
                                 61                 : 
                                 62                 : /*
                                 63                 :  * BTParallelScanDescData contains btree specific shared information required
                                 64                 :  * for parallel scan.
                                 65                 :  */
                                 66                 : typedef struct BTParallelScanDescData
                                 67                 : {
                                 68                 :     BlockNumber btps_scanPage;  /* latest or next page to be scanned */
                                 69                 :     BTPS_State  btps_pageStatus;    /* indicates whether next page is
                                 70                 :                                      * available for scan. see above for
                                 71                 :                                      * possible states of parallel scan. */
                                 72                 :     int         btps_arrayKeyCount; /* count indicating number of array scan
                                 73                 :                                      * keys processed by parallel scan */
                                 74                 :     slock_t     btps_mutex;     /* protects above variables */
                                 75                 :     ConditionVariable btps_cv;  /* used to synchronize parallel scan */
                                 76                 : }           BTParallelScanDescData;
                                 77                 : 
                                 78                 : typedef struct BTParallelScanDescData *BTParallelScanDesc;
                                 79                 : 
                                 80                 : 
                                 81                 : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
                                 82                 :                          IndexBulkDeleteCallback callback, void *callback_state,
                                 83                 :                          BTCycleId cycleid);
                                 84                 : static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
                                 85                 : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
                                 86                 :                                           IndexTuple posting,
                                 87                 :                                           OffsetNumber updatedoffset,
                                 88                 :                                           int *nremaining);
                                 89                 : 
                                 90                 : 
                                 91                 : /*
                                 92                 :  * Btree handler function: return IndexAmRoutine with access method parameters
                                 93                 :  * and callbacks.
                                 94                 :  */
                                 95                 : Datum
 2639 tgl                        96 CBC     1158799 : bthandler(PG_FUNCTION_ARGS)
                                 97                 : {
                                 98         1158799 :     IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
                                 99                 : 
 2537 teodor                    100         1158799 :     amroutine->amstrategies = BTMaxStrategyNumber;
                                101         1158799 :     amroutine->amsupport = BTNProcs;
 1105 akorotkov                 102         1158799 :     amroutine->amoptsprocnum = BTOPTIONS_PROC;
 2639 tgl                       103         1158799 :     amroutine->amcanorder = true;
                                104         1158799 :     amroutine->amcanorderbyop = false;
                                105         1158799 :     amroutine->amcanbackward = true;
                                106         1158799 :     amroutine->amcanunique = true;
                                107         1158799 :     amroutine->amcanmulticol = true;
                                108         1158799 :     amroutine->amoptionalkey = true;
                                109         1158799 :     amroutine->amsearcharray = true;
                                110         1158799 :     amroutine->amsearchnulls = true;
                                111         1158799 :     amroutine->amstorage = false;
                                112         1158799 :     amroutine->amclusterable = true;
                                113         1158799 :     amroutine->ampredlocks = true;
 2244 rhaas                     114         1158799 :     amroutine->amcanparallel = true;
 1828 teodor                    115         1158799 :     amroutine->amcaninclude = true;
 1180 akapila                   116         1158799 :     amroutine->amusemaintenanceworkmem = false;
   20 tomas.vondra              117 GNC     1158799 :     amroutine->amsummarizing = false;
 1180 akapila                   118 CBC     1158799 :     amroutine->amparallelvacuumoptions =
 1180 akapila                   119 ECB             :         VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
 2639 tgl                       120 GIC     1158799 :     amroutine->amkeytype = InvalidOid;
 2639 tgl                       121 ECB             : 
 2639 tgl                       122 GIC     1158799 :     amroutine->ambuild = btbuild;
 2639 tgl                       123 CBC     1158799 :     amroutine->ambuildempty = btbuildempty;
                                124         1158799 :     amroutine->aminsert = btinsert;
                                125         1158799 :     amroutine->ambulkdelete = btbulkdelete;
                                126         1158799 :     amroutine->amvacuumcleanup = btvacuumcleanup;
                                127         1158799 :     amroutine->amcanreturn = btcanreturn;
                                128         1158799 :     amroutine->amcostestimate = btcostestimate;
                                129         1158799 :     amroutine->amoptions = btoptions;
 2430                           130         1158799 :     amroutine->amproperty = btproperty;
 1468 alvherre                  131         1158799 :     amroutine->ambuildphasename = btbuildphasename;
 2639 tgl                       132         1158799 :     amroutine->amvalidate = btvalidate;
  981                           133         1158799 :     amroutine->amadjustmembers = btadjustmembers;
 2639                           134         1158799 :     amroutine->ambeginscan = btbeginscan;
                                135         1158799 :     amroutine->amrescan = btrescan;
                                136         1158799 :     amroutine->amgettuple = btgettuple;
                                137         1158799 :     amroutine->amgetbitmap = btgetbitmap;
                                138         1158799 :     amroutine->amendscan = btendscan;
                                139         1158799 :     amroutine->ammarkpos = btmarkpos;
                                140         1158799 :     amroutine->amrestrpos = btrestrpos;
 2244 rhaas                     141         1158799 :     amroutine->amestimateparallelscan = btestimateparallelscan;
                                142         1158799 :     amroutine->aminitparallelscan = btinitparallelscan;
                                143         1158799 :     amroutine->amparallelrescan = btparallelrescan;
 2639 tgl                       144 ECB             : 
 2639 tgl                       145 GIC     1158799 :     PG_RETURN_POINTER(amroutine);
 2639 tgl                       146 ECB             : }
                                147                 : 
                                148                 : /*
                                149                 :  *  btbuildempty() -- build an empty btree index in the initialization fork
                                150                 :  */
                                151                 : void
 2639 tgl                       152 GIC          62 : btbuildempty(Relation index)
 4484 rhaas                     153 ECB             : {
                                154                 :     Page        metapage;
                                155                 : 
                                156                 :     /* Construct metapage. */
    1 tmunro                    157 GNC          62 :     metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
 1138 pg                        158 CBC          62 :     _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
 4484 rhaas                     159 ECB             : 
                                160                 :     /*
                                161                 :      * Write the page and log it.  It might seem that an immediate sync would
                                162                 :      * be sufficient to guarantee that the file exists on disk, but recovery
                                163                 :      * itself might remove it while replaying, for example, an
                                164                 :      * XLOG_DBASE_CREATE* or XLOG_TBLSPC_CREATE record.  Therefore, we need
                                165                 :      * this even when wal_level=minimal.
                                166                 :      */
 3670 simon                     167 GIC          62 :     PageSetChecksumInplace(metapage, BTREE_METAPAGE);
  636 tgl                       168 CBC          62 :     smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BTREE_METAPAGE,
                                169                 :               metapage, true);
  277 rhaas                     170 GNC          62 :     log_newpage(&RelationGetSmgr(index)->smgr_rlocator.locator, INIT_FORKNUM,
 1983 tgl                       171 ECB             :                 BTREE_METAPAGE, metapage, true);
                                172                 : 
                                173                 :     /*
                                174                 :      * An immediate sync is required even if we xlog'd the page, because the
                                175                 :      * write did not go through shared_buffers and therefore a concurrent
                                176                 :      * checkpoint may have moved the redo pointer past our xlog record.
                                177                 :      */
  636 tgl                       178 GIC          62 :     smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM);
 4484 rhaas                     179 CBC          62 : }
 4484 rhaas                     180 ECB             : 
                                181                 : /*
                                182                 :  *  btinsert() -- insert an index tuple into a btree.
                                183                 :  *
                                184                 :  *      Descend the tree recursively, find the appropriate location for our
                                185                 :  *      new tuple, and put it there.
                                186                 :  */
                                187                 : bool
 2639 tgl                       188 GIC     7167159 : btinsert(Relation rel, Datum *values, bool *isnull,
 2639 tgl                       189 ECB             :          ItemPointer ht_ctid, Relation heapRel,
                                190                 :          IndexUniqueCheck checkUnique,
                                191                 :          bool indexUnchanged,
                                192                 :          IndexInfo *indexInfo)
                                193                 : {
                                194                 :     bool        result;
                                195                 :     IndexTuple  itup;
                                196                 : 
                                197                 :     /* generate an index tuple */
 6593 tgl                       198 GIC     7167159 :     itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
 9345 bruce                     199 CBC     7167159 :     itup->t_tid = *ht_ctid;
 9345 bruce                     200 ECB             : 
  816 pg                        201 GIC     7167159 :     result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
 9345 bruce                     202 ECB             : 
 9345 bruce                     203 GIC     7166901 :     pfree(itup);
 9345 bruce                     204 ECB             : 
 2639 tgl                       205 GIC     7166901 :     return result;
 9770 scrappy                   206 ECB             : }
                                207                 : 
                                208                 : /*
                                209                 :  *  btgettuple() -- Get the next tuple in the scan.
                                210                 :  */
                                211                 : bool
 2639 tgl                       212 GIC    21029716 : btgettuple(IndexScanDesc scan, ScanDirection dir)
 9770 scrappy                   213 ECB             : {
 7625 tgl                       214 GIC    21029716 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
 7625 tgl                       215 ECB             :     bool        res;
                                216                 : 
                                217                 :     /* btree indexes are never lossy */
 5474 tgl                       218 GIC    21029716 :     scan->xs_recheck = false;
 5474 tgl                       219 ECB             : 
                                220                 :     /*
                                221                 :      * If we have any array keys, initialize them during first call for a
                                222                 :      * scan.  We can't do this in btrescan because we don't know the scan
                                223                 :      * direction at that time.
                                224                 :      */
 4193 tgl                       225 GIC    21029716 :     if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
 4193 tgl                       226 ECB             :     {
                                227                 :         /* punt if we have any unsatisfiable array keys */
 4193 tgl                       228 GIC          88 :         if (so->numArrayKeys < 0)
 2639 tgl                       229 LBC           0 :             return false;
 4193 tgl                       230 EUB             : 
 4193 tgl                       231 GIC          88 :         _bt_start_array_keys(scan, dir);
 4193 tgl                       232 ECB             :     }
                                233                 : 
                                234                 :     /* This loop handles advancing to the next array elements, if any */
                                235                 :     do
                                236                 :     {
                                237                 :         /*
                                238                 :          * If we've already initialized this scan, we can just advance it in
                                239                 :          * the appropriate direction.  If we haven't done so yet, we call
                                240                 :          * _bt_first() to get the first item in the scan.
                                241                 :          */
 4193 tgl                       242 GIC    21029962 :         if (!BTScanPosIsValid(so->currPos))
 4193 tgl                       243 CBC     9080657 :             res = _bt_first(scan, dir);
 4193 tgl                       244 ECB             :         else
                                245                 :         {
                                246                 :             /*
                                247                 :              * Check to see if we should kill the previously-fetched tuple.
                                248                 :              */
 4193 tgl                       249 GIC    11949305 :             if (scan->kill_prior_tuple)
 4193 tgl                       250 ECB             :             {
                                251                 :                 /*
                                252                 :                  * Yes, remember it for later. (We'll deal with all such
                                253                 :                  * tuples at once right before leaving the index page.)  The
                                254                 :                  * test for numKilled overrun is not just paranoia: if the
                                255                 :                  * caller reverses direction in the indexscan then the same
                                256                 :                  * item might get entered multiple times. It's not worth
                                257                 :                  * trying to optimize that, so we don't detect it, but instead
                                258                 :                  * just forget any excess entries.
                                259                 :                  */
 4193 tgl                       260 GIC      263130 :                 if (so->killedItems == NULL)
 4193 tgl                       261 CBC      117431 :                     so->killedItems = (int *)
 1138 pg                        262          117431 :                         palloc(MaxTIDsPerBTreePage * sizeof(int));
                                263          263130 :                 if (so->numKilled < MaxTIDsPerBTreePage)
 4193 tgl                       264          263130 :                     so->killedItems[so->numKilled++] = so->currPos.itemIndex;
 4193 tgl                       265 ECB             :             }
                                266                 : 
                                267                 :             /*
                                268                 :              * Now continue the scan.
                                269                 :              */
 4193 tgl                       270 GIC    11949305 :             res = _bt_next(scan, dir);
 7625 tgl                       271 ECB             :         }
                                272                 : 
                                273                 :         /* If we have a tuple, return it ... */
 4193 tgl                       274 GIC    21029962 :         if (res)
 4193 tgl                       275 CBC    15809739 :             break;
 4193 tgl                       276 ECB             :         /* ... otherwise see if we have more array keys to deal with */
 4193 tgl                       277 GIC     5220223 :     } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
 8986 bruce                     278 ECB             : 
 2639 tgl                       279 GIC    21029716 :     return res;
 9770 scrappy                   280 ECB             : }
                                281                 : 
                                282                 : /*
                                283                 :  * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
                                284                 :  */
                                285                 : int64
 2639 tgl                       286 GIC        5496 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 6587 tgl                       287 ECB             : {
 6587 tgl                       288 GIC        5496 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
 5477 tgl                       289 CBC        5496 :     int64       ntids = 0;
 5477 tgl                       290 ECB             :     ItemPointer heapTid;
                                291                 : 
                                292                 :     /*
                                293                 :      * If we have any array keys, initialize them.
                                294                 :      */
 4193 tgl                       295 GIC        5496 :     if (so->numArrayKeys)
 6587 tgl                       296 ECB             :     {
                                297                 :         /* punt if we have any unsatisfiable array keys */
 4193 tgl                       298 GIC         222 :         if (so->numArrayKeys < 0)
 2639 tgl                       299 LBC           0 :             return ntids;
 4193 tgl                       300 EUB             : 
 4193 tgl                       301 GIC         222 :         _bt_start_array_keys(scan, ForwardScanDirection);
 6587 tgl                       302 ECB             :     }
                                303                 : 
                                304                 :     /* This loop handles advancing to the next array elements, if any */
                                305                 :     do
                                306                 :     {
                                307                 :         /* Fetch the first page & tuple */
 4193 tgl                       308 GIC        6838 :         if (_bt_first(scan, ForwardScanDirection))
 6181 tgl                       309 ECB             :         {
                                310                 :             /* Save tuple ID, and continue scanning */
 1490 andres                    311 GIC        5466 :             heapTid = &scan->xs_heaptid;
 4193 tgl                       312 CBC        5466 :             tbm_add_tuples(tbm, heapTid, 1, false);
                                313            5466 :             ntids++;
 4193 tgl                       314 ECB             : 
                                315                 :             for (;;)
                                316                 :             {
                                317                 :                 /*
                                318                 :                  * Advance to next tuple within page.  This is the same as the
                                319                 :                  * easy case in _bt_next().
                                320                 :                  */
 4193 tgl                       321 GIC      850887 :                 if (++so->currPos.itemIndex > so->currPos.lastItem)
 4193 tgl                       322 ECB             :                 {
                                323                 :                     /* let _bt_next do the heavy lifting */
 4193 tgl                       324 GIC        7583 :                     if (!_bt_next(scan, ForwardScanDirection))
 4193 tgl                       325 CBC        5466 :                         break;
 4193 tgl                       326 ECB             :                 }
                                327                 : 
                                328                 :                 /* Save tuple ID, and continue scanning */
 4193 tgl                       329 GIC      845421 :                 heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
 4193 tgl                       330 CBC      845421 :                 tbm_add_tuples(tbm, heapTid, 1, false);
                                331          845421 :                 ntids++;
 4193 tgl                       332 ECB             :             }
                                333                 :         }
                                334                 :         /* Now see if we have more array keys to deal with */
 4193 tgl                       335 GIC        6838 :     } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
 6587 tgl                       336 ECB             : 
 2639 tgl                       337 GIC        5496 :     return ntids;
 6587 tgl                       338 ECB             : }
                                339                 : 
                                340                 : /*
                                341                 :  *  btbeginscan() -- start a scan on a btree index
                                342                 :  */
                                343                 : IndexScanDesc
 2639 tgl                       344 GIC     8901539 : btbeginscan(Relation rel, int nkeys, int norderbys)
 9770 scrappy                   345 ECB             : {
                                346                 :     IndexScanDesc scan;
                                347                 :     BTScanOpaque so;
                                348                 : 
                                349                 :     /* no order by operators allowed */
 4511 tgl                       350 GIC     8901539 :     Assert(norderbys == 0);
 9345 bruce                     351 ECB             : 
                                352                 :     /* get the scan */
 4511 tgl                       353 GIC     8901539 :     scan = RelationGetIndexScan(rel, nkeys, norderbys);
 4511 tgl                       354 ECB             : 
                                355                 :     /* allocate private workspace */
 4511 tgl                       356 GIC     8901539 :     so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
 2937 kgrittn                   357 CBC     8901539 :     BTScanPosInvalidate(so->currPos);
                                358         8901539 :     BTScanPosInvalidate(so->markPos);
 4511 tgl                       359         8901539 :     if (scan->numberOfKeys > 0)
                                360         8896655 :         so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
 4511 tgl                       361 ECB             :     else
 4511 tgl                       362 GIC        4884 :         so->keyData = NULL;
 4193 tgl                       363 ECB             : 
 4193 tgl                       364 GIC     8901539 :     so->arrayKeyData = NULL; /* assume no array keys for now */
 4193 tgl                       365 CBC     8901539 :     so->numArrayKeys = 0;
                                366         8901539 :     so->arrayKeys = NULL;
                                367         8901539 :     so->arrayContext = NULL;
 4193 tgl                       368 ECB             : 
 4511 tgl                       369 GIC     8901539 :     so->killedItems = NULL;      /* until needed */
 4511 tgl                       370 CBC     8901539 :     so->numKilled = 0;
 4200 tgl                       371 ECB             : 
                                372                 :     /*
                                373                 :      * We don't know yet whether the scan will be index-only, so we do not
                                374                 :      * allocate the tuple workspace arrays until btrescan.  However, we set up
                                375                 :      * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
                                376                 :      */
 4200 tgl                       377 GIC     8901539 :     so->currTuples = so->markTuples = NULL;
 4200 tgl                       378 ECB             : 
 4193 tgl                       379 GIC     8901539 :     scan->xs_itupdesc = RelationGetDescr(rel);
 4193 tgl                       380 ECB             : 
 4511 tgl                       381 GIC     8901539 :     scan->opaque = so;
 9345 bruce                     382 ECB             : 
 2639 tgl                       383 GIC     8901539 :     return scan;
 9770 scrappy                   384 ECB             : }
                                385                 : 
                                386                 : /*
                                387                 :  *  btrescan() -- rescan an index relation
                                388                 :  */
                                389                 : void
 2639 tgl                       390 GIC     9087654 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 2639 tgl                       391 ECB             :          ScanKey orderbys, int norderbys)
                                392                 : {
 4511 tgl                       393 GIC     9087654 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
 8297 tgl                       394 ECB             : 
                                395                 :     /* we aren't holding any read locks, but gotta drop the pins */
 6181 tgl                       396 GIC     9087654 :     if (BTScanPosIsValid(so->currPos))
 9345 bruce                     397 ECB             :     {
                                398                 :         /* Before leaving current page, deal with any killed items */
 6181 tgl                       399 GIC       24995 :         if (so->numKilled > 0)
 2937 kgrittn                   400 CBC         152 :             _bt_killitems(scan);
                                401           24995 :         BTScanPosUnpinIfPinned(so->currPos);
                                402           24995 :         BTScanPosInvalidate(so->currPos);
 9345 bruce                     403 ECB             :     }
                                404                 : 
 6072 tgl                       405 GIC     9087654 :     so->markItemIndex = -1;
 2244 rhaas                     406 CBC     9087654 :     so->arrayKeyCount = 0;
 2937 kgrittn                   407         9087654 :     BTScanPosUnpinIfPinned(so->markPos);
                                408         9087654 :     BTScanPosInvalidate(so->markPos);
 9345 bruce                     409 ECB             : 
                                410                 :     /*
                                411                 :      * Allocate tuple workspace arrays, if needed for an index-only scan and
                                412                 :      * not already done in a previous rescan call.  To save on palloc
                                413                 :      * overhead, both workspaces are allocated as one palloc block; only this
                                414                 :      * function and btendscan know that.
                                415                 :      *
                                416                 :      * NOTE: this data structure also makes it safe to return data from a
                                417                 :      * "name" column, even though btree name_ops uses an underlying storage
                                418                 :      * datatype of cstring.  The risk there is that "name" is supposed to be
                                419                 :      * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
                                420                 :      * However, since we only return data out of tuples sitting in the
                                421                 :      * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
                                422                 :      * data out of the markTuples array --- running off the end of memory for
                                423                 :      * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
                                424                 :      * adding special-case treatment for name_ops elsewhere.
                                425                 :      */
 4200 tgl                       426 GIC     9087654 :     if (scan->xs_want_itup && so->currTuples == NULL)
 4200 tgl                       427 ECB             :     {
 4200 tgl                       428 GIC       50124 :         so->currTuples = (char *) palloc(BLCKSZ * 2);
 4200 tgl                       429 CBC       50124 :         so->markTuples = so->currTuples + BLCKSZ;
 4200 tgl                       430 ECB             :     }
                                431                 : 
                                432                 :     /*
                                433                 :      * Reset the scan keys
                                434                 :      */
 7322 tgl                       435 GIC     9087654 :     if (scankey && scan->numberOfKeys > 0)
 9345 bruce                     436 CBC     9082744 :         memmove(scan->keyData,
 9345 bruce                     437 ECB             :                 scankey,
 9345 bruce                     438 GIC     9082744 :                 scan->numberOfKeys * sizeof(ScanKeyData));
 7088 tgl                       439 CBC     9087654 :     so->numberOfKeys = 0;        /* until _bt_preprocess_keys sets it */
 9749 scrappy                   440 ECB             : 
                                441                 :     /* If any keys are SK_SEARCHARRAY type, set up array-key info */
 4193 tgl                       442 GIC     9087654 :     _bt_preprocess_array_keys(scan);
 9770 scrappy                   443 CBC     9087654 : }
 9770 scrappy                   444 ECB             : 
                                445                 : /*
                                446                 :  *  btendscan() -- close down a scan
                                447                 :  */
                                448                 : void
 2639 tgl                       449 GIC     8900856 : btendscan(IndexScanDesc scan)
 9770 scrappy                   450 ECB             : {
 6181 tgl                       451 GIC     8900856 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
 9345 bruce                     452 ECB             : 
                                453                 :     /* we aren't holding any read locks, but gotta drop the pins */
 6181 tgl                       454 GIC     8900856 :     if (BTScanPosIsValid(so->currPos))
 9345 bruce                     455 ECB             :     {
                                456                 :         /* Before leaving current page, deal with any killed items */
 6181 tgl                       457 GIC     3835001 :         if (so->numKilled > 0)
 2937 kgrittn                   458 CBC       78401 :             _bt_killitems(scan);
                                459         3835001 :         BTScanPosUnpinIfPinned(so->currPos);
 9345 bruce                     460 ECB             :     }
                                461                 : 
 6072 tgl                       462 GIC     8900856 :     so->markItemIndex = -1;
 2937 kgrittn                   463 CBC     8900856 :     BTScanPosUnpinIfPinned(so->markPos);
 2937 kgrittn                   464 ECB             : 
                                465                 :     /* No need to invalidate positions, the RAM is about to be freed. */
                                466                 : 
                                467                 :     /* Release storage */
 7032 neilc                     468 GIC     8900856 :     if (so->keyData != NULL)
 9345 bruce                     469 CBC     8896031 :         pfree(so->keyData);
 4193 tgl                       470 ECB             :     /* so->arrayKeyData and so->arrayKeys are in arrayContext */
 4193 tgl                       471 GIC     8900856 :     if (so->arrayContext != NULL)
 4193 tgl                       472 CBC         314 :         MemoryContextDelete(so->arrayContext);
                                473         8900856 :     if (so->killedItems != NULL)
                                474          117406 :         pfree(so->killedItems);
 4200                           475         8900856 :     if (so->currTuples != NULL)
                                476           50058 :         pfree(so->currTuples);
 4200 tgl                       477 ECB             :     /* so->markTuples should not be pfree'd, see btrescan */
 9345 bruce                     478 GIC     8900856 :     pfree(so);
 9770 scrappy                   479 CBC     8900856 : }
 9770 scrappy                   480 ECB             : 
                                481                 : /*
                                482                 :  *  btmarkpos() -- save current scan position
                                483                 :  */
                                484                 : void
 2639 tgl                       485 GIC       65029 : btmarkpos(IndexScanDesc scan)
 9770 scrappy                   486 ECB             : {
 6181 tgl                       487 GIC       65029 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
 9345 bruce                     488 ECB             : 
                                489                 :     /* There may be an old mark with a pin (but no lock). */
 2937 kgrittn                   490 GIC       65029 :     BTScanPosUnpinIfPinned(so->markPos);
 9345 bruce                     491 ECB             : 
                                492                 :     /*
                                493                 :      * Just record the current itemIndex.  If we later step to next page
                                494                 :      * before releasing the marked position, _bt_steppage makes a full copy of
                                495                 :      * the currPos struct in markPos.  If (as often happens) the mark is moved
                                496                 :      * before we leave the page, we don't have to do that work.
                                497                 :      */
 6181 tgl                       498 GIC       65029 :     if (BTScanPosIsValid(so->currPos))
 6072 tgl                       499 CBC       65029 :         so->markItemIndex = so->currPos.itemIndex;
 6072 tgl                       500 ECB             :     else
                                501                 :     {
 2937 kgrittn                   502 UIC           0 :         BTScanPosInvalidate(so->markPos);
 6072 tgl                       503 UBC           0 :         so->markItemIndex = -1;
 2937 kgrittn                   504 EUB             :     }
                                505                 : 
                                506                 :     /* Also record the current positions of any array keys */
 3846 tgl                       507 GIC       65029 :     if (so->numArrayKeys)
 3846 tgl                       508 CBC           3 :         _bt_mark_array_keys(scan);
 9770 scrappy                   509           65029 : }
 9770 scrappy                   510 ECB             : 
                                511                 : /*
                                512                 :  *  btrestrpos() -- restore scan to last saved position
                                513                 :  */
                                514                 : void
 2639 tgl                       515 GIC       27015 : btrestrpos(IndexScanDesc scan)
 9770 scrappy                   516 ECB             : {
 6181 tgl                       517 GIC       27015 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
 9345 bruce                     518 ECB             : 
                                519                 :     /* Restore the marked positions of any array keys */
 3846 tgl                       520 GIC       27015 :     if (so->numArrayKeys)
 3846 tgl                       521 CBC           3 :         _bt_restore_array_keys(scan);
 3846 tgl                       522 ECB             : 
 6072 tgl                       523 GIC       27015 :     if (so->markItemIndex >= 0)
 9345 bruce                     524 ECB             :     {
                                525                 :         /*
                                526                 :          * The scan has never moved to a new page since the last mark.  Just
                                527                 :          * restore the itemIndex.
                                528                 :          *
                                529                 :          * NB: In this case we can't count on anything in so->markPos to be
                                530                 :          * accurate.
                                531                 :          */
 6072 tgl                       532 GIC       26961 :         so->currPos.itemIndex = so->markItemIndex;
 6031 bruce                     533 ECB             :     }
                                534                 :     else
                                535                 :     {
                                536                 :         /*
                                537                 :          * The scan moved to a new page after last mark or restore, and we are
                                538                 :          * now restoring to the marked page.  We aren't holding any read
                                539                 :          * locks, but if we're still holding the pin for the current position,
                                540                 :          * we must drop it.
                                541                 :          */
 6072 tgl                       542 GIC          54 :         if (BTScanPosIsValid(so->currPos))
 6072 tgl                       543 ECB             :         {
                                544                 :             /* Before leaving current page, deal with any killed items */
 2937 kgrittn                   545 GIC          54 :             if (so->numKilled > 0)
 2937 kgrittn                   546 LBC           0 :                 _bt_killitems(scan);
 2937 kgrittn                   547 GBC          54 :             BTScanPosUnpinIfPinned(so->currPos);
 6072 tgl                       548 ECB             :         }
                                549                 : 
 6072 tgl                       550 GIC          54 :         if (BTScanPosIsValid(so->markPos))
 6072 tgl                       551 ECB             :         {
                                552                 :             /* bump pin on mark buffer for assignment to current buffer */
 2937 kgrittn                   553 GIC          54 :             if (BTScanPosIsPinned(so->markPos))
 2937 kgrittn                   554 LBC           0 :                 IncrBufferRefCount(so->markPos.buf);
 6072 tgl                       555 GBC          54 :             memcpy(&so->currPos, &so->markPos,
 6072 tgl                       556 ECB             :                    offsetof(BTScanPosData, items[1]) +
 6072 tgl                       557 GIC          54 :                    so->markPos.lastItem * sizeof(BTScanPosItem));
 4200 tgl                       558 CBC          54 :             if (so->currTuples)
 4200 tgl                       559 LBC           0 :                 memcpy(so->currTuples, so->markTuples,
 4200 tgl                       560 UBC           0 :                        so->markPos.nextTupleOffset);
 6072 tgl                       561 EUB             :         }
                                562                 :         else
 2937 kgrittn                   563 UIC           0 :             BTScanPosInvalidate(so->currPos);
 9345 bruce                     564 EUB             :     }
 9770 scrappy                   565 GIC       27015 : }
 9770 scrappy                   566 ECB             : 
                                567                 : /*
                                568                 :  * btestimateparallelscan -- estimate storage for BTParallelScanDescData
                                569                 :  */
                                570                 : Size
 2244 rhaas                     571 GIC          26 : btestimateparallelscan(void)
 2244 rhaas                     572 ECB             : {
 2244 rhaas                     573 GIC          26 :     return sizeof(BTParallelScanDescData);
 2244 rhaas                     574 ECB             : }
                                575                 : 
                                576                 : /*
                                577                 :  * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
                                578                 :  */
                                579                 : void
 2244 rhaas                     580 GIC          26 : btinitparallelscan(void *target)
 2244 rhaas                     581 ECB             : {
 2244 rhaas                     582 GIC          26 :     BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
 2244 rhaas                     583 ECB             : 
 2244 rhaas                     584 GIC          26 :     SpinLockInit(&bt_target->btps_mutex);
 2244 rhaas                     585 CBC          26 :     bt_target->btps_scanPage = InvalidBlockNumber;
                                586              26 :     bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
                                587              26 :     bt_target->btps_arrayKeyCount = 0;
                                588              26 :     ConditionVariableInit(&bt_target->btps_cv);
                                589              26 : }
 2244 rhaas                     590 ECB             : 
                                591                 : /*
                                592                 :  *  btparallelrescan() -- reset parallel scan
                                593                 :  */
                                594                 : void
 2244 rhaas                     595 GIC          12 : btparallelrescan(IndexScanDesc scan)
 2244 rhaas                     596 ECB             : {
                                597                 :     BTParallelScanDesc btscan;
 2244 rhaas                     598 GIC          12 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 2244 rhaas                     599 ECB             : 
 2244 rhaas                     600 GIC          12 :     Assert(parallel_scan);
 2244 rhaas                     601 ECB             : 
 2244 rhaas                     602 GIC          12 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 2244 rhaas                     603 ECB             :                                                   parallel_scan->ps_offset);
                                604                 : 
                                605                 :     /*
                                606                 :      * In theory, we don't need to acquire the spinlock here, because there
                                607                 :      * shouldn't be any other workers running at this point, but we do so for
                                608                 :      * consistency.
                                609                 :      */
 2244 rhaas                     610 GIC          12 :     SpinLockAcquire(&btscan->btps_mutex);
 2244 rhaas                     611 CBC          12 :     btscan->btps_scanPage = InvalidBlockNumber;
                                612              12 :     btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
                                613              12 :     btscan->btps_arrayKeyCount = 0;
                                614              12 :     SpinLockRelease(&btscan->btps_mutex);
                                615              12 : }
 2244 rhaas                     616 ECB             : 
                                617                 : /*
                                618                 :  * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
                                619                 :  *      page.  Other scans must wait until we call _bt_parallel_release()
                                620                 :  *      or _bt_parallel_done().
                                621                 :  *
                                622                 :  * The return value is true if we successfully seized the scan and false
                                623                 :  * if we did not.  The latter case occurs if no pages remain for the current
                                624                 :  * set of scankeys.
                                625                 :  *
                                626                 :  * If the return value is true, *pageno returns the next or current page
                                627                 :  * of the scan (depending on the scan direction).  An invalid block number
                                628                 :  * means the scan hasn't yet started, and P_NONE means we've reached the end.
                                629                 :  * The first time a participating process reaches the last page, it will return
                                630                 :  * true and set *pageno to P_NONE; after that, further attempts to seize the
                                631                 :  * scan will return false.
                                632                 :  *
                                633                 :  * Callers should ignore the value of pageno if the return value is false.
                                634                 :  */
                                635                 : bool
 2244 rhaas                     636 GIC         828 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
 2244 rhaas                     637 ECB             : {
 2244 rhaas                     638 GIC         828 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
 2244 rhaas                     639 ECB             :     BTPS_State  pageStatus;
 2244 rhaas                     640 GIC         828 :     bool        exit_loop = false;
 2244 rhaas                     641 CBC         828 :     bool        status = true;
                                642             828 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 2244 rhaas                     643 ECB             :     BTParallelScanDesc btscan;
                                644                 : 
 2244 rhaas                     645 GIC         828 :     *pageno = P_NONE;
 2244 rhaas                     646 ECB             : 
 2244 rhaas                     647 GIC         828 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 2244 rhaas                     648 ECB             :                                                   parallel_scan->ps_offset);
                                649                 : 
                                650                 :     while (1)
                                651                 :     {
 2244 rhaas                     652 GIC         838 :         SpinLockAcquire(&btscan->btps_mutex);
 2244 rhaas                     653 CBC         838 :         pageStatus = btscan->btps_pageStatus;
 2244 rhaas                     654 ECB             : 
 2244 rhaas                     655 GIC         838 :         if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
 2244 rhaas                     656 ECB             :         {
                                657                 :             /* Parallel scan has already advanced to a new set of scankeys. */
 2244 rhaas                     658 UIC           0 :             status = false;
 2244 rhaas                     659 EUB             :         }
 2244 rhaas                     660 GIC         838 :         else if (pageStatus == BTPARALLEL_DONE)
 2244 rhaas                     661 ECB             :         {
                                662                 :             /*
                                663                 :              * We're done with this set of scankeys.  This may be the end, or
                                664                 :              * there could be more sets to try.
                                665                 :              */
 2244 rhaas                     666 GIC         146 :             status = false;
 2244 rhaas                     667 ECB             :         }
 2244 rhaas                     668 GIC         692 :         else if (pageStatus != BTPARALLEL_ADVANCING)
 2244 rhaas                     669 ECB             :         {
                                670                 :             /*
                                671                 :              * We have successfully seized control of the scan for the purpose
                                672                 :              * of advancing it to a new page!
                                673                 :              */
 2244 rhaas                     674 GIC         682 :             btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
 2244 rhaas                     675 CBC         682 :             *pageno = btscan->btps_scanPage;
                                676             682 :             exit_loop = true;
 2244 rhaas                     677 ECB             :         }
 2244 rhaas                     678 GIC         838 :         SpinLockRelease(&btscan->btps_mutex);
 2244 rhaas                     679 CBC         838 :         if (exit_loop || !status)
 2244 rhaas                     680 ECB             :             break;
 2244 rhaas                     681 GIC          10 :         ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
 2244 rhaas                     682 ECB             :     }
 2244 rhaas                     683 GIC         828 :     ConditionVariableCancelSleep();
 2244 rhaas                     684 ECB             : 
 2244 rhaas                     685 GIC         828 :     return status;
 2244 rhaas                     686 ECB             : }
                                687                 : 
                                688                 : /*
                                689                 :  * _bt_parallel_release() -- Complete the process of advancing the scan to a
                                690                 :  *      new page.  We now have the new value btps_scanPage; some other backend
                                691                 :  *      can now begin advancing the scan.
                                692                 :  */
                                693                 : void
 2244 rhaas                     694 GIC         644 : _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
 2244 rhaas                     695 ECB             : {
 2244 rhaas                     696 GIC         644 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 2244 rhaas                     697 ECB             :     BTParallelScanDesc btscan;
                                698                 : 
 2244 rhaas                     699 GIC         644 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 2244 rhaas                     700 ECB             :                                                   parallel_scan->ps_offset);
                                701                 : 
 2244 rhaas                     702 GIC         644 :     SpinLockAcquire(&btscan->btps_mutex);
 2244 rhaas                     703 CBC         644 :     btscan->btps_scanPage = scan_page;
                                704             644 :     btscan->btps_pageStatus = BTPARALLEL_IDLE;
                                705             644 :     SpinLockRelease(&btscan->btps_mutex);
                                706             644 :     ConditionVariableSignal(&btscan->btps_cv);
                                707             644 : }
 2244 rhaas                     708 ECB             : 
                                709                 : /*
                                710                 :  * _bt_parallel_done() -- Mark the parallel scan as complete.
                                711                 :  *
                                712                 :  * When there are no pages left to scan, this function should be called to
                                713                 :  * notify other workers.  Otherwise, they might wait forever for the scan to
                                714                 :  * advance to the next page.
                                715                 :  */
                                716                 : void
 2244 rhaas                     717 GIC     5227696 : _bt_parallel_done(IndexScanDesc scan)
 2244 rhaas                     718 ECB             : {
 2244 rhaas                     719 GIC     5227696 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
 2244 rhaas                     720 CBC     5227696 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 2244 rhaas                     721 ECB             :     BTParallelScanDesc btscan;
 2244 rhaas                     722 GIC     5227696 :     bool        status_changed = false;
 2244 rhaas                     723 ECB             : 
                                724                 :     /* Do nothing, for non-parallel scans */
 2244 rhaas                     725 GIC     5227696 :     if (parallel_scan == NULL)
 2244 rhaas                     726 CBC     5227658 :         return;
 2244 rhaas                     727 ECB             : 
 2244 rhaas                     728 GIC          38 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 2244 rhaas                     729 ECB             :                                                   parallel_scan->ps_offset);
                                730                 : 
                                731                 :     /*
                                732                 :      * Mark the parallel scan as done for this combination of scan keys,
                                733                 :      * unless some other process already did so.  See also
                                734                 :      * _bt_advance_array_keys.
                                735                 :      */
 2244 rhaas                     736 GIC          38 :     SpinLockAcquire(&btscan->btps_mutex);
 2244 rhaas                     737 CBC          38 :     if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
                                738              38 :         btscan->btps_pageStatus != BTPARALLEL_DONE)
 2244 rhaas                     739 ECB             :     {
 2244 rhaas                     740 GIC          38 :         btscan->btps_pageStatus = BTPARALLEL_DONE;
 2244 rhaas                     741 CBC          38 :         status_changed = true;
 2244 rhaas                     742 ECB             :     }
 2244 rhaas                     743 GIC          38 :     SpinLockRelease(&btscan->btps_mutex);
 2244 rhaas                     744 ECB             : 
                                745                 :     /* wake up all the workers associated with this parallel scan */
 2244 rhaas                     746 GIC          38 :     if (status_changed)
 2244 rhaas                     747 CBC          38 :         ConditionVariableBroadcast(&btscan->btps_cv);
 2244 rhaas                     748 ECB             : }
                                749                 : 
                                750                 : /*
                                751                 :  * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
                                752                 :  *          keys.
                                753                 :  *
                                754                 :  * Updates the count of array keys processed for both local and parallel
                                755                 :  * scans.
                                756                 :  */
                                757                 : void
 2244 rhaas                     758 UIC           0 : _bt_parallel_advance_array_keys(IndexScanDesc scan)
 2244 rhaas                     759 EUB             : {
 2244 rhaas                     760 UIC           0 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
 2244 rhaas                     761 UBC           0 :     ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
 2244 rhaas                     762 EUB             :     BTParallelScanDesc btscan;
                                763                 : 
 2244 rhaas                     764 UIC           0 :     btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
 2244 rhaas                     765 EUB             :                                                   parallel_scan->ps_offset);
                                766                 : 
 2244 rhaas                     767 UIC           0 :     so->arrayKeyCount++;
 2244 rhaas                     768 UBC           0 :     SpinLockAcquire(&btscan->btps_mutex);
                                769               0 :     if (btscan->btps_pageStatus == BTPARALLEL_DONE)
 2244 rhaas                     770 EUB             :     {
 2244 rhaas                     771 UIC           0 :         btscan->btps_scanPage = InvalidBlockNumber;
 2244 rhaas                     772 UBC           0 :         btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
                                773               0 :         btscan->btps_arrayKeyCount++;
 2244 rhaas                     774 EUB             :     }
 2244 rhaas                     775 UIC           0 :     SpinLockRelease(&btscan->btps_mutex);
 2244 rhaas                     776 UBC           0 : }
 2244 rhaas                     777 EUB             : 
                                778                 : /*
                                779                 :  * Bulk deletion of all index entries pointing to a set of heap tuples.
                                780                 :  * The set of target tuples is specified via a callback routine that tells
                                781                 :  * whether any given heap tuple (identified by ItemPointer) is being deleted.
                                782                 :  *
                                783                 :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
                                784                 :  */
                                785                 : IndexBulkDeleteResult *
 2639 tgl                       786 GIC        4011 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 2639 tgl                       787 ECB             :              IndexBulkDeleteCallback callback, void *callback_state)
                                788                 : {
 6186 tgl                       789 GIC        4011 :     Relation    rel = info->index;
 6180 tgl                       790 ECB             :     BTCycleId   cycleid;
                                791                 : 
                                792                 :     /* allocate stats if first time through, else re-use existing struct */
 6180 tgl                       793 GIC        4011 :     if (stats == NULL)
 6180 tgl                       794 CBC        4011 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
 6266 tgl                       795 ECB             : 
                                796                 :     /* Establish the vacuum cycle ID to use for this scan */
                                797                 :     /* The ENSURE stuff ensures we clean up shared memory on failure */
 5471 tgl                       798 GIC        4011 :     PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 7938 tgl                       799 ECB             :     {
 6180 tgl                       800 GIC        4011 :         cycleid = _bt_start_vacuum(rel);
 7188 bruce                     801 ECB             : 
 1073 pg                        802 GIC        4011 :         btvacuumscan(info, stats, callback, callback_state, cycleid);
 6180 tgl                       803 ECB             :     }
 5471 tgl                       804 GIC        4011 :     PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
 5471 tgl                       805 CBC        4011 :     _bt_end_vacuum(rel);
 7938 tgl                       806 ECB             : 
 2639 tgl                       807 GIC        4011 :     return stats;
 9770 scrappy                   808 ECB             : }
                                809                 : 
                                810                 : /*
                                811                 :  * Post-VACUUM cleanup.
                                812                 :  *
                                813                 :  * Result: a palloc'd struct containing statistical info for VACUUM displays.
                                814                 :  */
                                815                 : IndexBulkDeleteResult *
 2639 tgl                       816 GIC       94456 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 7351 tgl                       817 ECB             : {
                                818                 :     BlockNumber num_delpages;
                                819                 : 
                                820                 :     /* No-op in ANALYZE ONLY mode */
 5129 tgl                       821 GIC       94456 :     if (info->analyze_only)
 2639 tgl                       822 CBC       39995 :         return stats;
 5129 tgl                       823 ECB             : 
                                824                 :     /*
                                825                 :      * If btbulkdelete was called, we need not do anything (we just maintain
                                826                 :      * the information used within _bt_vacuum_needs_cleanup() by calling
                                827                 :      * _bt_set_cleanup_info() below).
                                828                 :      *
                                829                 :      * If btbulkdelete was _not_ called, then we have a choice to make: we
                                830                 :      * must decide whether or not a btvacuumscan() call is needed now (i.e.
                                831                 :      * whether the ongoing VACUUM operation can entirely avoid a physical scan
                                832                 :      * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
                                833                 :      * now.
                                834                 :      */
 6186 tgl                       835 GIC       54461 :     if (stats == NULL)
 6180 tgl                       836 ECB             :     {
                                837                 :         /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
    8 andres                    838 GNC       50713 :         if (!_bt_vacuum_needs_cleanup(info->index, info->heaprel))
 1831 teodor                    839 CBC       50708 :             return NULL;
 1831 teodor                    840 ECB             : 
                                841                 :         /*
                                842                 :          * Since we aren't going to actually delete any leaf items, there's no
                                843                 :          * need to go through all the vacuum-cycle-ID pushups here.
                                844                 :          *
                                845                 :          * Posting list tuples are a source of inaccuracy for cleanup-only
                                846                 :          * scans.  btvacuumscan() will assume that the number of index tuples
                                847                 :          * from each page can be used as num_index_tuples, even though
                                848                 :          * num_index_tuples is supposed to represent the number of TIDs in the
                                849                 :          * index.  This naive approach can underestimate the number of tuples
                                850                 :          * in the index significantly.
                                851                 :          *
                                852                 :          * We handle the problem by making num_index_tuples an estimate in
                                853                 :          * cleanup-only case.
                                854                 :          */
 6186 tgl                       855 GIC           5 :         stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
 1073 pg                        856 CBC           5 :         btvacuumscan(info, stats, NULL, NULL, 0);
  760                           857               5 :         stats->estimated_count = true;
 6180 tgl                       858 ECB             :     }
                                859                 : 
                                860                 :     /*
                                861                 :      * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
                                862                 :      *
                                863                 :      * num_delpages is the number of deleted pages now in the index that were
                                864                 :      * not safe to place in the FSM to be recycled just yet.  num_delpages is
                                865                 :      * greater than 0 only when _bt_pagedel() actually deleted pages during
                                866                 :      * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
                                867                 :      * have failed to place any newly deleted pages in the FSM just moments
                                868                 :      * ago.  (Actually, there are edge cases where recycling of the current
                                869                 :      * VACUUM's newly deleted pages does not even become safe by the time the
                                870                 :      * next VACUUM comes around.  See nbtree/README.)
                                871                 :      */
  774 pg                        872 GIC        3753 :     Assert(stats->pages_deleted >= stats->pages_free);
  774 pg                        873 CBC        3753 :     num_delpages = stats->pages_deleted - stats->pages_free;
    8 andres                    874 GNC        3753 :     _bt_set_cleanup_info(info->index, info->heaprel, num_delpages);
  774 pg                        875 ECB             : 
                                876                 :     /*
                                877                 :      * It's quite possible for us to be fooled by concurrent page splits into
                                878                 :      * double-counting some index tuples, so disbelieve any total that exceeds
                                879                 :      * the underlying heap's count ... if we know that accurately.  Otherwise
                                880                 :      * this might just make matters worse.
                                881                 :      */
 4808 tgl                       882 GIC        3753 :     if (!info->estimated_count)
 6180 tgl                       883 ECB             :     {
 6180 tgl                       884 GIC        3731 :         if (stats->num_index_tuples > info->num_heap_tuples)
 6180 tgl                       885 CBC           7 :             stats->num_index_tuples = info->num_heap_tuples;
 6180 tgl                       886 ECB             :     }
                                887                 : 
 2639 tgl                       888 GIC        3753 :     return stats;
 6180 tgl                       889 ECB             : }
                                890                 : 
                                891                 : /*
                                892                 :  * btvacuumscan --- scan the index for VACUUMing purposes
                                893                 :  *
                                894                 :  * This combines the functions of looking for leaf tuples that are deletable
                                895                 :  * according to the vacuum callback, looking for empty pages that can be
                                896                 :  * deleted, and looking for old deleted pages that can be recycled.  Both
                                897                 :  * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
                                898                 :  * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
                                899                 :  *
                                900                 :  * The caller is responsible for initially allocating/zeroing a stats struct
                                901                 :  * and for obtaining a vacuum cycle ID if necessary.
                                902                 :  */
                                903                 : static void
 6180 tgl                       904 GIC        4016 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 6180 tgl                       905 ECB             :              IndexBulkDeleteCallback callback, void *callback_state,
                                906                 :              BTCycleId cycleid)
                                907                 : {
 6180 tgl                       908 GIC        4016 :     Relation    rel = info->index;
 6180 tgl                       909 ECB             :     BTVacState  vstate;
                                910                 :     BlockNumber num_pages;
                                911                 :     BlockNumber scanblkno;
                                912                 :     bool        needLock;
                                913                 : 
                                914                 :     /*
                                915                 :      * Reset fields that track information about the entire index now.  This
                                916                 :      * avoids double-counting in the case where a single VACUUM command
                                917                 :      * requires multiple scans of the index.
                                918                 :      *
                                919                 :      * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
                                920                 :      * since they track information about the VACUUM command, and so must last
                                921                 :      * across each call to btvacuumscan().
                                922                 :      *
                                923                 :      * (Note that pages_free is treated as state about the whole index, not
                                924                 :      * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
                                925                 :      * calls are idempotent, and get repeated for the same deleted pages in
                                926                 :      * some scenarios.  The point for us is to track the number of recyclable
                                927                 :      * pages in the index at the end of the VACUUM command.)
                                928                 :      */
  774 pg                        929 GIC        4016 :     stats->num_pages = 0;
 6180 tgl                       930 CBC        4016 :     stats->num_index_tuples = 0;
                                931            4016 :     stats->pages_deleted = 0;
  774 pg                        932            4016 :     stats->pages_free = 0;
 6180 tgl                       933 ECB             : 
                                934                 :     /* Set up info to pass down to btvacuumpage */
 6180 tgl                       935 GIC        4016 :     vstate.info = info;
 6180 tgl                       936 CBC        4016 :     vstate.stats = stats;
                                937            4016 :     vstate.callback = callback;
                                938            4016 :     vstate.callback_state = callback_state;
                                939            4016 :     vstate.cycleid = cycleid;
 7351 tgl                       940 ECB             : 
                                941                 :     /* Create a temporary memory context to run _bt_pagedel in */
 6180 tgl                       942 GIC        4016 :     vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
 6180 tgl                       943 ECB             :                                                   "_bt_pagedel",
                                944                 :                                                   ALLOCSET_DEFAULT_SIZES);
                                945                 : 
                                946                 :     /* Initialize vstate fields used by _bt_pendingfsm_finalize */
  749 pg                        947 GIC        4016 :     vstate.bufsize = 0;
  749 pg                        948 CBC        4016 :     vstate.maxbufsize = 0;
                                949            4016 :     vstate.pendingpages = NULL;
                                950            4016 :     vstate.npendingpages = 0;
  749 pg                        951 ECB             :     /* Consider applying _bt_pendingfsm_finalize optimization */
  749 pg                        952 GIC        4016 :     _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
  749 pg                        953 ECB             : 
                                954                 :     /*
                                955                 :      * The outer loop iterates over all index pages except the metapage, in
                                956                 :      * physical order (we hope the kernel will cooperate in providing
                                957                 :      * read-ahead for speed).  It is critical that we visit all leaf pages,
                                958                 :      * including ones added after we start the scan, else we might fail to
                                959                 :      * delete some deletable tuples.  Hence, we must repeatedly check the
                                960                 :      * relation length.  We must acquire the relation-extension lock while
                                961                 :      * doing so to avoid a race condition: if someone else is extending the
                                962                 :      * relation, there is a window where bufmgr/smgr have created a new
                                963                 :      * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
                                964                 :      * we manage to scan such a page here, we'll improperly assume it can be
                                965                 :      * recycled.  Taking the lock synchronizes things enough to prevent a
                                966                 :      * problem: either num_pages won't include the new page, or _bt_getbuf
                                967                 :      * already has write lock on the buffer and it will be fully initialized
                                968                 :      * before we can examine it.  Also, we need not worry if a page is added
                                969                 :      * immediately after we look; the page splitting code already has
                                970                 :      * write-lock on the left page before it adds a right page, so we must
                                971                 :      * already have processed any tuples due to be moved into such a page.
                                972                 :      *
                                973                 :      * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
                                974                 :      * think the use of the extension lock is still required.
                                975                 :      *
                                976                 :      * We can skip locking for new or temp relations, however, since no one
                                977                 :      * else could be accessing them.
                                978                 :      */
 6180 tgl                       979 GIC        4016 :     needLock = !RELATION_IS_LOCAL(rel);
                                980                 : 
 1072 pg                        981            4016 :     scanblkno = BTREE_METAPAGE + 1;
                                982                 :     for (;;)
 6180 tgl                       983 ECB             :     {
                                984                 :         /* Get the current relation length */
 6180 tgl                       985 CBC        7935 :         if (needLock)
 6180 tgl                       986 GIC        7933 :             LockRelationForExtension(rel, ExclusiveLock);
                                987            7935 :         num_pages = RelationGetNumberOfBlocks(rel);
                                988            7935 :         if (needLock)
 6180 tgl                       989 CBC        7933 :             UnlockRelationForExtension(rel, ExclusiveLock);
 6180 tgl                       990 ECB             : 
 1468 alvherre                  991 CBC        7935 :         if (info->report_progress)
                                992             429 :             pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
 1468 alvherre                  993 ECB             :                                          num_pages);
                                994                 : 
 6180 tgl                       995                 :         /* Quit if we've scanned the whole relation */
 1072 pg                        996 CBC        7935 :         if (scanblkno >= num_pages)
 6180 tgl                       997 GIC        4016 :             break;
                                998                 :         /* Iterate over pages, then loop back to recheck length */
 1072 pg                        999           36840 :         for (; scanblkno < num_pages; scanblkno++)
 6265 tgl                      1000 ECB             :         {
 1072 pg                       1001 CBC       32921 :             btvacuumpage(&vstate, scanblkno);
 1468 alvherre                 1002 GIC       32921 :             if (info->report_progress)
 1468 alvherre                 1003 CBC         211 :                 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
                               1004                 :                                              scanblkno);
 6265 tgl                      1005 ECB             :         }
 7351                          1006                 :     }
                               1007                 : 
                               1008                 :     /* Set statistics num_pages field to final size of index */
  774 pg                       1009 GIC        4016 :     stats->num_pages = num_pages;
                               1010                 : 
 6180 tgl                      1011            4016 :     MemoryContextDelete(vstate.pagedelcontext);
                               1012                 : 
 1836 tgl                      1013 ECB             :     /*
                               1014                 :      * If there were any calls to _bt_pagedel() during scan of the index then
  749 pg                       1015                 :      * see if any of the resulting pages can be placed in the FSM now.  When
                               1016                 :      * it's not safe we'll have to leave it up to a future VACUUM operation.
                               1017                 :      *
                               1018                 :      * Finally, if we placed any pages in the FSM (either just now or during
                               1019                 :      * the scan), forcibly update the upper-level FSM pages to ensure that
                               1020                 :      * searchers can find them.
                               1021                 :      */
  749 pg                       1022 GIC        4016 :     _bt_pendingfsm_finalize(rel, &vstate);
  774                          1023            4016 :     if (stats->pages_free > 0)
 1836 tgl                      1024              18 :         IndexFreeSpaceMapVacuum(rel);
 6180                          1025            4016 : }
 7351 tgl                      1026 ECB             : 
 6180                          1027                 : /*
                               1028                 :  * btvacuumpage --- VACUUM one page
                               1029                 :  *
                               1030                 :  * This processes a single page for btvacuumscan().  In some cases we must
                               1031                 :  * backtrack to re-examine and VACUUM pages that were the scanblkno during
                               1032                 :  * a previous call here.  This is how we handle page splits (that happened
                               1033                 :  * after our cycleid was acquired) whose right half page happened to reuse
                               1034                 :  * a block that we might have processed at some point before it was
                               1035                 :  * recycled (i.e. before the page split).
                               1036                 :  */
                               1037                 : static void
 1072 pg                       1038 GIC       32921 : btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
                               1039                 : {
 6180 tgl                      1040           32921 :     IndexVacuumInfo *info = vstate->info;
                               1041           32921 :     IndexBulkDeleteResult *stats = vstate->stats;
 6180 tgl                      1042 CBC       32921 :     IndexBulkDeleteCallback callback = vstate->callback;
 6180 tgl                      1043 GIC       32921 :     void       *callback_state = vstate->callback_state;
 6180 tgl                      1044 CBC       32921 :     Relation    rel = info->index;
    6 pg                       1045 GNC       32921 :     Relation    heaprel = info->heaprel;
 1072 pg                       1046 ECB             :     bool        attempt_pagedel;
 1060 tgl                      1047                 :     BlockNumber blkno,
                               1048                 :                 backtrack_to;
 6180                          1049                 :     Buffer      buf;
                               1050                 :     Page        page;
                               1051                 :     BTPageOpaque opaque;
                               1052                 : 
 1072 pg                       1053 GIC       32921 :     blkno = scanblkno;
                               1054                 : 
                               1055           32921 : backtrack:
                               1056                 : 
                               1057           32921 :     attempt_pagedel = false;
 1072 pg                       1058 CBC       32921 :     backtrack_to = P_NONE;
                               1059                 : 
 6180 tgl                      1060 ECB             :     /* call vacuum_delay_point while not holding any buffer lock */
 6180 tgl                      1061 GIC       32921 :     vacuum_delay_point();
 6180 tgl                      1062 ECB             : 
                               1063                 :     /*
                               1064                 :      * We can't use _bt_getbuf() here because it always applies
                               1065                 :      * _bt_checkpage(), which will barf on an all-zero page. We want to
 5793                          1066                 :      * recycle all-zero pages, not fail.  Also, we want to use a nondefault
                               1067                 :      * buffer access strategy.
                               1068                 :      */
 5273 heikki.linnakangas       1069 GIC       32921 :     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                               1070                 :                              info->strategy);
  992 pg                       1071           32921 :     _bt_lockbuf(rel, buf, BT_READ);
 2545 kgrittn                  1072           32921 :     page = BufferGetPage(buf);
 1072 pg                       1073           32921 :     opaque = NULL;
 6180 tgl                      1074 CBC       32921 :     if (!PageIsNew(page))
                               1075                 :     {
                               1076           32921 :         _bt_checkpage(rel, buf);
  373 michael                  1077           32921 :         opaque = BTPageGetOpaque(page);
 2813 heikki.linnakangas       1078 ECB             :     }
 6180 tgl                      1079                 : 
 1072 pg                       1080 GIC       32921 :     Assert(blkno <= scanblkno);
 1072 pg                       1081 CBC       32921 :     if (blkno != scanblkno)
 6180 tgl                      1082 ECB             :     {
                               1083                 :         /*
                               1084                 :          * We're backtracking.
 1072 pg                       1085                 :          *
                               1086                 :          * We followed a right link to a sibling leaf page (a page that
                               1087                 :          * happens to be from a block located before scanblkno).  The only
                               1088                 :          * case we want to do anything with is a live leaf page having the
                               1089                 :          * current vacuum cycle ID.
                               1090                 :          *
                               1091                 :          * The page had better be in a state that's consistent with what we
                               1092                 :          * expect.  Check for conditions that imply corruption in passing.  It
                               1093                 :          * can't be half-dead because only an interrupted VACUUM process can
                               1094                 :          * leave pages in that state, so we'd definitely have dealt with it
                               1095                 :          * back when the page was the scanblkno page (half-dead pages are
                               1096                 :          * always marked fully deleted by _bt_pagedel()).  This assumes that
                               1097                 :          * there can be only one vacuum process running at a time.
                               1098                 :          */
 1072 pg                       1099 UIC           0 :         if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
                               1100                 :         {
                               1101               0 :             Assert(false);
                               1102                 :             ereport(LOG,
                               1103                 :                     (errcode(ERRCODE_INDEX_CORRUPTED),
 1072 pg                       1104 EUB             :                      errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
                               1105                 :                                      blkno, scanblkno, RelationGetRelationName(rel))));
                               1106                 :             _bt_relbuf(rel, buf);
                               1107                 :             return;
                               1108                 :         }
                               1109                 : 
                               1110                 :         /*
                               1111                 :          * We may have already processed the page in an earlier call, when the
                               1112                 :          * page was scanblkno.  This happens when the leaf page split occurred
                               1113                 :          * after the scan began, but before the right sibling page became the
                               1114                 :          * scanblkno.
                               1115                 :          *
                               1116                 :          * Page may also have been deleted by current btvacuumpage() call,
                               1117                 :          * since _bt_pagedel() sometimes deletes the right sibling page of
                               1118                 :          * scanblkno in passing (it does so after we decided where to
                               1119                 :          * backtrack to).  We don't need to process this page as a deleted
                               1120                 :          * page a second time now (in fact, it would be wrong to count it as a
                               1121                 :          * deleted page in the bulk delete statistics a second time).
                               1122                 :          */
 1072 pg                       1123 UIC           0 :         if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
                               1124                 :         {
                               1125                 :             /* Done with current scanblkno (and all lower split pages) */
 6180 tgl                      1126               0 :             _bt_relbuf(rel, buf);
                               1127               0 :             return;
 6180 tgl                      1128 EUB             :         }
                               1129                 :     }
                               1130                 : 
    6 pg                       1131 GNC       32921 :     if (!opaque || BTPageIsRecyclable(page, heaprel))
 6180 tgl                      1132 EUB             :     {
                               1133                 :         /* Okay to recycle this page (which could be leaf or internal) */
 5304 heikki.linnakangas       1134 GIC         120 :         RecordFreeIndexPage(rel, blkno);
 6180 tgl                      1135             120 :         stats->pages_deleted++;
  774 pg                       1136 CBC         120 :         stats->pages_free++;
                               1137                 :     }
 6180 tgl                      1138 GIC       32801 :     else if (P_ISDELETED(opaque))
 6180 tgl                      1139 ECB             :     {
 1073 pg                       1140                 :         /*
                               1141                 :          * Already deleted page (which could be leaf or internal).  Can't
                               1142                 :          * recycle yet.
                               1143                 :          */
 6180 tgl                      1144 GIC          97 :         stats->pages_deleted++;
                               1145                 :     }
 6003                          1146           32704 :     else if (P_ISHALFDEAD(opaque))
                               1147                 :     {
                               1148                 :         /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
  773 pg                       1149 LBC           0 :         attempt_pagedel = true;
                               1150                 : 
 1073 pg                       1151 ECB             :         /*
                               1152                 :          * _bt_pagedel() will increment both pages_newly_deleted and
                               1153                 :          * pages_deleted stats in all cases (barring corruption)
 1073 pg                       1154 EUB             :          */
                               1155                 :     }
 6180 tgl                      1156 GIC       32704 :     else if (P_ISLEAF(opaque))
                               1157                 :     {
                               1158                 :         OffsetNumber deletable[MaxIndexTuplesPerPage];
                               1159                 :         int         ndeletable;
                               1160                 :         BTVacuumPosting updatable[MaxIndexTuplesPerPage];
 1138 pg                       1161 ECB             :         int         nupdatable;
                               1162                 :         OffsetNumber offnum,
                               1163                 :                     minoff,
                               1164                 :                     maxoff;
                               1165                 :         int         nhtidsdead,
                               1166                 :                     nhtidslive;
                               1167                 : 
                               1168                 :         /*
                               1169                 :          * Trade in the initial read lock for a full cleanup lock on this
                               1170                 :          * page.  We must get such a lock on every leaf page over the course
                               1171                 :          * of the vacuum scan, whether or not it actually contains any
                               1172                 :          * deletable tuples --- see nbtree/README.
                               1173                 :          */
  992 pg                       1174 GIC       30039 :         _bt_upgradelockbufcleanup(rel, buf);
                               1175                 : 
                               1176                 :         /*
                               1177                 :          * Check whether we need to backtrack to earlier pages.  What we are
                               1178                 :          * concerned about is a page split that happened since we started the
 1072 pg                       1179 ECB             :          * vacuum scan.  If the split moved tuples on the right half of the
                               1180                 :          * split (i.e. the tuples that sort high) to a block that we already
                               1181                 :          * passed over, then we might have missed the tuples.  We need to
                               1182                 :          * backtrack now.  (Must do this before possibly clearing btpo_cycleid
                               1183                 :          * or deleting scanblkno page below!)
                               1184                 :          */
 6180 tgl                      1185 GIC       30039 :         if (vstate->cycleid != 0 &&
                               1186           29981 :             opaque->btpo_cycleid == vstate->cycleid &&
 6180 tgl                      1187 UIC           0 :             !(opaque->btpo_flags & BTP_SPLIT_END) &&
                               1188               0 :             !P_RIGHTMOST(opaque) &&
 1072 pg                       1189               0 :             opaque->btpo_next < scanblkno)
 1072 pg                       1190 LBC           0 :             backtrack_to = opaque->btpo_next;
 6180 tgl                      1191 ECB             : 
 6180 tgl                      1192 GBC       30039 :         ndeletable = 0;
 1138 pg                       1193           30039 :         nupdatable = 0;
 6180 tgl                      1194           30039 :         minoff = P_FIRSTDATAKEY(opaque);
                               1195           30039 :         maxoff = PageGetMaxOffsetNumber(page);
 1138 pg                       1196 GIC       30039 :         nhtidsdead = 0;
 1138 pg                       1197 CBC       30039 :         nhtidslive = 0;
 6180 tgl                      1198           30039 :         if (callback)
 6180 tgl                      1199 ECB             :         {
  560 pg                       1200                 :             /* btbulkdelete callback tells us what to delete (or update) */
 6180 tgl                      1201 CBC       29981 :             for (offnum = minoff;
                               1202         5948136 :                  offnum <= maxoff;
                               1203         5918155 :                  offnum = OffsetNumberNext(offnum))
                               1204                 :             {
                               1205                 :                 IndexTuple  itup;
 6180 tgl                      1206 ECB             : 
 6180 tgl                      1207 CBC     5918155 :                 itup = (IndexTuple) PageGetItem(page,
 6180 tgl                      1208 ECB             :                                                 PageGetItemId(page, offnum));
                               1209                 : 
 1138 pg                       1210 GIC     5918155 :                 Assert(!BTreeTupleIsPivot(itup));
                               1211         5918155 :                 if (!BTreeTupleIsPosting(itup))
 1138 pg                       1212 ECB             :                 {
                               1213                 :                     /* Regular tuple, standard table TID representation */
 1138 pg                       1214 GIC     5713458 :                     if (callback(&itup->t_tid, callback_state))
 1138 pg                       1215 ECB             :                     {
 1138 pg                       1216 CBC      712323 :                         deletable[ndeletable++] = offnum;
 1138 pg                       1217 GIC      712323 :                         nhtidsdead++;
                               1218                 :                     }
 1138 pg                       1219 ECB             :                     else
 1138 pg                       1220 GIC     5001135 :                         nhtidslive++;
 1138 pg                       1221 ECB             :                 }
                               1222                 :                 else
                               1223                 :                 {
                               1224                 :                     BTVacuumPosting vacposting;
                               1225                 :                     int         nremaining;
                               1226                 : 
                               1227                 :                     /* Posting list tuple */
 1138 pg                       1228 GIC      204697 :                     vacposting = btreevacuumposting(vstate, itup, offnum,
                               1229                 :                                                     &nremaining);
                               1230          204697 :                     if (vacposting == NULL)
                               1231                 :                     {
                               1232                 :                         /*
 1138 pg                       1233 ECB             :                          * All table TIDs from the posting tuple remain, so no
                               1234                 :                          * delete or update required
                               1235                 :                          */
 1138 pg                       1236 GIC      118403 :                         Assert(nremaining == BTreeTupleGetNPosting(itup));
                               1237                 :                     }
                               1238           86294 :                     else if (nremaining > 0)
                               1239                 :                     {
                               1240                 : 
 1138 pg                       1241 ECB             :                         /*
                               1242                 :                          * Store metadata about posting list tuple in
                               1243                 :                          * updatable array for entire page.  Existing tuple
                               1244                 :                          * will be updated during the later call to
                               1245                 :                          * _bt_delitems_vacuum().
                               1246                 :                          */
 1138 pg                       1247 GIC       74159 :                         Assert(nremaining < BTreeTupleGetNPosting(itup));
                               1248           74159 :                         updatable[nupdatable++] = vacposting;
                               1249           74159 :                         nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
                               1250                 :                     }
                               1251                 :                     else
 1138 pg                       1252 ECB             :                     {
                               1253                 :                         /*
                               1254                 :                          * All table TIDs from the posting list must be
                               1255                 :                          * deleted.  We'll delete the index tuple completely
                               1256                 :                          * (no update required).
                               1257                 :                          */
 1138 pg                       1258 GIC       12135 :                         Assert(nremaining == 0);
                               1259           12135 :                         deletable[ndeletable++] = offnum;
                               1260           12135 :                         nhtidsdead += BTreeTupleGetNPosting(itup);
                               1261           12135 :                         pfree(vacposting);
                               1262                 :                     }
 1138 pg                       1263 ECB             : 
 1138 pg                       1264 CBC      204697 :                     nhtidslive += nremaining;
 1138 pg                       1265 ECB             :                 }
 6180 tgl                      1266                 :             }
                               1267                 :         }
                               1268                 : 
                               1269                 :         /*
                               1270                 :          * Apply any needed deletes or updates.  We issue just one
                               1271                 :          * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
                               1272                 :          */
 1138 pg                       1273 GIC       30039 :         if (ndeletable > 0 || nupdatable > 0)
                               1274                 :         {
 1072                          1275           16990 :             Assert(nhtidsdead >= ndeletable + nupdatable);
 1138                          1276           16990 :             _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
                               1277                 :                                 nupdatable);
 4859 simon                    1278 ECB             : 
 1138 pg                       1279 GIC       16990 :             stats->tuples_removed += nhtidsdead;
 6180 tgl                      1280 ECB             :             /* must recompute maxoff */
 6180 tgl                      1281 CBC       16990 :             maxoff = PageGetMaxOffsetNumber(page);
                               1282                 : 
                               1283                 :             /* can't leak memory here */
 1138 pg                       1284           91149 :             for (int i = 0; i < nupdatable; i++)
 1138 pg                       1285 GIC       74159 :                 pfree(updatable[i]);
 6180 tgl                      1286 ECB             :         }
                               1287                 :         else
                               1288                 :         {
                               1289                 :             /*
 1073 pg                       1290                 :              * If the leaf page has been split during this vacuum cycle, it
                               1291                 :              * seems worth expending a write to clear btpo_cycleid even if we
                               1292                 :              * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
                               1293                 :              * takes care of this.)  This ensures we won't process the page
                               1294                 :              * again.
                               1295                 :              *
                               1296                 :              * We treat this like a hint-bit update because there's no need to
                               1297                 :              * WAL-log it.
                               1298                 :              */
 1138 pg                       1299 GIC       13049 :             Assert(nhtidsdead == 0);
 6180 tgl                      1300           13049 :             if (vstate->cycleid != 0 &&
                               1301           12991 :                 opaque->btpo_cycleid == vstate->cycleid)
                               1302                 :             {
 6180 tgl                      1303 UIC           0 :                 opaque->btpo_cycleid = 0;
 3583 jdavis                   1304 LBC           0 :                 MarkBufferDirtyHint(buf, true);
 6180 tgl                      1305 ECB             :             }
                               1306                 :         }
                               1307                 : 
 6180 tgl                      1308 EUB             :         /*
 1073 pg                       1309                 :          * If the leaf page is now empty, try to delete it; else count the
                               1310                 :          * live tuples (live table TIDs in posting lists are counted as
                               1311                 :          * separate live tuples).  We don't delete when backtracking, though,
                               1312                 :          * since that would require teaching _bt_pagedel() about backtracking
                               1313                 :          * (doesn't seem worth adding more complexity to deal with that).
                               1314                 :          *
                               1315                 :          * We don't count the number of live TIDs during cleanup-only calls to
                               1316                 :          * btvacuumscan (i.e. when callback is not set).  We count the number
                               1317                 :          * of index tuples directly instead.  This avoids the expense of
                               1318                 :          * directly examining all of the tuples on each page.  VACUUM will
                               1319                 :          * treat num_index_tuples as an estimate in cleanup-only case, so it
                               1320                 :          * doesn't matter that this underestimates num_index_tuples
                               1321                 :          * significantly in some cases.
                               1322                 :          */
 6180 tgl                      1323 GIC       30039 :         if (minoff > maxoff)
 1072 pg                       1324            2730 :             attempt_pagedel = (blkno == scanblkno);
  886                          1325           27309 :         else if (callback)
 1138                          1326           27255 :             stats->num_index_tuples += nhtidslive;
                               1327                 :         else
  886 pg                       1328 CBC          54 :             stats->num_index_tuples += maxoff - minoff + 1;
 1138 pg                       1329 ECB             : 
 1072 pg                       1330 CBC       30039 :         Assert(!attempt_pagedel || nhtidslive == 0);
 6180 tgl                      1331 ECB             :     }
                               1332                 : 
 1072 pg                       1333 CBC       32921 :     if (attempt_pagedel)
                               1334                 :     {
 6180 tgl                      1335 ECB             :         MemoryContext oldcontext;
                               1336                 : 
                               1337                 :         /* Run pagedel in a temp context to avoid memory leakage */
 6180 tgl                      1338 CBC        2730 :         MemoryContextReset(vstate->pagedelcontext);
 6180 tgl                      1339 GIC        2730 :         oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
                               1340                 : 
                               1341                 :         /*
                               1342                 :          * _bt_pagedel maintains the bulk delete stats on our behalf;
  773 pg                       1343 ECB             :          * pages_newly_deleted and pages_deleted are likely to be incremented
                               1344                 :          * during call
                               1345                 :          */
 1072 pg                       1346 GIC        2730 :         Assert(blkno == scanblkno);
  773                          1347            2730 :         _bt_pagedel(rel, buf, vstate);
                               1348                 : 
 6180 tgl                      1349            2730 :         MemoryContextSwitchTo(oldcontext);
                               1350                 :         /* pagedel released buffer, so we shouldn't */
 6180 tgl                      1351 ECB             :     }
                               1352                 :     else
 6180 tgl                      1353 GIC       30191 :         _bt_relbuf(rel, buf);
 6180 tgl                      1354 ECB             : 
 1072 pg                       1355 GIC       32921 :     if (backtrack_to != P_NONE)
                               1356                 :     {
 1072 pg                       1357 UIC           0 :         blkno = backtrack_to;
 1072 pg                       1358 LBC           0 :         goto backtrack;
                               1359                 :     }
 7351 tgl                      1360 ECB             : }
                               1361                 : 
 1138 pg                       1362 EUB             : /*
                               1363                 :  * btreevacuumposting --- determine TIDs still needed in posting list
                               1364                 :  *
                               1365                 :  * Returns metadata describing how to build replacement tuple without the TIDs
                               1366                 :  * that VACUUM needs to delete.  Returned value is NULL in the common case
                               1367                 :  * where no changes are needed to caller's posting list tuple (we avoid
                               1368                 :  * allocating memory here as an optimization).
                               1369                 :  *
                               1370                 :  * The number of TIDs that should remain in the posting list tuple is set for
                               1371                 :  * caller in *nremaining.
                               1372                 :  */
                               1373                 : static BTVacuumPosting
 1138 pg                       1374 GIC      204697 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
                               1375                 :                    OffsetNumber updatedoffset, int *nremaining)
                               1376                 : {
                               1377          204697 :     int         live = 0;
                               1378          204697 :     int         nitem = BTreeTupleGetNPosting(posting);
 1138 pg                       1379 CBC      204697 :     ItemPointer items = BTreeTupleGetPosting(posting);
 1138 pg                       1380 GIC      204697 :     BTVacuumPosting vacposting = NULL;
                               1381                 : 
 1138 pg                       1382 CBC     1120832 :     for (int i = 0; i < nitem; i++)
 1138 pg                       1383 ECB             :     {
 1138 pg                       1384 CBC      916135 :         if (!vstate->callback(items + i, vstate->callback_state))
 1138 pg                       1385 ECB             :         {
                               1386                 :             /* Live table TID */
 1138 pg                       1387 CBC      744283 :             live++;
                               1388                 :         }
                               1389          171852 :         else if (vacposting == NULL)
                               1390                 :         {
                               1391                 :             /*
 1138 pg                       1392 ECB             :              * First dead table TID encountered.
                               1393                 :              *
                               1394                 :              * It's now clear that we need to delete one or more dead table
                               1395                 :              * TIDs, so start maintaining metadata describing how to update
                               1396                 :              * existing posting list tuple.
                               1397                 :              */
 1138 pg                       1398 GIC       86294 :             vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                               1399                 :                                 nitem * sizeof(uint16));
                               1400                 : 
                               1401           86294 :             vacposting->itup = posting;
                               1402           86294 :             vacposting->updatedoffset = updatedoffset;
 1138 pg                       1403 CBC       86294 :             vacposting->ndeletedtids = 0;
 1138 pg                       1404 GIC       86294 :             vacposting->deletetids[vacposting->ndeletedtids++] = i;
                               1405                 :         }
 1138 pg                       1406 ECB             :         else
                               1407                 :         {
                               1408                 :             /* Second or subsequent dead table TID */
 1138 pg                       1409 CBC       85558 :             vacposting->deletetids[vacposting->ndeletedtids++] = i;
                               1410                 :         }
                               1411                 :     }
                               1412                 : 
 1138 pg                       1413 GIC      204697 :     *nremaining = live;
 1138 pg                       1414 CBC      204697 :     return vacposting;
                               1415                 : }
                               1416                 : 
                               1417                 : /*
 4130 tgl                      1418 ECB             :  *  btcanreturn() -- Check whether btree indexes support index-only scans.
                               1419                 :  *
                               1420                 :  * btrees always do, so this is trivial.
                               1421                 :  */
                               1422                 : bool
 2639 tgl                      1423 GIC      398723 : btcanreturn(Relation index, int attno)
                               1424                 : {
                               1425          398723 :     return true;
                               1426                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a