LCOV - differential code coverage report
Current view:  top level - src/backend/access/table - tableam.c (source / functions)
Current:       Differential Code Coverage HEAD vs 15
Current Date:  2023-04-08 15:15:32
Baseline:      15
Baseline Date: 2023-04-08 15:09:40

            Coverage   Total   Hit   UIC   UBC   GBC   GIC   GNC   CBC   EUB   ECB   DCB
Lines:        89.7 %     185   166    12     7     1    76     4    85    11    76     5
Functions:   100.0 %      19    19     -     -     -    12     1     6     -    12     1

Legend: Lines: hit / not hit.  Differential categories (per genhtml): UIC = Uncovered
Included Code, UBC = Uncovered Baseline Code, GBC = Gained Baseline Coverage,
GIC = Gained Included Coverage, GNC = Gained New Coverage, CBC = Covered Baseline Code,
EUB = Excluded Uncovered Baseline, ECB = Excluded Covered Baseline, DCB = Deleted
Covered Baseline.

           TLA  Line data    Source code
       1                 : /*----------------------------------------------------------------------
       2                 :  *
       3                 :  * tableam.c
       4                 :  *      Table access method routines too big to be inline functions.
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/access/table/tableam.c
      12                 :  *
      13                 :  * NOTES
       14                 :  *    Note that most functions here are documented in tableam.h, rather than
       15                 :  *    here. That's because there are a lot of inline functions in tableam.h,
       16                 :  *    and it'd be harder to understand if one constantly had to switch between files.
      17                 :  *
      18                 :  *----------------------------------------------------------------------
      19                 :  */
      20                 : #include "postgres.h"
      21                 : 
      22                 : #include <math.h>
      23                 : 
      24                 : #include "access/syncscan.h"
      25                 : #include "access/tableam.h"
      26                 : #include "access/xact.h"
      27                 : #include "optimizer/plancat.h"
      28                 : #include "port/pg_bitutils.h"
      29                 : #include "storage/bufmgr.h"
      30                 : #include "storage/shmem.h"
      31                 : #include "storage/smgr.h"
      32                 : 
      33                 : /*
      34                 :  * Constants to control the behavior of block allocation to parallel workers
      35                 :  * during a parallel seqscan.  Technically these values do not need to be
       36                 :  * powers of 2, but having them as powers of 2 makes the math cheaper
      37                 :  * and makes the ramp-down stepping more even.
      38                 :  */
      39                 : 
      40                 : /* The number of I/O chunks we try to break a parallel seqscan down into */
      41                 : #define PARALLEL_SEQSCAN_NCHUNKS            2048
      42                 : /* Ramp down size of allocations when we've only this number of chunks left */
      43                 : #define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS    64
      44                 : /* Cap the size of parallel I/O chunks to this number of blocks */
      45                 : #define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE     8192
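
The three constants above feed the chunk-size arithmetic in
table_block_parallelscan_startblock_init(): split the relation into roughly
PARALLEL_SEQSCAN_NCHUNKS pieces, round the piece size up to a power of 2, and
cap it at PARALLEL_SEQSCAN_MAX_CHUNK_SIZE.  A minimal standalone C sketch of
that arithmetic (next_power_of_2() is a stand-in for PostgreSQL's
pg_nextpower2_32(); the three relation sizes are assumed for illustration):

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for PostgreSQL's pg_nextpower2_32(). */
    static uint32_t
    next_power_of_2(uint32_t num)
    {
        uint32_t    result = 1;

        while (result < num)
            result <<= 1;
        return result;
    }

    int
    main(void)
    {
        /* Assumed relation sizes, in blocks. */
        uint32_t    cases[] = {1000, 1048576, 104857600};

        for (int i = 0; i < 3; i++)
        {
            uint32_t    nblocks = cases[i];
            /* Aim for ~2048 chunks, rounding up to a power of 2 ... */
            uint32_t    chunk = next_power_of_2(nblocks / 2048 >= 1 ? nblocks / 2048 : 1);

            /* ... but never hand out more than 8192 blocks per chunk. */
            if (chunk > 8192)
                chunk = 8192;
            printf("nblocks=%u -> chunk_size=%u\n", nblocks, chunk);
        }
        return 0;
    }

This prints chunk sizes of 1, 512, and 8192 respectively for the three sizes.
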
      46                 : 
      47                 : /* GUC variables */
      48                 : char       *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
      49                 : bool        synchronize_seqscans = true;
      50                 : 
      51                 : 
      52                 : /* ----------------------------------------------------------------------------
      53                 :  * Slot functions.
      54                 :  * ----------------------------------------------------------------------------
      55                 :  */
      56                 : 
      57                 : const TupleTableSlotOps *
      58 CBC    15845404 : table_slot_callbacks(Relation relation)
      59                 : {
      60                 :     const TupleTableSlotOps *tts_cb;
      61                 : 
      62        15845404 :     if (relation->rd_tableam)
      63        15841525 :         tts_cb = relation->rd_tableam->slot_callbacks(relation);
      64            3879 :     else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
      65                 :     {
      66                 :         /*
      67                 :          * Historically FDWs expect to store heap tuples in slots. Continue
      68                 :          * handing them one, to make it less painful to adapt FDWs to new
      69                 :          * versions. The cost of a heap slot over a virtual slot is pretty
      70                 :          * small.
      71                 :          */
      72             208 :         tts_cb = &TTSOpsHeapTuple;
      73                 :     }
      74                 :     else
      75                 :     {
      76                 :         /*
      77                 :          * These need to be supported, as some parts of the code (like COPY)
      78                 :          * need to create slots for such relations too. It seems better to
      79                 :          * centralize the knowledge that a heap slot is the right thing in
      80                 :          * that case here.
      81                 :          */
      82            3671 :         Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
      83                 :                relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
      84            3671 :         tts_cb = &TTSOpsVirtual;
      85                 :     }
      86                 : 
      87        15845404 :     return tts_cb;
      88                 : }
      89                 : 
      90                 : TupleTableSlot *
      91        15666100 : table_slot_create(Relation relation, List **reglist)
      92                 : {
      93                 :     const TupleTableSlotOps *tts_cb;
      94                 :     TupleTableSlot *slot;
      95                 : 
      96        15666100 :     tts_cb = table_slot_callbacks(relation);
      97        15666100 :     slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);
      98                 : 
      99        15666100 :     if (reglist)
     100          142014 :         *reglist = lappend(*reglist, slot);
     101                 : 
     102        15666100 :     return slot;
     103                 : }
     104                 : 
     105                 : 
     106                 : /* ----------------------------------------------------------------------------
     107                 :  * Table scan functions.
     108                 :  * ----------------------------------------------------------------------------
     109                 :  */
     110                 : 
     111                 : TableScanDesc
     112          124520 : table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
     113                 : {
     114          124520 :     uint32      flags = SO_TYPE_SEQSCAN |
     115                 :     SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
     116          124520 :     Oid         relid = RelationGetRelid(relation);
     117          124520 :     Snapshot    snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
     118                 : 
     119          124520 :     return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
     120                 :                                             NULL, flags);
     121                 : }
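
A hedged fragment of how such a catalog scan is typically consumed; the
generic tableam accessors shown (table_scan_getnextslot(), table_endscan())
live in tableam.h, and error handling is omitted:

    TableScanDesc   scan = table_beginscan_catalog(rel, 0, NULL);
    TupleTableSlot *slot = table_slot_create(rel, NULL);

    while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
    {
        /* ... inspect the catalog row through the slot ... */
    }

    ExecDropSingleTupleTableSlot(slot);
    table_endscan(scan);   /* also unregisters the SO_TEMP_SNAPSHOT snapshot */
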
     122                 : 
     123                 : void
     124             138 : table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot)
     125                 : {
     126             138 :     Assert(IsMVCCSnapshot(snapshot));
     127                 : 
     128             138 :     RegisterSnapshot(snapshot);
     129             138 :     scan->rs_snapshot = snapshot;
     130             138 :     scan->rs_flags |= SO_TEMP_SNAPSHOT;
     131             138 : }
     132                 : 
     133                 : 
     134                 : /* ----------------------------------------------------------------------------
     135                 :  * Parallel table scan related functions.
     136                 :  * ----------------------------------------------------------------------------
     137                 :  */
     138                 : 
     139                 : Size
     140             512 : table_parallelscan_estimate(Relation rel, Snapshot snapshot)
     141                 : {
     142             512 :     Size        sz = 0;
     143                 : 
     144             512 :     if (IsMVCCSnapshot(snapshot))
     145             441 :         sz = add_size(sz, EstimateSnapshotSpace(snapshot));
     146                 :     else
     147              71 :         Assert(snapshot == SnapshotAny);
     148                 : 
     149             512 :     sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));
     150                 : 
     151             512 :     return sz;
     152                 : }
     153                 : 
     154                 : void
     155             512 : table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
     156                 :                               Snapshot snapshot)
     157                 : {
     158             512 :     Size        snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);
     159                 : 
     160             512 :     pscan->phs_snapshot_off = snapshot_off;
     161                 : 
     162             512 :     if (IsMVCCSnapshot(snapshot))
     163                 :     {
     164             441 :         SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
     165             441 :         pscan->phs_snapshot_any = false;
     166                 :     }
     167                 :     else
     168                 :     {
     169              71 :         Assert(snapshot == SnapshotAny);
     170              71 :         pscan->phs_snapshot_any = true;
     171                 :     }
     172             512 : }
     173                 : 
     174                 : TableScanDesc
     175 GNC        1895 : table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
     176                 : {
     177                 :     Snapshot    snapshot;
     178 CBC        1895 :     uint32      flags = SO_TYPE_SEQSCAN |
     179                 :     SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
     180                 : 
     181 GNC        1895 :     Assert(RelationGetRelid(relation) == pscan->phs_relid);
     182                 : 
     183            1895 :     if (!pscan->phs_snapshot_any)
     184                 :     {
     185                 :         /* Snapshot was serialized -- restore it */
     186            1753 :         snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
     187 CBC        1753 :         RegisterSnapshot(snapshot);
     188 GIC        1753 :         flags |= SO_TEMP_SNAPSHOT;
     189                 :     }
     190                 :     else
     191                 :     {
     192 ECB             :         /* SnapshotAny passed by caller (not serialized) */
     193 GIC         142 :         snapshot = SnapshotAny;
     194                 :     }
     195 ECB             : 
     196 GIC        1895 :     return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
     197                 :                                             pscan, flags);
     198                 : }
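
A hedged sketch of the leader/worker handshake around the three functions
above (shm_alloc() is a hypothetical stand-in for the shm_toc machinery the
executor really uses; error handling omitted):

    /* Leader: size, allocate, and fill the shared scan state. */
    Size        sz = table_parallelscan_estimate(rel, snapshot);
    ParallelTableScanDesc pscan = (ParallelTableScanDesc) shm_alloc(sz);

    table_parallelscan_initialize(rel, pscan, snapshot);

    /* Each worker (and the leader, if it participates) attaches to it. */
    TableScanDesc scan = table_beginscan_parallel(rel, pscan);
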
     199                 : 
     200                 : 
     201                 : /* ----------------------------------------------------------------------------
     202                 :  * Index scan related functions.
     203                 :  * ----------------------------------------------------------------------------
     204                 :  */
     205                 : 
     206                 : /*
      207                 :  * To perform this check, simply start an index scan, create the necessary
     208                 :  * slot, do the heap lookup, and shut everything down again. This could be
     209                 :  * optimized, but is unlikely to matter from a performance POV. If there
     210                 :  * frequently are live index pointers also matching a unique index key, the
     211                 :  * CPU overhead of this routine is unlikely to matter.
     212                 :  *
     213                 :  * Note that *tid may be modified when we return true if the AM supports
     214                 :  * storing multiple row versions reachable via a single index entry (like
     215                 :  * heap's HOT).
     216                 :  */
     217 ECB             : bool
     218 GIC     5817243 : table_index_fetch_tuple_check(Relation rel,
     219                 :                               ItemPointer tid,
     220                 :                               Snapshot snapshot,
     221                 :                               bool *all_dead)
     222                 : {
     223                 :     IndexFetchTableData *scan;
     224 ECB             :     TupleTableSlot *slot;
     225 GIC     5817243 :     bool        call_again = false;
     226                 :     bool        found;
     227 ECB             : 
     228 CBC     5817243 :     slot = table_slot_create(rel, NULL);
     229         5817243 :     scan = table_index_fetch_begin(rel);
     230 GIC     5817243 :     found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
     231 ECB             :                                     all_dead);
     232 CBC     5817243 :     table_index_fetch_end(scan);
     233 GIC     5817243 :     ExecDropSingleTupleTableSlot(slot);
     234 ECB             : 
     235 GIC     5817243 :     return found;
     236                 : }
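
A hedged usage fragment, in the style of a unique-constraint checker (tid and
snapshot are assumed to come from a preceding index probe):

    bool        all_dead = false;

    if (table_index_fetch_tuple_check(rel, &tid, snapshot, &all_dead))
    {
        /* some tuple version behind this index entry is visible: conflict */
    }
    else if (all_dead)
    {
        /* every version is dead to everyone: the index entry can be killed */
    }
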
     237                 : 
     238                 : 
     239                 : /* ------------------------------------------------------------------------
     240                 :  * Functions for non-modifying operations on individual tuples
     241                 :  * ------------------------------------------------------------------------
     242                 :  */
     243                 : 
     244 ECB             : void
     245 GIC         153 : table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
     246 ECB             : {
     247 CBC         153 :     Relation    rel = scan->rs_rd;
     248 GIC         153 :     const TableAmRoutine *tableam = rel->rd_tableam;
     249                 : 
     250                 :     /*
     251                 :      * We don't expect direct calls to table_tuple_get_latest_tid with valid
     252                 :      * CheckXidAlive for catalog or regular tables.  See detailed comments in
     253                 :      * xact.c where these variables are declared.
     254 ECB             :      */
     255 GBC         153 :     if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
     256 UIC           0 :         elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");
     257                 : 
     258                 :     /*
     259                 :      * Since this can be called with user-supplied TID, don't trust the input
     260                 :      * too much.
     261 ECB             :      */
     262 CBC         153 :     if (!tableam->tuple_tid_valid(scan, tid))
     263 GIC           6 :         ereport(ERROR,
     264                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     265                 :                  errmsg("tid (%u, %u) is not valid for relation \"%s\"",
     266                 :                         ItemPointerGetBlockNumberNoCheck(tid),
     267                 :                         ItemPointerGetOffsetNumberNoCheck(tid),
     268                 :                         RelationGetRelationName(rel))));
     269 ECB             : 
     270 CBC         147 :     tableam->tuple_get_latest_tid(scan, tid);
     271 GIC         147 : }
     272                 : 
     273                 : 
     274                 : /* ----------------------------------------------------------------------------
     275                 :  * Functions to make modifications a bit simpler.
     276                 :  * ----------------------------------------------------------------------------
     277                 :  */
     278                 : 
     279                 : /*
     280                 :  * simple_table_tuple_insert - insert a tuple
     281                 :  *
     282                 :  * Currently, this routine differs from table_tuple_insert only in supplying a
     283                 :  * default command ID and not allowing access to the speedup options.
     284                 :  */
     285 ECB             : void
     286 GIC       75587 : simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
     287 ECB             : {
     288 CBC       75587 :     table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
     289 GIC       75587 : }
     290                 : 
     291                 : /*
     292                 :  * simple_table_tuple_delete - delete a tuple
     293                 :  *
     294                 :  * This routine may be used to delete a tuple when concurrent updates of
     295                 :  * the target tuple are not expected (for example, because we have a lock
     296                 :  * on the relation associated with the tuple).  Any failure is reported
     297                 :  * via ereport().
     298                 :  */
     299 ECB             : void
     300 GIC       40299 : simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
     301                 : {
     302                 :     TM_Result   result;
     303                 :     TM_FailureData tmfd;
     304 ECB             : 
     305 GIC       40299 :     result = table_tuple_delete(rel, tid,
     306                 :                                 GetCurrentCommandId(true),
     307                 :                                 snapshot, InvalidSnapshot,
     308                 :                                 true /* wait for commit */ ,
     309                 :                                 &tmfd, false /* changingPart */ );
     310 ECB             : 
     311 GIC       40299 :     switch (result)
     312 EUB             :     {
     313 UIC           0 :         case TM_SelfModified:
     314 EUB             :             /* Tuple was already updated in current command? */
     315 UIC           0 :             elog(ERROR, "tuple already updated by self");
     316                 :             break;
     317 ECB             : 
     318 GIC       40299 :         case TM_Ok:
     319 ECB             :             /* done successfully */
     320 GIC       40299 :             break;
     321 EUB             : 
     322 UBC           0 :         case TM_Updated:
     323 UIC           0 :             elog(ERROR, "tuple concurrently updated");
     324                 :             break;
     325 EUB             : 
     326 UBC           0 :         case TM_Deleted:
     327 UIC           0 :             elog(ERROR, "tuple concurrently deleted");
     328                 :             break;
     329 EUB             : 
     330 UBC           0 :         default:
     331 UIC           0 :             elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
     332                 :             break;
     333 ECB             :     }
     334 GIC       40299 : }
     335                 : 
     336                 : /*
     337                 :  * simple_table_tuple_update - replace a tuple
     338                 :  *
     339                 :  * This routine may be used to update a tuple when concurrent updates of
     340                 :  * the target tuple are not expected (for example, because we have a lock
     341                 :  * on the relation associated with the tuple).  Any failure is reported
     342                 :  * via ereport().
     343                 :  */
     344 ECB             : void
     345 GIC       31905 : simple_table_tuple_update(Relation rel, ItemPointer otid,
     346                 :                           TupleTableSlot *slot,
     347                 :                           Snapshot snapshot,
     348                 :                           TU_UpdateIndexes *update_indexes)
     349                 : {
     350                 :     TM_Result   result;
     351                 :     TM_FailureData tmfd;
     352                 :     LockTupleMode lockmode;
     353 ECB             : 
     354 GIC       31905 :     result = table_tuple_update(rel, otid, slot,
     355                 :                                 GetCurrentCommandId(true),
     356                 :                                 snapshot, InvalidSnapshot,
     357                 :                                 true /* wait for commit */ ,
     358                 :                                 &tmfd, &lockmode, update_indexes);
     359 ECB             : 
     360 GIC       31905 :     switch (result)
     361 EUB             :     {
     362 UIC           0 :         case TM_SelfModified:
     363 EUB             :             /* Tuple was already updated in current command? */
     364 UIC           0 :             elog(ERROR, "tuple already updated by self");
     365                 :             break;
     366 ECB             : 
     367 GIC       31905 :         case TM_Ok:
     368 ECB             :             /* done successfully */
     369 GIC       31905 :             break;
     370 EUB             : 
     371 UBC           0 :         case TM_Updated:
     372 UIC           0 :             elog(ERROR, "tuple concurrently updated");
     373                 :             break;
     374 EUB             : 
     375 UBC           0 :         case TM_Deleted:
     376 UIC           0 :             elog(ERROR, "tuple concurrently deleted");
     377                 :             break;
     378 EUB             : 
     379 UBC           0 :         default:
     380 UIC           0 :             elog(ERROR, "unrecognized table_tuple_update status: %u", result);
     381                 :             break;
     382 ECB             :     }
     383 GIC       31905 : }
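
Taken together, a hedged fragment of the calling pattern the simple_* wrappers
are built for (a caller whose lock on the relation rules out concurrent
updates; otid, tid, snapshot, and update_indexes are assumed to be set up
elsewhere):

    TupleTableSlot *slot = table_slot_create(rel, NULL);

    /* ... populate the slot with column values ... */
    simple_table_tuple_insert(rel, slot);

    /* Replace one known tuple, then remove another, by TID. */
    simple_table_tuple_update(rel, &otid, slot, snapshot, &update_indexes);
    simple_table_tuple_delete(rel, &tid, snapshot);

    ExecDropSingleTupleTableSlot(slot);
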
     384                 : 
     385                 : 
     386                 : /* ----------------------------------------------------------------------------
     387                 :  * Helper functions to implement parallel scans for block oriented AMs.
     388                 :  * ----------------------------------------------------------------------------
     389                 :  */
     390                 : 
     391 ECB             : Size
     392 GIC         512 : table_block_parallelscan_estimate(Relation rel)
     393 ECB             : {
     394 GIC         512 :     return sizeof(ParallelBlockTableScanDescData);
     395                 : }
     396                 : 
     397 ECB             : Size
     398 GIC         512 : table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
     399 ECB             : {
     400 GIC         512 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
     401 ECB             : 
     402 CBC         512 :     bpscan->base.phs_relid = RelationGetRelid(rel);
     403 GIC         512 :     bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
     404 ECB             :     /* compare phs_syncscan initialization to similar logic in initscan */
     405 CBC        1369 :     bpscan->base.phs_syncscan = synchronize_seqscans &&
     406             857 :         !RelationUsesLocalBuffers(rel) &&
     407             345 :         bpscan->phs_nblocks > NBuffers / 4;
     408             512 :     SpinLockInit(&bpscan->phs_mutex);
     409             512 :     bpscan->phs_startblock = InvalidBlockNumber;
     410 GIC         512 :     pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
     411 ECB             : 
     412 GIC         512 :     return sizeof(ParallelBlockTableScanDescData);
     413                 : }
     414                 : 
     415 ECB             : void
     416 GIC         114 : table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
     417 ECB             : {
     418 GIC         114 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
     419 ECB             : 
     420 CBC         114 :     pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
     421 GIC         114 : }
     422                 : 
     423                 : /*
     424                 :  * find and set the scan's startblock
     425                 :  *
     426                 :  * Determine where the parallel seq scan should start.  This function may be
     427                 :  * called many times, once by each parallel worker.  We must be careful only
     428                 :  * to set the startblock once.
     429                 :  */
     430 ECB             : void
     431 GIC        1296 : table_block_parallelscan_startblock_init(Relation rel,
     432                 :                                          ParallelBlockTableScanWorker pbscanwork,
     433                 :                                          ParallelBlockTableScanDesc pbscan)
     434 ECB             : {
     435 GIC        1296 :     BlockNumber sync_startpage = InvalidBlockNumber;
     436                 : 
     437 ECB             :     /* Reset the state we use for controlling allocation size. */
     438 GIC        1296 :     memset(pbscanwork, 0, sizeof(*pbscanwork));
     439                 : 
     440                 :     StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
     441                 :                      "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
     442                 : 
     443                 :     /*
     444                 :      * We determine the chunk size based on the size of the relation. First we
      445                 :      * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks, but we then
      446                 :      * round the chunk size up to the next highest power of 2.  This means
     447                 :      * we split the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS
     448                 :      * and PARALLEL_SEQSCAN_NCHUNKS / 2 chunks.
     449 ECB             :      */
     450 GIC        1296 :     pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
     451                 :                                                        PARALLEL_SEQSCAN_NCHUNKS, 1));
     452                 : 
     453                 :     /*
     454                 :      * Ensure we don't go over the maximum chunk size with larger tables. This
     455                 :      * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
     456                 :      * tables.  Too large a chunk size has been shown to be detrimental to
     457                 :      * synchronous scan performance.
     458 ECB             :      */
     459 GIC        1296 :     pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
     460                 :                                       PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
     461 ECB             : 
     462 GIC        1297 : retry:
     463 ECB             :     /* Grab the spinlock. */
     464 GIC        1297 :     SpinLockAcquire(&pbscan->phs_mutex);
     465                 : 
     466                 :     /*
     467                 :      * If the scan's startblock has not yet been initialized, we must do so
     468                 :      * now.  If this is not a synchronized scan, we just start at block 0, but
     469                 :      * if it is a synchronized scan, we must get the starting position from
     470                 :      * the synchronized scan machinery.  We can't hold the spinlock while
     471                 :      * doing that, though, so release the spinlock, get the information we
     472                 :      * need, and retry.  If nobody else has initialized the scan in the
     473                 :      * meantime, we'll fill in the value we fetched on the second time
     474                 :      * through.
     475 ECB             :      */
     476 GIC        1297 :     if (pbscan->phs_startblock == InvalidBlockNumber)
     477 ECB             :     {
     478 CBC         408 :         if (!pbscan->base.phs_syncscan)
     479             406 :             pbscan->phs_startblock = 0;
     480               2 :         else if (sync_startpage != InvalidBlockNumber)
     481 GIC           1 :             pbscan->phs_startblock = sync_startpage;
     482                 :         else
     483 ECB             :         {
     484 CBC           1 :             SpinLockRelease(&pbscan->phs_mutex);
     485               1 :             sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
     486 GIC           1 :             goto retry;
     487                 :         }
     488 ECB             :     }
     489 CBC        1296 :     SpinLockRelease(&pbscan->phs_mutex);
     490 GIC        1296 : }
     491                 : 
     492                 : /*
     493                 :  * get the next page to scan
     494                 :  *
     495                 :  * Get the next page to scan.  Even if there are no pages left to scan,
     496                 :  * another backend could have grabbed a page to scan and not yet finished
     497                 :  * looking at it, so it doesn't follow that the scan is done when the first
     498                 :  * backend gets an InvalidBlockNumber return.
     499                 :  */
     500 ECB             : BlockNumber
     501 GIC       97139 : table_block_parallelscan_nextpage(Relation rel,
     502                 :                                   ParallelBlockTableScanWorker pbscanwork,
     503                 :                                   ParallelBlockTableScanDesc pbscan)
     504                 : {
     505                 :     BlockNumber page;
     506                 :     uint64      nallocated;
     507                 : 
     508                 :     /*
     509                 :      * The logic below allocates block numbers out to parallel workers in a
     510                 :      * way that each worker will receive a set of consecutive block numbers to
     511                 :      * scan.  Earlier versions of this would allocate the next highest block
     512                 :      * number to the next worker to call this function.  This would generally
     513                 :      * result in workers never receiving consecutive block numbers.  Some
     514                 :      * operating systems would not detect the sequential I/O pattern due to
      515                 :      * operating systems would not detect the sequential I/O pattern due to
      516                 :      * each backend being a different process, which could result in poor
      517                 :      * performance from inefficient or no readahead.  To work around this
     518                 :      * when they come back for another block, we give them the next one in
     519                 :      * that range until the range is complete.  When the worker completes the
     520                 :      * range of blocks we then allocate another range for it and return the
     521                 :      * first block number from that range.
     522                 :      *
     523                 :      * Here we name these ranges of blocks "chunks".  The initial size of
     524                 :      * these chunks is determined in table_block_parallelscan_startblock_init
     525                 :      * based on the size of the relation.  Towards the end of the scan, we
     526                 :      * start making reductions in the size of the chunks in order to attempt
     527                 :      * to divide the remaining work over all the workers as evenly as
     528                 :      * possible.
     529                 :      *
     530                 :      * Here pbscanwork is local worker memory.  phsw_chunk_remaining tracks
     531                 :      * the number of blocks remaining in the chunk.  When that reaches 0 then
     532                 :      * we must allocate a new chunk for the worker.
     533                 :      *
     534                 :      * phs_nallocated tracks how many blocks have been allocated to workers
      535                 :      * already.  When phs_nallocated >= phs_nblocks, all blocks have been
     536                 :      * allocated.
     537                 :      *
     538                 :      * Because we use an atomic fetch-and-add to fetch the current value, the
      539                 :      * phs_nallocated counter will exceed phs_nblocks, because workers will
      540                 :      * still increment the value when they try to allocate the next block but
     541                 :      * all blocks have been allocated already. The counter must be 64 bits
      542                 :      * wide because of that, to avoid wrapping around when phs_nblocks is close
     543                 :      * to 2^32.
     544                 :      *
     545                 :      * The actual block to return is calculated by adding the counter to the
     546                 :      * starting block number, modulo nblocks.
     547                 :      */
     548                 : 
     549                 :     /*
     550                 :      * First check if we have any remaining blocks in a previous chunk for
     551                 :      * this worker.  We must consume all of the blocks from that before we
     552                 :      * allocate a new chunk to the worker.
     553 ECB             :      */
     554 GIC       97139 :     if (pbscanwork->phsw_chunk_remaining > 0)
     555                 :     {
     556                 :         /*
     557                 :          * Give them the next block in the range and update the remaining
     558                 :          * number of blocks.
     559 ECB             :          */
     560 CBC        6511 :         nallocated = ++pbscanwork->phsw_nallocated;
     561 GIC        6511 :         pbscanwork->phsw_chunk_remaining--;
     562                 :     }
     563                 :     else
     564                 :     {
     565                 :         /*
     566                 :          * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
      567                 :          * remaining in the scan, we halve the chunk size.  Since we reduce the
     568                 :          * chunk size here, we'll hit this again after doing
     569                 :          * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size.  After a few
     570                 :          * iterations of this, we'll end up doing the last few blocks with the
     571                 :          * chunk size set to 1.
     572 ECB             :          */
     573 CBC       90628 :         if (pbscanwork->phsw_chunk_size > 1 &&
     574            2215 :             pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
     575            2215 :             (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
     576 GIC           2 :             pbscanwork->phsw_chunk_size >>= 1;
     577 ECB             : 
     578 CBC       90628 :         nallocated = pbscanwork->phsw_nallocated =
     579           90628 :             pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
     580 GIC       90628 :                                     pbscanwork->phsw_chunk_size);
     581                 : 
     582                 :         /*
     583                 :          * Set the remaining number of blocks in this chunk so that subsequent
     584                 :          * calls from this worker continue on with this chunk until it's done.
     585 ECB             :          */
     586 GIC       90628 :         pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
     587                 :     }
     588 ECB             : 
     589 CBC       97139 :     if (nallocated >= pbscan->phs_nblocks)
     590 GIC        1296 :         page = InvalidBlockNumber;  /* all blocks have been allocated */
     591 ECB             :     else
     592 GIC       95843 :         page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
     593                 : 
     594                 :     /*
     595                 :      * Report scan location.  Normally, we report the current page number.
     596                 :      * When we reach the end of the scan, though, we report the starting page,
     597                 :      * not the ending page, just so the starting positions for later scans
      598                 :      * don't slew backwards.  We only report the position at the end of the
     599                 :      * scan once, though: subsequent callers will report nothing.
     600 ECB             :      */
     601 GIC       97139 :     if (pbscan->base.phs_syncscan)
     602 ECB             :     {
     603 CBC        8852 :         if (page != InvalidBlockNumber)
     604            8850 :             ss_report_location(rel, page);
     605               2 :         else if (nallocated == pbscan->phs_nblocks)
     606 GIC           1 :             ss_report_location(rel, pbscan->phs_startblock);
     607                 :     }
     608 ECB             : 
     609 GIC       97139 :     return page;
     610                 : }
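
The chunk accounting above can be modeled outside the backend.  A minimal
single-threaded C sketch (the atomic fetch-add collapses to a plain add, and
a 100,000-block relation is assumed, giving an initial chunk size of
64 = pg_nextpower2_32(Max(100000 / 2048, 1))):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t    nblocks = 100000;       /* relation size in blocks */
        uint64_t    shared_nallocated = 0;  /* models pbscan->phs_nallocated */
        uint64_t    worker_nallocated = 0;  /* models pbscanwork->phsw_nallocated */
        uint32_t    chunk_size = 64;
        uint32_t    chunk_remaining = 0;

        for (;;)
        {
            uint64_t    nallocated;

            if (chunk_remaining > 0)
            {
                /* Continue consuming the worker's current chunk. */
                nallocated = ++worker_nallocated;
                chunk_remaining--;
            }
            else
            {
                /* Ramp down once only ~64 chunks of work remain. */
                if (chunk_size > 1 &&
                    worker_nallocated > nblocks - (chunk_size * 64))
                {
                    chunk_size >>= 1;
                    printf("near block %llu chunk size drops to %u\n",
                           (unsigned long long) worker_nallocated, chunk_size);
                }

                /* Grab a fresh chunk; the atomic fetch-add in the original. */
                nallocated = worker_nallocated = shared_nallocated;
                shared_nallocated += chunk_size;
                chunk_remaining = chunk_size - 1;
            }

            if (nallocated >= nblocks)
                break;          /* all blocks handed out */
        }
        return 0;
    }

With a single worker this reports the chunk size stepping down through 32,
16, 8, 4, 2, and finally 1 as the end of the relation approaches.
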
     611                 : 
     612                 : /* ----------------------------------------------------------------------------
     613                 :  * Helper functions to implement relation sizing for block oriented AMs.
     614                 :  * ----------------------------------------------------------------------------
     615                 :  */
     616                 : 
     617                 : /*
     618                 :  * table_block_relation_size
     619                 :  *
     620                 :  * If a table AM uses the various relation forks as the sole place where data
     621                 :  * is stored, and if it uses them in the expected manner (e.g. the actual data
     622                 :  * is in the main fork rather than some other), it can use this implementation
     623                 :  * of the relation_size callback rather than implementing its own.
     624                 :  */
     625 ECB             : uint64
     626 GIC     1613557 : table_block_relation_size(Relation rel, ForkNumber forkNumber)
     627 ECB             : {
     628 GIC     1613557 :     uint64      nblocks = 0;
     629                 : 
     630 ECB             :     /* InvalidForkNumber indicates returning the size for all forks */
     631 GIC     1613557 :     if (forkNumber == InvalidForkNumber)
     632 EUB             :     {
     633 UBC           0 :         for (int i = 0; i < MAX_FORKNUM; i++)
     634 UIC           0 :             nblocks += smgrnblocks(RelationGetSmgr(rel), i);
     635                 :     }
     636 ECB             :     else
     637 GIC     1613557 :         nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber);
     638 ECB             : 
     639 GIC     1613539 :     return nblocks * BLCKSZ;
     640                 : }
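
A hedged sketch of how a block-oriented AM plugs these helpers into its
TableAmRoutine (heapam_handler.c does essentially this; all other callbacks
are elided here):

    static const TableAmRoutine sample_block_am_methods = {
        .type = T_TableAmRoutine,

        /* ... scan, fetch, and modification callbacks elided ... */

        .parallelscan_estimate = table_block_parallelscan_estimate,
        .parallelscan_initialize = table_block_parallelscan_initialize,
        .parallelscan_reinitialize = table_block_parallelscan_reinitialize,

        .relation_size = table_block_relation_size,
    };
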
     641                 : 
     642                 : /*
     643                 :  * table_block_relation_estimate_size
     644                 :  *
     645                 :  * This function can't be directly used as the implementation of the
     646                 :  * relation_estimate_size callback, because it has a few additional parameters.
     647                 :  * Instead, it is intended to be used as a helper function; the caller can
     648                 :  * pass through the arguments to its relation_estimate_size function plus the
     649                 :  * additional values required here.
     650                 :  *
     651                 :  * overhead_bytes_per_tuple should contain the approximate number of bytes
     652                 :  * of storage required to store a tuple above and beyond what is required for
     653                 :  * the tuple data proper. Typically, this would include things like the
     654                 :  * size of the tuple header and item pointer. This is only used for query
     655                 :  * planning, so a table AM where the value is not constant could choose to
     656                 :  * pass a "best guess".
     657                 :  *
     658                 :  * usable_bytes_per_page should contain the approximate number of bytes per
     659                 :  * page usable for tuple data, excluding the page header and any anticipated
     660                 :  * special space.
     661                 :  */
     662 ECB             : void
     663 GIC      174520 : table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
     664                 :                                    BlockNumber *pages, double *tuples,
     665                 :                                    double *allvisfrac,
     666                 :                                    Size overhead_bytes_per_tuple,
     667                 :                                    Size usable_bytes_per_page)
     668                 : {
     669                 :     BlockNumber curpages;
     670                 :     BlockNumber relpages;
     671                 :     double      reltuples;
     672                 :     BlockNumber relallvisible;
     673                 :     double      density;
     674                 : 
     675 ECB             :     /* it should have storage, so we can call the smgr */
     676 GIC      174520 :     curpages = RelationGetNumberOfBlocks(rel);
     677                 : 
     678 ECB             :     /* coerce values in pg_class to more desirable types */
     679 CBC      174520 :     relpages = (BlockNumber) rel->rd_rel->relpages;
     680          174520 :     reltuples = (double) rel->rd_rel->reltuples;
     681 GIC      174520 :     relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
     682                 : 
     683                 :     /*
     684                 :      * HACK: if the relation has never yet been vacuumed, use a minimum size
     685                 :      * estimate of 10 pages.  The idea here is to avoid assuming a
     686                 :      * newly-created table is really small, even if it currently is, because
     687                 :      * that may not be true once some data gets loaded into it.  Once a vacuum
     688                 :      * or analyze cycle has been done on it, it's more reasonable to believe
     689                 :      * the size is somewhat stable.
     690                 :      *
     691                 :      * (Note that this is only an issue if the plan gets cached and used again
     692                 :      * after the table has been filled.  What we're trying to avoid is using a
     693                 :      * nestloop-type plan on a table that has grown substantially since the
     694                 :      * plan was made.  Normally, autovacuum/autoanalyze will occur once enough
     695                 :      * inserts have happened and cause cached-plan invalidation; but that
     696                 :      * doesn't happen instantaneously, and it won't happen at all for cases
     697                 :      * such as temporary tables.)
     698                 :      *
     699                 :      * We test "never vacuumed" by seeing whether reltuples < 0.
     700                 :      *
     701                 :      * If the table has inheritance children, we don't apply this heuristic.
     702                 :      * Totally empty parent tables are quite common, so we should be willing
     703                 :      * to believe that they are empty.
     704 ECB             :      */
     705 CBC      174520 :     if (curpages < 10 &&
     706           49110 :         reltuples < 0 &&
     707           49110 :         !rel->rd_rel->relhassubclass)
     708 GIC       47955 :         curpages = 10;
     709                 : 
     710 ECB             :     /* report estimated # pages */
     711 GIC      174520 :     *pages = curpages;
     712 ECB             :     /* quick exit if rel is clearly empty */
     713 GIC      174520 :     if (curpages == 0)
     714 ECB             :     {
     715 CBC        5014 :         *tuples = 0;
     716            5014 :         *allvisfrac = 0;
     717 GIC        5014 :         return;
     718                 :     }
     719                 : 
     720 ECB             :     /* estimate number of tuples from previous tuple density */
     721 CBC      169506 :     if (reltuples >= 0 && relpages > 0)
     722 GIC      107615 :         density = reltuples / (double) relpages;
     723                 :     else
     724                 :     {
     725                 :         /*
     726                 :          * When we have no data because the relation was never yet vacuumed,
     727                 :          * estimate tuple width from attribute datatypes.  We assume here that
     728                 :          * the pages are completely full, which is OK for tables but is
     729                 :          * probably an overestimate for indexes.  Fortunately
     730                 :          * get_relation_info() can clamp the overestimate to the parent
     731                 :          * table's size.
     732                 :          *
     733                 :          * Note: this code intentionally disregards alignment considerations,
     734                 :          * because (a) that would be gilding the lily considering how crude
     735                 :          * the estimate is, (b) it creates platform dependencies in the
     736                 :          * default plans which are kind of a headache for regression testing,
     737                 :          * and (c) different table AMs might use different padding schemes.
     738                 :          */
     739                 :         int32       tuple_width;
     740 ECB             : 
     741 CBC       61891 :         tuple_width = get_rel_data_width(rel, attr_widths);
     742 GIC       61891 :         tuple_width += overhead_bytes_per_tuple;
     743 ECB             :         /* note: integer division is intentional here */
     744 GIC       61891 :         density = usable_bytes_per_page / tuple_width;
     745 ECB             :     }
     746 GIC      169506 :     *tuples = rint(density * (double) curpages);
     747                 : 
     748                 :     /*
     749                 :      * We use relallvisible as-is, rather than scaling it up like we do for
     750                 :      * the pages and tuples counts, on the theory that any pages added since
     751                 :      * the last VACUUM are most likely not marked all-visible.  But costsize.c
     752                 :      * wants it converted to a fraction.
     753 ECB             :      */
     754 CBC      169506 :     if (relallvisible == 0 || curpages <= 0)
     755           96598 :         *allvisfrac = 0;
     756           72908 :     else if ((double) relallvisible >= curpages)
     757 GIC       34883 :         *allvisfrac = 1;
     758 ECB             :     else
     759 GIC       38025 :         *allvisfrac = (double) relallvisible / curpages;
     760                 : }
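
A worked version of the never-vacuumed density arithmetic, with heap-like
values assumed for illustration (24-byte page header, MAXALIGN'd 23-byte
tuple header plus a 4-byte line pointer, and an arbitrary 44-byte data width
standing in for a get_rel_data_width() result):

    #include <math.h>
    #include <stdio.h>

    int
    main(void)
    {
        int         usable_bytes_per_page = 8192 - 24;  /* BLCKSZ minus page header */
        int         overhead_bytes_per_tuple = 24 + 4;  /* tuple header + line pointer */
        int         tuple_width = 44;                   /* assumed data width */
        unsigned    curpages = 10;                      /* never-vacuumed minimum */
        int         density;

        tuple_width += overhead_bytes_per_tuple;
        density = usable_bytes_per_page / tuple_width;  /* integer division, as above */
        printf("density = %d tuples/page, tuples = %.0f\n",
               density, rint((double) density * curpages));
        return 0;
    }

This yields 113 tuples per page, hence an estimate of 1130 tuples for a
10-page relation.
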
        

Generated by: LCOV version v1.16-55-g56c0a2a