LCOV - differential code coverage report
Current view: top level - src/backend/access/table - tableam.c (source / functions) Coverage Total Hit UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 89.7 % 185 166 12 7 1 76 4 85 11 76 5
Current Date: 2023-04-08 17:13:01 Functions: 100.0 % 19 19 12 1 6 12 1
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (180,240] days: 100.0 % 4 4 4
Legend: Lines: hit not hit (240..) days: 89.5 % 181 162 12 7 1 76 85 11 76
Function coverage date bins:
(180,240] days: 100.0 % 1 1 1
(240..) days: 60.0 % 30 18 12 6 12

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*----------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * tableam.c
                                  4                 :  *      Table access method routines too big to be inline functions.
                                  5                 :  *
                                  6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                  7                 :  * Portions Copyright (c) 1994, Regents of the University of California
                                  8                 :  *
                                  9                 :  *
                                 10                 :  * IDENTIFICATION
                                 11                 :  *    src/backend/access/table/tableam.c
                                 12                 :  *
                                 13                 :  * NOTES
                                 14                 :  *    Note that most functions in here are documented in tableam.h, rather than
                                 15                 :  *    here. That's because there's a lot of inline functions in tableam.h and
                                 16                 :  *    it'd be harder to understand if one constantly had to switch between files.
                                 17                 :  *
                                 18                 :  *----------------------------------------------------------------------
                                 19                 :  */
                                 20                 : #include "postgres.h"
                                 21                 : 
                                 22                 : #include <math.h>
                                 23                 : 
                                 24                 : #include "access/syncscan.h"
                                 25                 : #include "access/tableam.h"
                                 26                 : #include "access/xact.h"
                                 27                 : #include "optimizer/plancat.h"
                                 28                 : #include "port/pg_bitutils.h"
                                 29                 : #include "storage/bufmgr.h"
                                 30                 : #include "storage/shmem.h"
                                 31                 : #include "storage/smgr.h"
                                 32                 : 
                                 33                 : /*
                                 34                 :  * Constants to control the behavior of block allocation to parallel workers
                                 35                 :  * during a parallel seqscan.  Technically these values do not need to be
                                 36                 :  * powers of 2, but having them as powers of 2 makes the math more efficient
                                 37                 :  * and makes the ramp-down stepping more even.
                                 38                 :  */
                                 39                 : 
                                 40                 : /* The number of I/O chunks we try to break a parallel seqscan down into */
                                 41                 : #define PARALLEL_SEQSCAN_NCHUNKS            2048
                                 42                 : /* Ramp down size of allocations when we've only this number of chunks left */
                                 43                 : #define PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS    64
                                 44                 : /* Cap the size of parallel I/O chunks to this number of blocks */
                                 45                 : #define PARALLEL_SEQSCAN_MAX_CHUNK_SIZE     8192
                                 46                 : 
                                 47                 : /* GUC variables */
                                 48                 : char       *default_table_access_method = DEFAULT_TABLE_ACCESS_METHOD;
                                 49                 : bool        synchronize_seqscans = true;
                                 50                 : 
                                 51                 : 
                                 52                 : /* ----------------------------------------------------------------------------
                                 53                 :  * Slot functions.
                                 54                 :  * ----------------------------------------------------------------------------
                                 55                 :  */
                                 56                 : 
                                 57                 : const TupleTableSlotOps *
 1490 andres                     58 CBC    15845404 : table_slot_callbacks(Relation relation)
                                 59                 : {
                                 60                 :     const TupleTableSlotOps *tts_cb;
                                 61                 : 
                                 62        15845404 :     if (relation->rd_tableam)
                                 63        15841525 :         tts_cb = relation->rd_tableam->slot_callbacks(relation);
                                 64            3879 :     else if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
                                 65                 :     {
                                 66                 :         /*
                                 67                 :          * Historically FDWs expect to store heap tuples in slots. Continue
                                 68                 :          * handing them one, to make it less painful to adapt FDWs to new
                                 69                 :          * versions. The cost of a heap slot over a virtual slot is pretty
                                 70                 :          * small.
                                 71                 :          */
                                 72             208 :         tts_cb = &TTSOpsHeapTuple;
                                 73                 :     }
                                 74                 :     else
                                 75                 :     {
                                 76                 :         /*
                                 77                 :          * These need to be supported, as some parts of the code (like COPY)
                                 78                 :          * need to create slots for such relations too. It seems better to
                                 79                 :          * centralize the knowledge that a heap slot is the right thing in
                                 80                 :          * that case here.
                                 81                 :          */
                                 82            3671 :         Assert(relation->rd_rel->relkind == RELKIND_VIEW ||
                                 83                 :                relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE);
                                 84            3671 :         tts_cb = &TTSOpsVirtual;
                                 85                 :     }
                                 86                 : 
                                 87        15845404 :     return tts_cb;
                                 88                 : }
                                 89                 : 
                                 90                 : TupleTableSlot *
                                 91        15666100 : table_slot_create(Relation relation, List **reglist)
                                 92                 : {
                                 93                 :     const TupleTableSlotOps *tts_cb;
                                 94                 :     TupleTableSlot *slot;
                                 95                 : 
                                 96        15666100 :     tts_cb = table_slot_callbacks(relation);
                                 97        15666100 :     slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), tts_cb);
                                 98                 : 
                                 99        15666100 :     if (reglist)
                                100          142014 :         *reglist = lappend(*reglist, slot);
                                101                 : 
                                102        15666100 :     return slot;
                                103                 : }
                                104                 : 
                                105                 : 
                                106                 : /* ----------------------------------------------------------------------------
                                107                 :  * Table scan functions.
                                108                 :  * ----------------------------------------------------------------------------
                                109                 :  */
                                110                 : 
                                111                 : TableScanDesc
                                112          124520 : table_beginscan_catalog(Relation relation, int nkeys, struct ScanKeyData *key)
                                113                 : {
 1421                           114          124520 :     uint32      flags = SO_TYPE_SEQSCAN |
                                115                 :     SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE | SO_TEMP_SNAPSHOT;
 1490                           116          124520 :     Oid         relid = RelationGetRelid(relation);
                                117          124520 :     Snapshot    snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
                                118                 : 
 1471                           119          124520 :     return relation->rd_tableam->scan_begin(relation, snapshot, nkeys, key,
                                120                 :                                             NULL, flags);
                                121                 : }
                                122                 : 
                                123                 : void
 1490                           124             138 : table_scan_update_snapshot(TableScanDesc scan, Snapshot snapshot)
                                125                 : {
                                126             138 :     Assert(IsMVCCSnapshot(snapshot));
                                127                 : 
                                128             138 :     RegisterSnapshot(snapshot);
                                129             138 :     scan->rs_snapshot = snapshot;
 1421                           130             138 :     scan->rs_flags |= SO_TEMP_SNAPSHOT;
 1490                           131             138 : }
                                132                 : 
                                133                 : 
                                134                 : /* ----------------------------------------------------------------------------
                                135                 :  * Parallel table scan related functions.
                                136                 :  * ----------------------------------------------------------------------------
                                137                 :  */
                                138                 : 
                                139                 : Size
                                140             512 : table_parallelscan_estimate(Relation rel, Snapshot snapshot)
                                141                 : {
                                142             512 :     Size        sz = 0;
                                143                 : 
                                144             512 :     if (IsMVCCSnapshot(snapshot))
                                145             441 :         sz = add_size(sz, EstimateSnapshotSpace(snapshot));
                                146                 :     else
                                147              71 :         Assert(snapshot == SnapshotAny);
                                148                 : 
                                149             512 :     sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel));
                                150                 : 
                                151             512 :     return sz;
                                152                 : }
                                153                 : 
                                154                 : void
                                155             512 : table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan,
                                156                 :                               Snapshot snapshot)
                                157                 : {
                                158             512 :     Size        snapshot_off = rel->rd_tableam->parallelscan_initialize(rel, pscan);
                                159                 : 
                                160             512 :     pscan->phs_snapshot_off = snapshot_off;
                                161                 : 
                                162             512 :     if (IsMVCCSnapshot(snapshot))
                                163                 :     {
                                164             441 :         SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off);
                                165             441 :         pscan->phs_snapshot_any = false;
                                166                 :     }
                                167                 :     else
                                168                 :     {
                                169              71 :         Assert(snapshot == SnapshotAny);
                                170              71 :         pscan->phs_snapshot_any = true;
                                171                 :     }
                                172             512 : }
                                173                 : 
                                174                 : TableScanDesc
  202 pg                        175 GNC        1895 : table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)
                                176                 : {
                                177                 :     Snapshot    snapshot;
 1421 andres                    178 CBC        1895 :     uint32      flags = SO_TYPE_SEQSCAN |
                                179                 :     SO_ALLOW_STRAT | SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE;
                                180                 : 
  202 pg                        181 GNC        1895 :     Assert(RelationGetRelid(relation) == pscan->phs_relid);
                                182                 : 
                                183            1895 :     if (!pscan->phs_snapshot_any)
                                184                 :     {
                                185                 :         /* Snapshot was serialized -- restore it */
                                186            1753 :         snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off);
 1490 andres                    187 CBC        1753 :         RegisterSnapshot(snapshot);
 1421 andres                    188 GIC        1753 :         flags |= SO_TEMP_SNAPSHOT;
                                189                 :     }
                                190                 :     else
                                191                 :     {
 1490 andres                    192 ECB             :         /* SnapshotAny passed by caller (not serialized) */
 1490 andres                    193 GIC         142 :         snapshot = SnapshotAny;
                                194                 :     }
 1490 andres                    195 ECB             : 
 1471 andres                    196 GIC        1895 :     return relation->rd_tableam->scan_begin(relation, snapshot, 0, NULL,
                                197                 :                                             pscan, flags);
                                198                 : }
                                199                 : 
                                200                 : 
                                201                 : /* ----------------------------------------------------------------------------
                                202                 :  * Index scan related functions.
                                203                 :  * ----------------------------------------------------------------------------
                                204                 :  */
                                205                 : 
                                206                 : /*
                                207                 :  * To perform that check simply start an index scan, create the necessary
                                208                 :  * slot, do the heap lookup, and shut everything down again. This could be
                                209                 :  * optimized, but is unlikely to matter from a performance POV. If there
                                210                 :  * frequently are live index pointers also matching a unique index key, the
                                211                 :  * CPU overhead of this routine is unlikely to matter.
                                212                 :  *
                                213                 :  * Note that *tid may be modified when we return true if the AM supports
                                214                 :  * storing multiple row versions reachable via a single index entry (like
                                215                 :  * heap's HOT).
                                216                 :  */
 1476 andres                    217 ECB             : bool
 1476 andres                    218 GIC     5817243 : table_index_fetch_tuple_check(Relation rel,
                                219                 :                               ItemPointer tid,
                                220                 :                               Snapshot snapshot,
                                221                 :                               bool *all_dead)
                                222                 : {
                                223                 :     IndexFetchTableData *scan;
 1476 andres                    224 ECB             :     TupleTableSlot *slot;
 1476 andres                    225 GIC     5817243 :     bool        call_again = false;
                                226                 :     bool        found;
 1476 andres                    227 ECB             : 
 1476 andres                    228 CBC     5817243 :     slot = table_slot_create(rel, NULL);
                                229         5817243 :     scan = table_index_fetch_begin(rel);
 1476 andres                    230 GIC     5817243 :     found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
 1476 andres                    231 ECB             :                                     all_dead);
 1476 andres                    232 CBC     5817243 :     table_index_fetch_end(scan);
 1476 andres                    233 GIC     5817243 :     ExecDropSingleTupleTableSlot(slot);
 1476 andres                    234 ECB             : 
 1476 andres                    235 GIC     5817243 :     return found;
                                236                 : }
                                237                 : 
                                238                 : 
                                239                 : /* ------------------------------------------------------------------------
                                240                 :  * Functions for non-modifying operations on individual tuples
                                241                 :  * ------------------------------------------------------------------------
                                242                 :  */
                                243                 : 
 1423 andres                    244 ECB             : void
 1417 andres                    245 GIC         153 : table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid)
 1423 andres                    246 ECB             : {
 1418 tgl                       247 CBC         153 :     Relation    rel = scan->rs_rd;
 1423 andres                    248 GIC         153 :     const TableAmRoutine *tableam = rel->rd_tableam;
                                249                 : 
                                250                 :     /*
                                251                 :      * We don't expect direct calls to table_tuple_get_latest_tid with valid
                                252                 :      * CheckXidAlive for catalog or regular tables.  See detailed comments in
                                253                 :      * xact.c where these variables are declared.
  974 akapila                   254 ECB             :      */
  974 akapila                   255 GBC         153 :     if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
  974 akapila                   256 UIC           0 :         elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding");
                                257                 : 
                                258                 :     /*
                                259                 :      * Since this can be called with user-supplied TID, don't trust the input
                                260                 :      * too much.
 1423 andres                    261 ECB             :      */
 1423 andres                    262 CBC         153 :     if (!tableam->tuple_tid_valid(scan, tid))
 1423 andres                    263 GIC           6 :         ereport(ERROR,
                                264                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                265                 :                  errmsg("tid (%u, %u) is not valid for relation \"%s\"",
                                266                 :                         ItemPointerGetBlockNumberNoCheck(tid),
                                267                 :                         ItemPointerGetOffsetNumberNoCheck(tid),
                                268                 :                         RelationGetRelationName(rel))));
 1423 andres                    269 ECB             : 
 1422 tgl                       270 CBC         147 :     tableam->tuple_get_latest_tid(scan, tid);
 1423 andres                    271 GIC         147 : }
                                272                 : 
                                273                 : 
                                274                 : /* ----------------------------------------------------------------------------
                                275                 :  * Functions to make modifications a bit simpler.
                                276                 :  * ----------------------------------------------------------------------------
                                277                 :  */
                                278                 : 
                                279                 : /*
                                280                 :  * simple_table_tuple_insert - insert a tuple
                                281                 :  *
                                282                 :  * Currently, this routine differs from table_tuple_insert only in supplying a
                                283                 :  * default command ID and not allowing access to the speedup options.
                                284                 :  */
 1478 andres                    285 ECB             : void
 1417 andres                    286 GIC       75587 : simple_table_tuple_insert(Relation rel, TupleTableSlot *slot)
 1478 andres                    287 ECB             : {
 1417 andres                    288 CBC       75587 :     table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
 1478 andres                    289 GIC       75587 : }
                                290                 : 
                                291                 : /*
                                292                 :  * simple_table_tuple_delete - delete a tuple
                                293                 :  *
                                294                 :  * This routine may be used to delete a tuple when concurrent updates of
                                295                 :  * the target tuple are not expected (for example, because we have a lock
                                296                 :  * on the relation associated with the tuple).  Any failure is reported
                                297                 :  * via ereport().
                                298                 :  */
 1478 andres                    299 ECB             : void
 1417 andres                    300 GIC       40299 : simple_table_tuple_delete(Relation rel, ItemPointer tid, Snapshot snapshot)
                                301                 : {
                                302                 :     TM_Result   result;
                                303                 :     TM_FailureData tmfd;
 1478 andres                    304 ECB             : 
 1417 andres                    305 GIC       40299 :     result = table_tuple_delete(rel, tid,
                                306                 :                                 GetCurrentCommandId(true),
                                307                 :                                 snapshot, InvalidSnapshot,
                                308                 :                                 true /* wait for commit */ ,
                                309                 :                                 &tmfd, false /* changingPart */ );
 1478 andres                    310 ECB             : 
 1478 andres                    311 GIC       40299 :     switch (result)
 1478 andres                    312 EUB             :     {
 1478 andres                    313 UIC           0 :         case TM_SelfModified:
 1478 andres                    314 EUB             :             /* Tuple was already updated in current command? */
 1478 andres                    315 UIC           0 :             elog(ERROR, "tuple already updated by self");
                                316                 :             break;
 1478 andres                    317 ECB             : 
 1478 andres                    318 GIC       40299 :         case TM_Ok:
 1478 andres                    319 ECB             :             /* done successfully */
 1478 andres                    320 GIC       40299 :             break;
 1478 andres                    321 EUB             : 
 1478 andres                    322 UBC           0 :         case TM_Updated:
 1478 andres                    323 UIC           0 :             elog(ERROR, "tuple concurrently updated");
                                324                 :             break;
 1478 andres                    325 EUB             : 
 1478 andres                    326 UBC           0 :         case TM_Deleted:
 1478 andres                    327 UIC           0 :             elog(ERROR, "tuple concurrently deleted");
                                328                 :             break;
 1478 andres                    329 EUB             : 
 1478 andres                    330 UBC           0 :         default:
 1417 andres                    331 UIC           0 :             elog(ERROR, "unrecognized table_tuple_delete status: %u", result);
                                332                 :             break;
 1478 andres                    333 ECB             :     }
 1478 andres                    334 GIC       40299 : }
                                335                 : 
                                336                 : /*
                                337                 :  * simple_table_tuple_update - replace a tuple
                                338                 :  *
                                339                 :  * This routine may be used to update a tuple when concurrent updates of
                                340                 :  * the target tuple are not expected (for example, because we have a lock
                                341                 :  * on the relation associated with the tuple).  Any failure is reported
                                342                 :  * via ereport().
                                343                 :  */
 1478 andres                    344 ECB             : void
 1417 andres                    345 GIC       31905 : simple_table_tuple_update(Relation rel, ItemPointer otid,
                                346                 :                           TupleTableSlot *slot,
                                347                 :                           Snapshot snapshot,
                                348                 :                           TU_UpdateIndexes *update_indexes)
                                349                 : {
                                350                 :     TM_Result   result;
                                351                 :     TM_FailureData tmfd;
                                352                 :     LockTupleMode lockmode;
 1478 andres                    353 ECB             : 
 1417 andres                    354 GIC       31905 :     result = table_tuple_update(rel, otid, slot,
                                355                 :                                 GetCurrentCommandId(true),
                                356                 :                                 snapshot, InvalidSnapshot,
                                357                 :                                 true /* wait for commit */ ,
                                358                 :                                 &tmfd, &lockmode, update_indexes);
 1478 andres                    359 ECB             : 
 1478 andres                    360 GIC       31905 :     switch (result)
 1478 andres                    361 EUB             :     {
 1478 andres                    362 UIC           0 :         case TM_SelfModified:
 1478 andres                    363 EUB             :             /* Tuple was already updated in current command? */
 1478 andres                    364 UIC           0 :             elog(ERROR, "tuple already updated by self");
                                365                 :             break;
 1478 andres                    366 ECB             : 
 1478 andres                    367 GIC       31905 :         case TM_Ok:
 1478 andres                    368 ECB             :             /* done successfully */
 1478 andres                    369 GIC       31905 :             break;
 1478 andres                    370 EUB             : 
 1478 andres                    371 UBC           0 :         case TM_Updated:
 1478 andres                    372 UIC           0 :             elog(ERROR, "tuple concurrently updated");
                                373                 :             break;
 1478 andres                    374 EUB             : 
 1478 andres                    375 UBC           0 :         case TM_Deleted:
 1478 andres                    376 UIC           0 :             elog(ERROR, "tuple concurrently deleted");
                                377                 :             break;
 1478 andres                    378 EUB             : 
 1478 andres                    379 UBC           0 :         default:
 1417 andres                    380 UIC           0 :             elog(ERROR, "unrecognized table_tuple_update status: %u", result);
                                381                 :             break;
 1478 andres                    382 ECB             :     }
 1478 andres                    383 GIC       31905 : }
                                384                 : 
                                385                 : 
                                386                 : /* ----------------------------------------------------------------------------
                                387                 :  * Helper functions to implement parallel scans for block oriented AMs.
                                388                 :  * ----------------------------------------------------------------------------
                                389                 :  */
                                390                 : 
 1490 andres                    391 ECB             : Size
 1490 andres                    392 GIC         512 : table_block_parallelscan_estimate(Relation rel)
 1490 andres                    393 ECB             : {
 1490 andres                    394 GIC         512 :     return sizeof(ParallelBlockTableScanDescData);
                                395                 : }
                                396                 : 
 1490 andres                    397 ECB             : Size
 1490 andres                    398 GIC         512 : table_block_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
 1490 andres                    399 ECB             : {
 1490 andres                    400 GIC         512 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
 1490 andres                    401 ECB             : 
 1490 andres                    402 CBC         512 :     bpscan->base.phs_relid = RelationGetRelid(rel);
 1490 andres                    403 GIC         512 :     bpscan->phs_nblocks = RelationGetNumberOfBlocks(rel);
 1490 andres                    404 ECB             :     /* compare phs_syncscan initialization to similar logic in initscan */
 1490 andres                    405 CBC        1369 :     bpscan->base.phs_syncscan = synchronize_seqscans &&
                                406             857 :         !RelationUsesLocalBuffers(rel) &&
                                407             345 :         bpscan->phs_nblocks > NBuffers / 4;
                                408             512 :     SpinLockInit(&bpscan->phs_mutex);
                                409             512 :     bpscan->phs_startblock = InvalidBlockNumber;
 1490 andres                    410 GIC         512 :     pg_atomic_init_u64(&bpscan->phs_nallocated, 0);
 1490 andres                    411 ECB             : 
 1490 andres                    412 GIC         512 :     return sizeof(ParallelBlockTableScanDescData);
                                413                 : }
                                414                 : 
 1490 andres                    415 ECB             : void
 1490 andres                    416 GIC         114 : table_block_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
 1490 andres                    417 ECB             : {
 1490 andres                    418 GIC         114 :     ParallelBlockTableScanDesc bpscan = (ParallelBlockTableScanDesc) pscan;
 1490 andres                    419 ECB             : 
 1490 andres                    420 CBC         114 :     pg_atomic_write_u64(&bpscan->phs_nallocated, 0);
 1490 andres                    421 GIC         114 : }
                                422                 : 
                                423                 : /*
                                424                 :  * find and set the scan's startblock
                                425                 :  *
                                426                 :  * Determine where the parallel seq scan should start.  This function may be
                                427                 :  * called many times, once by each parallel worker.  We must be careful only
                                428                 :  * to set the startblock once.
                                429                 :  */
 1490 andres                    430 ECB             : void
  987 drowley                   431 GIC        1296 : table_block_parallelscan_startblock_init(Relation rel,
                                432                 :                                          ParallelBlockTableScanWorker pbscanwork,
                                433                 :                                          ParallelBlockTableScanDesc pbscan)
 1490 andres                    434 ECB             : {
 1490 andres                    435 GIC        1296 :     BlockNumber sync_startpage = InvalidBlockNumber;
                                436                 : 
  987 drowley                   437 ECB             :     /* Reset the state we use for controlling allocation size. */
  987 drowley                   438 GIC        1296 :     memset(pbscanwork, 0, sizeof(*pbscanwork));
                                439                 : 
                                440                 :     StaticAssertStmt(MaxBlockNumber <= 0xFFFFFFFE,
                                441                 :                      "pg_nextpower2_32 may be too small for non-standard BlockNumber width");
                                442                 : 
                                443                 :     /*
                                444                 :      * We determine the chunk size based on the size of the relation. First we
                                445                 :      * split the relation into PARALLEL_SEQSCAN_NCHUNKS chunks, then round the
                                446                 :      * resulting chunk size up to the next power of 2.  This means we split
                                447                 :      * the relation into somewhere between PARALLEL_SEQSCAN_NCHUNKS / 2 and
                                448                 :      * PARALLEL_SEQSCAN_NCHUNKS chunks.
  987 drowley                   449 ECB             :      */
  987 drowley                   450 GIC        1296 :     pbscanwork->phsw_chunk_size = pg_nextpower2_32(Max(pbscan->phs_nblocks /
                                451                 :                                                        PARALLEL_SEQSCAN_NCHUNKS, 1));
                                452                 : 
                                453                 :     /*
                                454                 :      * Ensure we don't go over the maximum chunk size with larger tables. This
                                455                 :      * means we may get much more than PARALLEL_SEQSCAN_NCHUNKS for larger
                                456                 :      * tables.  Too large a chunk size has been shown to be detrimental to
                                457                 :      * synchronous scan performance.
  987 drowley                   458 ECB             :      */
  987 drowley                   459 GIC        1296 :     pbscanwork->phsw_chunk_size = Min(pbscanwork->phsw_chunk_size,
                                460                 :                                       PARALLEL_SEQSCAN_MAX_CHUNK_SIZE);
  987 drowley                   461 ECB             : 
 1490 andres                    462 GIC        1297 : retry:
 1490 andres                    463 ECB             :     /* Grab the spinlock. */
 1490 andres                    464 GIC        1297 :     SpinLockAcquire(&pbscan->phs_mutex);
                                465                 : 
                                466                 :     /*
                                467                 :      * If the scan's startblock has not yet been initialized, we must do so
                                468                 :      * now.  If this is not a synchronized scan, we just start at block 0, but
                                469                 :      * if it is a synchronized scan, we must get the starting position from
                                470                 :      * the synchronized scan machinery.  We can't hold the spinlock while
                                471                 :      * doing that, though, so release the spinlock, get the information we
                                472                 :      * need, and retry.  If nobody else has initialized the scan in the
                                473                 :      * meantime, we'll fill in the value we fetched on the second time
                                474                 :      * through.
 1490 andres                    475 ECB             :      */
 1490 andres                    476 GIC        1297 :     if (pbscan->phs_startblock == InvalidBlockNumber)
 1490 andres                    477 ECB             :     {
 1490 andres                    478 CBC         408 :         if (!pbscan->base.phs_syncscan)
                                479             406 :             pbscan->phs_startblock = 0;
                                480               2 :         else if (sync_startpage != InvalidBlockNumber)
 1490 andres                    481 GIC           1 :             pbscan->phs_startblock = sync_startpage;
                                482                 :         else
 1490 andres                    483 ECB             :         {
 1490 andres                    484 CBC           1 :             SpinLockRelease(&pbscan->phs_mutex);
                                485               1 :             sync_startpage = ss_get_location(rel, pbscan->phs_nblocks);
 1490 andres                    486 GIC           1 :             goto retry;
                                487                 :         }
 1490 andres                    488 ECB             :     }
 1490 andres                    489 CBC        1296 :     SpinLockRelease(&pbscan->phs_mutex);
 1490 andres                    490 GIC        1296 : }
                                491                 : 
                                492                 : /*
                                493                 :  * get the next page to scan
                                494                 :  *
                                495                 :  * Get the next page to scan.  Even if there are no pages left to scan,
                                496                 :  * another backend could have grabbed a page to scan and not yet finished
                                497                 :  * looking at it, so it doesn't follow that the scan is done when the first
                                498                 :  * backend gets an InvalidBlockNumber return.
                                499                 :  */
 1490 andres                    500 ECB             : BlockNumber
  987 drowley                   501 GIC       97139 : table_block_parallelscan_nextpage(Relation rel,
                                502                 :                                   ParallelBlockTableScanWorker pbscanwork,
                                503                 :                                   ParallelBlockTableScanDesc pbscan)
                                504                 : {
                                505                 :     BlockNumber page;
                                506                 :     uint64      nallocated;
                                507                 : 
                                508                 :     /*
                                509                 :      * The logic below allocates block numbers out to parallel workers in a
                                510                 :      * way that each worker will receive a set of consecutive block numbers to
                                511                 :      * scan.  Earlier versions of this would allocate the next highest block
                                512                 :      * number to the next worker to call this function.  This would generally
                                513                 :      * result in workers never receiving consecutive block numbers.  Some
                                514                 :      * operating systems would not detect the sequential I/O pattern due to
                                515                 :      * each backend being a different process which could result in poor
                                516                 :      * performance due to inefficient or no readahead.  To work around this
                                517                 :      * issue, we now allocate a range of block numbers for each worker and
                                518                 :      * when they come back for another block, we give them the next one in
                                519                 :      * that range until the range is complete.  When the worker completes the
                                520                 :      * range of blocks we then allocate another range for it and return the
                                521                 :      * first block number from that range.
                                522                 :      *
                                523                 :      * Here we name these ranges of blocks "chunks".  The initial size of
                                524                 :      * these chunks is determined in table_block_parallelscan_startblock_init
                                525                 :      * based on the size of the relation.  Towards the end of the scan, we
                                526                 :      * start making reductions in the size of the chunks in order to attempt
                                527                 :      * to divide the remaining work over all the workers as evenly as
                                528                 :      * possible.
                                529                 :      *
                                530                 :      * Here pbscanwork is local worker memory.  phsw_chunk_remaining tracks
                                531                 :      * the number of blocks remaining in the chunk.  When that reaches 0 then
                                532                 :      * we must allocate a new chunk for the worker.
                                533                 :      *
                                534                 :      * phs_nallocated tracks how many blocks have been allocated to workers
                                535                 :      * already.  When phs_nallocated >= phs_nblocks, all blocks have been
                                536                 :      * allocated.
                                537                 :      *
                                538                 :      * Because we use an atomic fetch-and-add to fetch the current value, the
                                539                 :      * phs_nallocated counter will exceed phs_nblocks, because workers will
                                540                 :      * still increment the value, when they try to allocate the next block but
                                541                 :      * all blocks have been allocated already. The counter must be 64 bits
                                542                 :      * wide because of that, to avoid wrapping around when phs_nblocks is close
                                543                 :      * to 2^32.
                                544                 :      *
                                545                 :      * The actual block to return is calculated by adding the counter to the
                                546                 :      * starting block number, modulo nblocks.
                                547                 :      */
                                548                 : 
                                549                 :     /*
                                550                 :      * First check if we have any remaining blocks in a previous chunk for
                                551                 :      * this worker.  We must consume all of the blocks from that before we
                                552                 :      * allocate a new chunk to the worker.
  987 drowley                   553 ECB             :      */
  987 drowley                   554 GIC       97139 :     if (pbscanwork->phsw_chunk_remaining > 0)
                                555                 :     {
                                556                 :         /*
                                557                 :          * Give them the next block in the range and update the remaining
                                558                 :          * number of blocks.
  987 drowley                   559 ECB             :          */
  987 drowley                   560 CBC        6511 :         nallocated = ++pbscanwork->phsw_nallocated;
  987 drowley                   561 GIC        6511 :         pbscanwork->phsw_chunk_remaining--;
                                562                 :     }
                                563                 :     else
                                564                 :     {
                                565                 :         /*
                                566                 :          * When we've only got PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS chunks
                                567                 :          * remaining in the scan, we halve the chunk size.  Since we reduce the
                                568                 :          * chunk size here, we'll hit this again after doing
                                569                 :          * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS at the new size.  After a few
                                570                 :          * iterations of this, we'll end up doing the last few blocks with the
                                571                 :          * chunk size set to 1.
  987 drowley                   572 ECB             :          */
  987 drowley                   573 CBC       90628 :         if (pbscanwork->phsw_chunk_size > 1 &&
                                574            2215 :             pbscanwork->phsw_nallocated > pbscan->phs_nblocks -
                                575            2215 :             (pbscanwork->phsw_chunk_size * PARALLEL_SEQSCAN_RAMPDOWN_CHUNKS))
  987 drowley                   576 GIC           2 :             pbscanwork->phsw_chunk_size >>= 1;
  987 drowley                   577 ECB             : 
  987 drowley                   578 CBC       90628 :         nallocated = pbscanwork->phsw_nallocated =
                                579           90628 :             pg_atomic_fetch_add_u64(&pbscan->phs_nallocated,
  987 drowley                   580 GIC       90628 :                                     pbscanwork->phsw_chunk_size);
                                581                 : 
                                582                 :         /*
                                583                 :          * Set the remaining number of blocks in this chunk so that subsequent
                                584                 :          * calls from this worker continue on with this chunk until it's done.
  987 drowley                   585 ECB             :          */
  987 drowley                   586 GIC       90628 :         pbscanwork->phsw_chunk_remaining = pbscanwork->phsw_chunk_size - 1;
                                587                 :     }
  987 drowley                   588 ECB             : 
 1490 andres                    589 CBC       97139 :     if (nallocated >= pbscan->phs_nblocks)
 1490 andres                    590 GIC        1296 :         page = InvalidBlockNumber;  /* all blocks have been allocated */
 1490 andres                    591 ECB             :     else
 1490 andres                    592 GIC       95843 :         page = (nallocated + pbscan->phs_startblock) % pbscan->phs_nblocks;
                                593                 : 
                                594                 :     /*
                                595                 :      * Report scan location.  Normally, we report the current page number.
                                596                 :      * When we reach the end of the scan, though, we report the starting page,
                                597                 :      * not the ending page, just so the starting positions for later scans
                                598                 :      * don't slew backwards.  We only report the position at the end of the
                                599                 :      * scan once, though: subsequent callers will report nothing.
 1490 andres                    600 ECB             :      */
 1490 andres                    601 GIC       97139 :     if (pbscan->base.phs_syncscan)
 1490 andres                    602 ECB             :     {
 1490 andres                    603 CBC        8852 :         if (page != InvalidBlockNumber)
                                604            8850 :             ss_report_location(rel, page);
                                605               2 :         else if (nallocated == pbscan->phs_nblocks)
 1490 andres                    606 GIC           1 :             ss_report_location(rel, pbscan->phs_startblock);
                                607                 :     }
 1490 andres                    608 ECB             : 
 1490 andres                    609 GIC       97139 :     return page;
                                610                 : }
                                611                 : 
                                612                 : /* ----------------------------------------------------------------------------
                                613                 :  * Helper functions to implement relation sizing for block oriented AMs.
                                614                 :  * ----------------------------------------------------------------------------
                                615                 :  */
                                616                 : 
                                617                 : /*
                                618                 :  * table_block_relation_size
                                619                 :  *
                                620                 :  * If a table AM uses the various relation forks as the sole place where data
                                621                 :  * is stored, and if it uses them in the expected manner (e.g. the actual data
                                622                 :  * is in the main fork rather than some other), it can use this implementation
                                623                 :  * of the relation_size callback rather than implementing its own.
                                624                 :  */
 1371 rhaas                     625 ECB             : uint64
 1371 rhaas                     626 GIC     1613557 : table_block_relation_size(Relation rel, ForkNumber forkNumber)
 1371 rhaas                     627 ECB             : {
 1371 rhaas                     628 GIC     1613557 :     uint64      nblocks = 0;
                                629                 : 
 1371 rhaas                     630 ECB             :     /* InvalidForkNumber indicates returning the size for all forks */
 1371 rhaas                     631 GIC     1613557 :     if (forkNumber == InvalidForkNumber)
 1371 rhaas                     632 EUB             :     {
 1371 rhaas                     633 UBC           0 :         for (int i = 0; i < MAX_FORKNUM; i++)
  636 tgl                       634 UIC           0 :             nblocks += smgrnblocks(RelationGetSmgr(rel), i);
                                635                 :     }
 1371 rhaas                     636 ECB             :     else
  636 tgl                       637 GIC     1613557 :         nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber);
 1371 rhaas                     638 ECB             : 
 1371 rhaas                     639 GIC     1613539 :     return nblocks * BLCKSZ;
                                640                 : }
                                641                 : 
                                642                 : /*
                                643                 :  * table_block_relation_estimate_size
                                644                 :  *
                                645                 :  * This function can't be directly used as the implementation of the
                                646                 :  * relation_estimate_size callback, because it has a few additional parameters.
                                647                 :  * Instead, it is intended to be used as a helper function; the caller can
                                648                 :  * pass through the arguments to its relation_estimate_size function plus the
                                649                 :  * additional values required here.
                                650                 :  *
                                651                 :  * overhead_bytes_per_tuple should contain the approximate number of bytes
                                652                 :  * of storage required to store a tuple above and beyond what is required for
                                653                 :  * the tuple data proper. Typically, this would include things like the
                                654                 :  * size of the tuple header and item pointer. This is only used for query
                                655                 :  * planning, so a table AM where the value is not constant could choose to
                                656                 :  * pass a "best guess".
                                657                 :  *
                                658                 :  * usable_bytes_per_page should contain the approximate number of bytes per
                                659                 :  * page usable for tuple data, excluding the page header and any anticipated
                                660                 :  * special space.
                                661                 :  */
 1371 rhaas                     662 ECB             : void
 1371 rhaas                     663 GIC      174520 : table_block_relation_estimate_size(Relation rel, int32 *attr_widths,
                                664                 :                                    BlockNumber *pages, double *tuples,
                                665                 :                                    double *allvisfrac,
                                666                 :                                    Size overhead_bytes_per_tuple,
                                667                 :                                    Size usable_bytes_per_page)
                                668                 : {
                                669                 :     BlockNumber curpages;
                                670                 :     BlockNumber relpages;
                                671                 :     double      reltuples;
                                672                 :     BlockNumber relallvisible;
                                673                 :     double      density;
                                674                 : 
 1371 rhaas                     675 ECB             :     /* it should have storage, so we can call the smgr */
 1371 rhaas                     676 GIC      174520 :     curpages = RelationGetNumberOfBlocks(rel);
                                677                 : 
 1371 rhaas                     678 ECB             :     /* coerce values in pg_class to more desirable types */
 1371 rhaas                     679 CBC      174520 :     relpages = (BlockNumber) rel->rd_rel->relpages;
                                680          174520 :     reltuples = (double) rel->rd_rel->reltuples;
 1371 rhaas                     681 GIC      174520 :     relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
                                682                 : 
                                683                 :     /*
                                684                 :      * HACK: if the relation has never yet been vacuumed, use a minimum size
                                685                 :      * estimate of 10 pages.  The idea here is to avoid assuming a
                                686                 :      * newly-created table is really small, even if it currently is, because
                                687                 :      * that may not be true once some data gets loaded into it.  Once a vacuum
                                688                 :      * or analyze cycle has been done on it, it's more reasonable to believe
                                689                 :      * the size is somewhat stable.
                                690                 :      *
                                691                 :      * (Note that this is only an issue if the plan gets cached and used again
                                692                 :      * after the table has been filled.  What we're trying to avoid is using a
                                693                 :      * nestloop-type plan on a table that has grown substantially since the
                                694                 :      * plan was made.  Normally, autovacuum/autoanalyze will occur once enough
                                695                 :      * inserts have happened and cause cached-plan invalidation; but that
                                696                 :      * doesn't happen instantaneously, and it won't happen at all for cases
                                697                 :      * such as temporary tables.)
                                698                 :      *
                                699                 :      * We test "never vacuumed" by seeing whether reltuples < 0.
                                700                 :      *
                                701                 :      * If the table has inheritance children, we don't apply this heuristic.
                                702                 :      * Totally empty parent tables are quite common, so we should be willing
                                703                 :      * to believe that they are empty.
 1371 rhaas                     704 ECB             :      */
 1371 rhaas                     705 CBC      174520 :     if (curpages < 10 &&
  952 tgl                       706           49110 :         reltuples < 0 &&
 1371 rhaas                     707           49110 :         !rel->rd_rel->relhassubclass)
 1371 rhaas                     708 GIC       47955 :         curpages = 10;
                                709                 : 
 1371 rhaas                     710 ECB             :     /* report estimated # pages */
 1371 rhaas                     711 GIC      174520 :     *pages = curpages;
 1371 rhaas                     712 ECB             :     /* quick exit if rel is clearly empty */
 1371 rhaas                     713 GIC      174520 :     if (curpages == 0)
 1371 rhaas                     714 ECB             :     {
 1371 rhaas                     715 CBC        5014 :         *tuples = 0;
                                716            5014 :         *allvisfrac = 0;
 1371 rhaas                     717 GIC        5014 :         return;
                                718                 :     }
                                719                 : 
 1371 rhaas                     720 ECB             :     /* estimate number of tuples from previous tuple density */
  952 tgl                       721 CBC      169506 :     if (reltuples >= 0 && relpages > 0)
 1371 rhaas                     722 GIC      107615 :         density = reltuples / (double) relpages;
                                723                 :     else
                                724                 :     {
                                725                 :         /*
                                726                 :          * When we have no data because the relation was never yet vacuumed,
                                727                 :          * estimate tuple width from attribute datatypes.  We assume here that
                                728                 :          * the pages are completely full, which is OK for tables but is
                                729                 :          * probably an overestimate for indexes.  Fortunately
                                730                 :          * get_relation_info() can clamp the overestimate to the parent
                                731                 :          * table's size.
                                732                 :          *
                                733                 :          * Note: this code intentionally disregards alignment considerations,
                                734                 :          * because (a) that would be gilding the lily considering how crude
                                735                 :          * the estimate is, (b) it creates platform dependencies in the
                                736                 :          * default plans which are kind of a headache for regression testing,
                                737                 :          * and (c) different table AMs might use different padding schemes.
                                738                 :          */
                                739                 :         int32       tuple_width;
 1371 rhaas                     740 ECB             : 
 1371 rhaas                     741 CBC       61891 :         tuple_width = get_rel_data_width(rel, attr_widths);
 1371 rhaas                     742 GIC       61891 :         tuple_width += overhead_bytes_per_tuple;
 1371 rhaas                     743 ECB             :         /* note: integer division is intentional here */
 1371 rhaas                     744 GIC       61891 :         density = usable_bytes_per_page / tuple_width;
 1371 rhaas                     745 ECB             :     }
 1371 rhaas                     746 GIC      169506 :     *tuples = rint(density * (double) curpages);
                                747                 : 
                                748                 :     /*
                                749                 :      * We use relallvisible as-is, rather than scaling it up like we do for
                                750                 :      * the pages and tuples counts, on the theory that any pages added since
                                751                 :      * the last VACUUM are most likely not marked all-visible.  But costsize.c
                                752                 :      * wants it converted to a fraction.
 1371 rhaas                     753 ECB             :      */
 1371 rhaas                     754 CBC      169506 :     if (relallvisible == 0 || curpages <= 0)
                                755           96598 :         *allvisfrac = 0;
                                756           72908 :     else if ((double) relallvisible >= curpages)
 1371 rhaas                     757 GIC       34883 :         *allvisfrac = 1;
 1371 rhaas                     758 ECB             :     else
 1371 rhaas                     759 GIC       38025 :         *allvisfrac = (double) relallvisible / curpages;
                                760                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a