LCOV - differential code coverage report
Current view: top level - src/include/access - nbtree.h (source / functions)
Current: Differential Code Coverage, HEAD vs baseline 15
    Lines:     98.9 % covered — Total 94, Hit 93; differential counts: UIC 1, GBC 1, GIC 44, GNC 2, CBC 46, ECB 45, DCB 2
    Functions: 100.0 % covered — Total 17, Hit 17; differential counts: 16, 1, 17 (per-category column mapping lost in extraction — see full report)
Current Date: 2023-04-08 15:15:32
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit / not hit.
    TLA column codes (standard lcov/genhtml differential-coverage categories — see genhtml(1)):
    UIC = Uncovered Included Code, GBC = Gained coverage over Baseline Code,
    GIC = Gained coverage: Included Code, GNC = Gained coverage: New Code,
    CBC = Covered Baseline Code, ECB = Excluded Covered Baseline, DCB = Deleted Covered Baseline.

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * nbtree.h
       4                 :  *    header file for postgres btree access method implementation.
       5                 :  *
       6                 :  *
       7                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       8                 :  * Portions Copyright (c) 1994, Regents of the University of California
       9                 :  *
      10                 :  * src/include/access/nbtree.h
      11                 :  *
      12                 :  *-------------------------------------------------------------------------
      13                 :  */
      14                 : #ifndef NBTREE_H
      15                 : #define NBTREE_H
      16                 : 
      17                 : #include "access/amapi.h"
      18                 : #include "access/itup.h"
      19                 : #include "access/sdir.h"
      20                 : #include "access/tableam.h"
      21                 : #include "access/xlogreader.h"
      22                 : #include "catalog/pg_am_d.h"
      23                 : #include "catalog/pg_index.h"
      24                 : #include "lib/stringinfo.h"
      25                 : #include "storage/bufmgr.h"
      26                 : #include "storage/shm_toc.h"
      27                 : 
      28                 : /* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
      29                 : typedef uint16 BTCycleId;
      30                 : 
      31                 : /*
      32                 :  *  BTPageOpaqueData -- At the end of every page, we store a pointer
      33                 :  *  to both siblings in the tree.  This is used to do forward/backward
      34                 :  *  index scans.  The next-page link is also critical for recovery when
      35                 :  *  a search has navigated to the wrong page due to concurrent page splits
      36                 :  *  or deletions; see src/backend/access/nbtree/README for more info.
      37                 :  *
      38                 :  *  In addition, we store the page's btree level (counting upwards from
      39                 :  *  zero at a leaf page) as well as some flag bits indicating the page type
      40                 :  *  and status.  If the page is deleted, a BTDeletedPageData struct is stored
      41                 :  *  in the page's tuple area, while a standard BTPageOpaqueData struct is
      42                 :  *  stored in the page special area.
      43                 :  *
      44                 :  *  We also store a "vacuum cycle ID".  When a page is split while VACUUM is
      45                 :  *  processing the index, a nonzero value associated with the VACUUM run is
      46                 :  *  stored into both halves of the split page.  (If VACUUM is not running,
      47                 :  *  both pages receive zero cycleids.)  This allows VACUUM to detect whether
      48                 :  *  a page was split since it started, with a small probability of false match
      49                 :  *  if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
      50                 :  *  ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
      51                 :  *  (original) page, and set in the right page, but only if the next page
      52                 :  *  to its right has a different cycleid.
      53                 :  *
      54                 :  *  NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
      55                 :  *  instead.
      56                 :  *
      57                 :  *  NOTE: the btpo_level field used to be a union type in order to allow
      58                 :  *  deleted pages to store a 32-bit safexid in the same field.  We now store
      59                 :  *  64-bit/full safexid values using BTDeletedPageData instead.
      60                 :  */
      61                 : 
      62                 : typedef struct BTPageOpaqueData
      63                 : {
      64                 :     BlockNumber btpo_prev;      /* left sibling, or P_NONE if leftmost */
      65                 :     BlockNumber btpo_next;      /* right sibling, or P_NONE if rightmost */
      66                 :     uint32      btpo_level;     /* tree level --- zero for leaf pages */
      67                 :     uint16      btpo_flags;     /* flag bits, see below */
      68                 :     BTCycleId   btpo_cycleid;   /* vacuum cycle ID of latest split */
      69                 : } BTPageOpaqueData;
      70                 : 
      71                 : typedef BTPageOpaqueData *BTPageOpaque;
      72                 : 
      73                 : #define BTPageGetOpaque(page) ((BTPageOpaque) PageGetSpecialPointer(page))
      74                 : 
      75                 : /* Bits defined in btpo_flags */
      76                 : #define BTP_LEAF        (1 << 0)  /* leaf page, i.e. not internal page */
      77                 : #define BTP_ROOT        (1 << 1)  /* root page (has no parent) */
      78                 : #define BTP_DELETED     (1 << 2)  /* page has been deleted from tree */
      79                 : #define BTP_META        (1 << 3)  /* meta-page */
      80                 : #define BTP_HALF_DEAD   (1 << 4)  /* empty, but still in tree */
      81                 : #define BTP_SPLIT_END   (1 << 5)  /* rightmost page of split group */
      82                 : #define BTP_HAS_GARBAGE (1 << 6)  /* page has LP_DEAD tuples (deprecated) */
      83                 : #define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
      84                 : #define BTP_HAS_FULLXID (1 << 8)  /* contains BTDeletedPageData */
      85                 : 
      86                 : /*
      87                 :  * The max allowed value of a cycle ID is a bit less than 64K.  This is
      88                 :  * for convenience of pg_filedump and similar utilities: we want to use
      89                 :  * the last 2 bytes of special space as an index type indicator, and
      90                 :  * restricting cycle ID lets btree use that space for vacuum cycle IDs
      91                 :  * while still allowing index type to be identified.
      92                 :  */
      93                 : #define MAX_BT_CYCLE_ID     0xFF7F
      94                 : 
      95                 : 
      96                 : /*
      97                 :  * The Meta page is always the first page in the btree index.
      98                 :  * Its primary purpose is to point to the location of the btree root page.
      99                 :  * We also point to the "fast" root, which is the current effective root;
     100                 :  * see README for discussion.
     101                 :  */
     102                 : 
     103                 : typedef struct BTMetaPageData
     104                 : {
     105                 :     uint32      btm_magic;      /* should contain BTREE_MAGIC */
     106                 :     uint32      btm_version;    /* nbtree version (always <= BTREE_VERSION) */
     107                 :     BlockNumber btm_root;       /* current root location */
     108                 :     uint32      btm_level;      /* tree level of the root page */
     109                 :     BlockNumber btm_fastroot;   /* current "fast" root location */
     110                 :     uint32      btm_fastlevel;  /* tree level of the "fast" root page */
     111                 :     /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
     112                 : 
     113                 :     /* number of deleted, non-recyclable pages during last cleanup */
     114                 :     uint32      btm_last_cleanup_num_delpages;
     115                 :     /* number of heap tuples during last cleanup (deprecated) */
     116                 :     float8      btm_last_cleanup_num_heap_tuples;
     117                 : 
     118                 :     bool        btm_allequalimage;  /* are all columns "equalimage"? */
     119                 : } BTMetaPageData;
     120                 : 
     121                 : #define BTPageGetMeta(p) \
     122                 :     ((BTMetaPageData *) PageGetContents(p))
     123                 : 
     124                 : /*
     125                 :  * The current Btree version is 4.  That's what you'll get when you create
     126                 :  * a new index.
     127                 :  *
     128                 :  * Btree version 3 was used in PostgreSQL v11.  It is mostly the same as
     129                 :  * version 4, but heap TIDs were not part of the keyspace.  Index tuples
     130                 :  * with duplicate keys could be stored in any order.  We continue to
     131                 :  * support reading and writing Btree versions 2 and 3, so that they don't
     132                 :  * need to be immediately re-indexed at pg_upgrade.  In order to get the
     133                 :  * new heapkeyspace semantics, however, a REINDEX is needed.
     134                 :  *
     135                 :  * Deduplication is safe to use when the btm_allequalimage field is set to
     136                 :  * true.  It's safe to read the btm_allequalimage field on version 3, but
     137                 :  * only version 4 indexes make use of deduplication.  Even version 4
     138                 :  * indexes created on PostgreSQL v12 will need a REINDEX to make use of
     139                 :  * deduplication, though, since there is no other way to set
     140                 :  * btm_allequalimage to true (pg_upgrade hasn't been taught to set the
     141                 :  * metapage field).
     142                 :  *
     143                 :  * Btree version 2 is mostly the same as version 3.  There are two new
     144                 :  * fields in the metapage that were introduced in version 3.  A version 2
     145                 :  * metapage will be automatically upgraded to version 3 on the first
     146                 :  * insert to it.  INCLUDE indexes cannot use version 2.
     147                 :  */
     148                 : #define BTREE_METAPAGE  0       /* first page is meta */
     149                 : #define BTREE_MAGIC     0x053162    /* magic number in metapage */
     150                 : #define BTREE_VERSION   4       /* current version number */
     151                 : #define BTREE_MIN_VERSION   2   /* minimum supported version */
     152                 : #define BTREE_NOVAC_VERSION 3   /* version with all meta fields set */
     153                 : 
     154                 : /*
     155                 :  * Maximum size of a btree index entry, including its tuple header.
     156                 :  *
     157                 :  * We actually need to be able to fit three items on every page,
     158                 :  * so restrict any one item to 1/3 the per-page available space.
     159                 :  *
     160                 :  * There are rare cases where _bt_truncate() will need to enlarge
     161                 :  * a heap index tuple to make space for a tiebreaker heap TID
     162                 :  * attribute, which we account for here.
     163                 :  */
     164                 : #define BTMaxItemSize(page) \
     165                 :     (MAXALIGN_DOWN((PageGetPageSize(page) - \
     166                 :                     MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
     167                 :                     MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \
     168                 :                     MAXALIGN(sizeof(ItemPointerData)))
     169                 : #define BTMaxItemSizeNoHeapTid(page) \
     170                 :     MAXALIGN_DOWN((PageGetPageSize(page) - \
     171                 :                    MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
     172                 :                    MAXALIGN(sizeof(BTPageOpaqueData))) / 3)
     173                 : 
     174                 : /*
     175                 :  * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples
     176                 :  * that may be stored on a btree leaf page.  It is used to size the
     177                 :  * per-page temporary buffers.
     178                 :  *
     179                 :  * Note: we don't bother considering per-tuple overheads here to keep
     180                 :  * things simple (value is based on how many elements a single array of
     181                 :  * heap TIDs must have to fill the space between the page header and
     182                 :  * special area).  The value is slightly higher (i.e. more conservative)
     183                 :  * than necessary as a result, which is considered acceptable.
     184                 :  */
     185                 : #define MaxTIDsPerBTreePage \
     186                 :     (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \
     187                 :            sizeof(ItemPointerData))
     188                 : 
     189                 : /*
     190                 :  * The leaf-page fillfactor defaults to 90% but is user-adjustable.
     191                 :  * For pages above the leaf level, we use a fixed 70% fillfactor.
     192                 :  * The fillfactor is applied during index build and when splitting
     193                 :  * a rightmost page; when splitting non-rightmost pages we try to
     194                 :  * divide the data equally.  When splitting a page that's entirely
     195                 :  * filled with a single value (duplicates), the effective leaf-page
     196                 :  * fillfactor is 96%, regardless of whether the page is a rightmost
     197                 :  * page.
     198                 :  */
     199                 : #define BTREE_MIN_FILLFACTOR        10
     200                 : #define BTREE_DEFAULT_FILLFACTOR    90
     201                 : #define BTREE_NONLEAF_FILLFACTOR    70
     202                 : #define BTREE_SINGLEVAL_FILLFACTOR  96
     203                 : 
     204                 : /*
     205                 :  *  In general, the btree code tries to localize its knowledge about
     206                 :  *  page layout to a couple of routines.  However, we need a special
     207                 :  *  value to indicate "no page number" in those places where we expect
     208                 :  *  page numbers.  We can use zero for this because we never need to
     209                 :  *  make a pointer to the metadata page.
     210                 :  */
     211                 : 
     212                 : #define P_NONE          0
     213                 : 
     214                 : /*
     215                 :  * Macros to test whether a page is leftmost or rightmost on its tree level,
     216                 :  * as well as other state info kept in the opaque data.
     217                 :  */
     218                 : #define P_LEFTMOST(opaque)      ((opaque)->btpo_prev == P_NONE)
     219                 : #define P_RIGHTMOST(opaque)     ((opaque)->btpo_next == P_NONE)
     220                 : #define P_ISLEAF(opaque)        (((opaque)->btpo_flags & BTP_LEAF) != 0)
     221                 : #define P_ISROOT(opaque)        (((opaque)->btpo_flags & BTP_ROOT) != 0)
     222                 : #define P_ISDELETED(opaque)     (((opaque)->btpo_flags & BTP_DELETED) != 0)
     223                 : #define P_ISMETA(opaque)        (((opaque)->btpo_flags & BTP_META) != 0)
     224                 : #define P_ISHALFDEAD(opaque)    (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0)
     225                 : #define P_IGNORE(opaque)        (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
     226                 : #define P_HAS_GARBAGE(opaque)   (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
     227                 : #define P_INCOMPLETE_SPLIT(opaque)  (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
     228                 : #define P_HAS_FULLXID(opaque)   (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
     229                 : 
     230                 : /*
     231                 :  * BTDeletedPageData is the page contents of a deleted page
     232                 :  */
     233                 : typedef struct BTDeletedPageData
     234                 : {
     235                 :     FullTransactionId safexid;  /* See BTPageIsRecyclable() */
     236                 : } BTDeletedPageData;
     237                 : 
     238 ECB             : static inline void
     239 GIC        3420 : BTPageSetDeleted(Page page, FullTransactionId safexid)
     240                 : {
     241                 :     BTPageOpaque opaque;
     242                 :     PageHeader  header;
     243                 :     BTDeletedPageData *contents;
     244 ECB             : 
     245 CBC        3420 :     opaque = BTPageGetOpaque(page);
     246 GIC        3420 :     header = ((PageHeader) page);
     247 ECB             : 
     248 CBC        3420 :     opaque->btpo_flags &= ~BTP_HALF_DEAD;
     249            3420 :     opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
     250 GIC        3420 :     header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
     251 ECB             :         sizeof(BTDeletedPageData);
     252 GIC        3420 :     header->pd_upper = header->pd_special;
     253                 : 
     254 ECB             :     /* Set safexid in deleted page */
     255 CBC        3420 :     contents = ((BTDeletedPageData *) PageGetContents(page));
     256            3420 :     contents->safexid = safexid;
     257 GIC        3420 : }
     258                 : 
     259 ECB             : static inline FullTransactionId
     260 GIC         721 : BTPageGetDeleteXid(Page page)
     261                 : {
     262                 :     BTPageOpaque opaque;
     263                 :     BTDeletedPageData *contents;
     264                 : 
     265 ECB             :     /* We only expect to be called with a deleted page */
     266 CBC         721 :     Assert(!PageIsNew(page));
     267             721 :     opaque = BTPageGetOpaque(page);
     268 GIC         721 :     Assert(P_ISDELETED(opaque));
     269                 : 
     270 ECB             :     /* pg_upgrade'd deleted page -- must be safe to delete now */
     271 GBC         721 :     if (!P_HAS_FULLXID(opaque))
     272 UIC           0 :         return FirstNormalFullTransactionId;
     273                 : 
     274 ECB             :     /* Get safexid from deleted page */
     275 CBC         721 :     contents = ((BTDeletedPageData *) PageGetContents(page));
     276 GIC         721 :     return contents->safexid;
     277                 : }
     278                 : 
     279                 : /*
     280                 :  * Is an existing page recyclable?
     281                 :  *
     282                 :  * This exists to centralize the policy on which deleted pages are now safe to
     283                 :  * re-use.  However, _bt_pendingfsm_finalize() duplicates some of the same
     284                 :  * logic because it doesn't work directly with pages -- keep the two in sync.
     285                 :  *
     286                 :  * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
     287                 :  * them here (caller is responsible for that case themselves).  Caller might
     288                 :  * well need special handling for new pages anyway.
     289                 :  */
     290 ECB             : static inline bool
     291 GNC       33197 : BTPageIsRecyclable(Page page, Relation heaprel)
     292                 : {
     293                 :     BTPageOpaque opaque;
     294 ECB             : 
     295 GIC       33197 :     Assert(!PageIsNew(page));
     296                 : 
     297 ECB             :     /* Recycling okay iff page is deleted and safexid is old enough */
     298 CBC       33197 :     opaque = BTPageGetOpaque(page);
     299 GIC       33197 :     if (P_ISDELETED(opaque))
     300                 :     {
     301                 :         /*
     302                 :          * The page was deleted, but when? If it was just deleted, a scan
     303                 :          * might have seen the downlink to it, and will read the page later.
     304                 :          * As long as that can happen, we must keep the deleted page around as
     305                 :          * a tombstone.
     306                 :          *
     307                 :          * For that check if the deletion XID could still be visible to
     308                 :          * anyone. If not, then no scan that's still in progress could have
     309                 :          * seen its downlink, and we can recycle it.
     310                 :          */
     311 GNC         493 :         return GlobalVisCheckRemovableFullXid(heaprel, BTPageGetDeleteXid(page));
     312                 :     }
     313                 : 
     314 GIC       32704 :     return false;
     315                 : }
     316                 : 
     317                 : /*
     318                 :  * BTVacState and BTPendingFSM are private nbtree.c state used during VACUUM.
     319                 :  * They are exported for use by page deletion related code in nbtpage.c.
     320                 :  */
     321                 : typedef struct BTPendingFSM
     322                 : {
     323                 :     BlockNumber target;         /* Page deleted by current VACUUM */
     324                 :     FullTransactionId safexid;  /* Page's BTDeletedPageData.safexid */
     325                 : } BTPendingFSM;
     326                 : 
     327                 : typedef struct BTVacState
     328                 : {
     329                 :     IndexVacuumInfo *info;
     330                 :     IndexBulkDeleteResult *stats;
     331                 :     IndexBulkDeleteCallback callback;
     332                 :     void       *callback_state;
     333                 :     BTCycleId   cycleid;
     334                 :     MemoryContext pagedelcontext;
     335                 : 
     336                 :     /*
     337                 :      * _bt_pendingfsm_finalize() state
     338                 :      */
     339                 :     int         bufsize;        /* pendingpages space (in # elements) */
     340                 :     int         maxbufsize;     /* max bufsize that respects work_mem */
     341                 :     BTPendingFSM *pendingpages; /* One entry per newly deleted page */
     342                 :     int         npendingpages;  /* current # valid pendingpages */
     343                 : } BTVacState;
     344                 : 
     345                 : /*
     346                 :  *  Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
     347                 :  *  page.  The high key is not a tuple that is used to visit the heap.  It is
     348                 :  *  a pivot tuple (see "Notes on B-Tree tuple format" below for definition).
     349                 :  *  The high key on a page is required to be greater than or equal to any
     350                 :  *  other key that appears on the page.  If we find ourselves trying to
     351                 :  *  insert a key that is strictly > high key, we know we need to move right
     352                 :  *  (this should only happen if the page was split since we examined the
     353                 :  *  parent page).
     354                 :  *
     355                 :  *  Our insertion algorithm guarantees that we can use the initial least key
     356                 :  *  on our right sibling as the high key.  Once a page is created, its high
     357                 :  *  key changes only if the page is split.
     358                 :  *
     359                 :  *  On a non-rightmost page, the high key lives in item 1 and data items
     360                 :  *  start in item 2.  Rightmost pages have no high key, so we store data
     361                 :  *  items beginning in item 1.
     362                 :  */
     363                 : 
     364                 : #define P_HIKEY             ((OffsetNumber) 1)
     365                 : #define P_FIRSTKEY          ((OffsetNumber) 2)
     366                 : #define P_FIRSTDATAKEY(opaque)  (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
     367                 : 
     368                 : /*
     369                 :  * Notes on B-Tree tuple format, and key and non-key attributes:
     370                 :  *
     371                 :  * INCLUDE B-Tree indexes have non-key attributes.  These are extra
     372                 :  * attributes that may be returned by index-only scans, but do not influence
     373                 :  * the order of items in the index (formally, non-key attributes are not
     374                 :  * considered to be part of the key space).  Non-key attributes are only
     375                 :  * present in leaf index tuples whose item pointers actually point to heap
     376                 :  * tuples (non-pivot tuples).  _bt_check_natts() enforces the rules
     377                 :  * described here.
     378                 :  *
     379                 :  * Non-pivot tuple format (plain/non-posting variant):
     380                 :  *
     381                 :  *  t_tid | t_info | key values | INCLUDE columns, if any
     382                 :  *
     383                 :  * t_tid points to the heap TID, which is a tiebreaker key column as of
     384                 :  * BTREE_VERSION 4.
     385                 :  *
     386                 :  * Non-pivot tuples complement pivot tuples, which only have key columns.
     387                 :  * The sole purpose of pivot tuples is to represent how the key space is
     388                 :  * separated.  In general, any B-Tree index that has more than one level
     389                 :  * (i.e. any index that does not just consist of a metapage and a single
     390                 :  * leaf root page) must have some number of pivot tuples, since pivot
     391                 :  * tuples are used for traversing the tree.  Suffix truncation can omit
     392                 :  * trailing key columns when a new pivot is formed, which makes minus
     393                 :  * infinity their logical value.  Since BTREE_VERSION 4 indexes treat heap
     394                 :  * TID as a trailing key column that ensures that all index tuples are
     395                 :  * physically unique, it is necessary to represent heap TID as a trailing
     396                 :  * key column in pivot tuples, though very often this can be truncated
     397                 :  * away, just like any other key column. (Actually, the heap TID is
     398                 :  * omitted rather than truncated, since its representation is different to
     399                 :  * the non-pivot representation.)
     400                 :  *
     401                 :  * Pivot tuple format:
     402                 :  *
     403                 :  *  t_tid | t_info | key values | [heap TID]
     404                 :  *
     405                 :  * We store the number of columns present inside pivot tuples by abusing
     406                 :  * their t_tid offset field, since pivot tuples never need to store a real
     407                 :  * offset (pivot tuples generally store a downlink in t_tid, though).  The
     408                 :  * offset field only stores the number of columns/attributes when the
     409                 :  * INDEX_ALT_TID_MASK bit is set, which doesn't count the trailing heap
     410                 :  * TID column sometimes stored in pivot tuples -- that's represented by
     411                 :  * the presence of BT_PIVOT_HEAP_TID_ATTR.  The INDEX_ALT_TID_MASK bit in
     412                 :  * t_info is always set on BTREE_VERSION 4 pivot tuples, since
     413                 :  * BTreeTupleIsPivot() must work reliably on heapkeyspace versions.
     414                 :  *
     415                 :  * In version 2 or version 3 (!heapkeyspace) indexes, INDEX_ALT_TID_MASK
     416                 :  * might not be set in pivot tuples.  BTreeTupleIsPivot() won't work
     417                 :  * reliably as a result.  The number of columns stored is implicitly the
     418                 :  * same as the number of columns in the index, just like any non-pivot
     419                 :  * tuple. (The number of columns stored should not vary, since suffix
     420                 :  * truncation of key columns is unsafe within any !heapkeyspace index.)
     421                 :  *
     422                 :  * The 12 least significant bits from t_tid's offset number are used to
     423                 :  * represent the number of key columns within a pivot tuple.  This leaves 4
     424                 :  * status bits (BT_STATUS_OFFSET_MASK bits), which are shared by all tuples
     425                 :  * that have the INDEX_ALT_TID_MASK bit set (set in t_info) to store basic
     426                 :  * tuple metadata.  BTreeTupleIsPivot() and BTreeTupleIsPosting() use the
     427                 :  * BT_STATUS_OFFSET_MASK bits.
     428                 :  *
     429                 :  * Sometimes non-pivot tuples also use a representation that repurposes
     430                 :  * t_tid to store metadata rather than a TID.  PostgreSQL v13 introduced a
     431                 :  * new non-pivot tuple format to support deduplication: posting list
     432                 :  * tuples.  Deduplication merges together multiple equal non-pivot tuples
     433                 :  * into a logically equivalent, space efficient representation.  A posting
     434                 :  * list is an array of ItemPointerData elements.  Non-pivot tuples are
     435                 :  * merged together to form posting list tuples lazily, at the point where
     436                 :  * we'd otherwise have to split a leaf page.
     437                 :  *
     438                 :  * Posting tuple format (alternative non-pivot tuple representation):
     439                 :  *
     440                 :  *  t_tid | t_info | key values | posting list (TID array)
     441                 :  *
     442                 :  * Posting list tuples are recognized as such by having the
     443                 :  * INDEX_ALT_TID_MASK status bit set in t_info and the BT_IS_POSTING status
     444                 :  * bit set in t_tid's offset number.  These flags redefine the content of
     445                 :  * the posting tuple's t_tid to store the location of the posting list
     446                 :  * (instead of a block number), as well as the total number of heap TIDs
     447                 :  * present in the tuple (instead of a real offset number).
     448                 :  *
     449                 :  * The 12 least significant bits from t_tid's offset number are used to
     450                 :  * represent the number of heap TIDs present in the tuple, leaving 4 status
     451                 :  * bits (the BT_STATUS_OFFSET_MASK bits).  Like any non-pivot tuple, the
     452                 :  * number of columns stored is always implicitly the total number in the
     453                 :  * index (in practice there can never be non-key columns stored, since
     454                 :  * deduplication is not supported with INCLUDE indexes).
     455                 :  */
     456                 : #define INDEX_ALT_TID_MASK          INDEX_AM_RESERVED_BIT
     457                 : 
     458                 : /* Item pointer offset bit masks */
     459                 : #define BT_OFFSET_MASK              0x0FFF
     460                 : #define BT_STATUS_OFFSET_MASK       0xF000
     461                 : /* BT_STATUS_OFFSET_MASK status bits */
     462                 : #define BT_PIVOT_HEAP_TID_ATTR      0x1000
     463                 : #define BT_IS_POSTING               0x2000
     464                 : 
     465                 : /*
     466                 :  * Mask allocated for number of keys in index tuple must be able to fit
     467                 :  * maximum possible number of index attributes
     468                 :  */
     469                 : StaticAssertDecl(BT_OFFSET_MASK >= INDEX_MAX_KEYS,
     470                 :                  "BT_OFFSET_MASK can't fit INDEX_MAX_KEYS");
     471                 : 
     472                 : /*
     473                 :  * Note: BTreeTupleIsPivot() can have false negatives (but not false
     474                 :  * positives) when used with !heapkeyspace indexes
     475                 :  */
     476                 : static inline bool
     477       710208502 : BTreeTupleIsPivot(IndexTuple itup)
     478                 : {
     479 CBC   710208502 :     if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
     480 GIC   484022165 :         return false;
     481 ECB             :     /* absence of BT_IS_POSTING in offset number indicates pivot tuple */
     482 CBC   226186337 :     if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) != 0)
     483 GIC    11241898 :         return false;
     484 ECB             : 
     485 CBC   214944439 :     return true;
     486                 : }
     487 ECB             : 
     488                 : static inline bool
     489 GIC   574156255 : BTreeTupleIsPosting(IndexTuple itup)
     490                 : {
     491 CBC   574156255 :     if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
     492 GIC   363628961 :         return false;
     493 ECB             :     /* presence of BT_IS_POSTING in offset number indicates posting tuple */
     494 CBC   210527294 :     if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & BT_IS_POSTING) == 0)
     495 GIC   155203594 :         return false;
     496 ECB             : 
     497 CBC    55323700 :     return true;
     498                 : }
     499 ECB             : 
     500                 : static inline void
     501 GIC      378731 : BTreeTupleSetPosting(IndexTuple itup, uint16 nhtids, int postingoffset)
     502                 : {
     503 CBC      378731 :     Assert(nhtids > 1);
     504 GIC      378731 :     Assert((nhtids & BT_STATUS_OFFSET_MASK) == 0);
     505 CBC      378731 :     Assert((size_t) postingoffset == MAXALIGN(postingoffset));
     506          378731 :     Assert(postingoffset < INDEX_SIZE_MASK);
     507          378731 :     Assert(!BTreeTupleIsPivot(itup));
     508 ECB             : 
     509 CBC      378731 :     itup->t_info |= INDEX_ALT_TID_MASK;
     510 GIC      378731 :     ItemPointerSetOffsetNumber(&itup->t_tid, (nhtids | BT_IS_POSTING));
     511 CBC      378731 :     ItemPointerSetBlockNumber(&itup->t_tid, postingoffset);
     512          378731 : }
     513 ECB             : 
     514                 : static inline uint16
     515 GIC    18767101 : BTreeTupleGetNPosting(IndexTuple posting)
     516                 : {
     517 ECB             :     OffsetNumber existing;
     518                 : 
     519 GIC    18767101 :     Assert(BTreeTupleIsPosting(posting));
     520                 : 
     521 CBC    18767101 :     existing = ItemPointerGetOffsetNumberNoCheck(&posting->t_tid);
     522 GIC    18767101 :     return (existing & BT_OFFSET_MASK);
     523 ECB             : }
     524                 : 
     525                 : static inline uint32
     526 GIC    21080154 : BTreeTupleGetPostingOffset(IndexTuple posting)
     527                 : {
     528 CBC    21080154 :     Assert(BTreeTupleIsPosting(posting));
     529                 : 
     530        21080154 :     return ItemPointerGetBlockNumberNoCheck(&posting->t_tid);
     531                 : }
     532 ECB             : 
     533                 : static inline ItemPointer
     534 GIC    18895118 : BTreeTupleGetPosting(IndexTuple posting)
     535                 : {
     536 CBC    37790236 :     return (ItemPointer) ((char *) posting +
     537 GIC    18895118 :                           BTreeTupleGetPostingOffset(posting));
     538 ECB             : }
     539                 : 
     540                 : static inline ItemPointer
     541 GIC    15471349 : BTreeTupleGetPostingN(IndexTuple posting, int n)
     542                 : {
     543 CBC    15471349 :     return BTreeTupleGetPosting(posting) + n;
     544                 : }
     545 ECB             : 
     546                 : /*
     547                 :  * Get/set downlink block number in pivot tuple.
     548                 :  *
     549                 :  * Note: Cannot assert that tuple is a pivot tuple.  If we did so then
     550                 :  * !heapkeyspace indexes would exhibit false positive assertion failures.
     551                 :  */
     552                 : static inline BlockNumber
     553 GIC    12322514 : BTreeTupleGetDownLink(IndexTuple pivot)
     554                 : {
     555 CBC    12322514 :     return ItemPointerGetBlockNumberNoCheck(&pivot->t_tid);
     556                 : }
     557 ECB             : 
     558                 : static inline void
     559 GIC       72893 : BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)
     560                 : {
     561 CBC       72893 :     ItemPointerSetBlockNumber(&pivot->t_tid, blkno);
     562 GIC       72893 : }
     563 ECB             : 
     564                 : /*
     565                 :  * Get number of attributes within tuple.
     566                 :  *
     567                 :  * Note that this does not include an implicit tiebreaker heap TID
     568                 :  * attribute, if any.  Note also that the number of key attributes must be
     569                 :  * explicitly represented in all heapkeyspace pivot tuples.
     570                 :  *
     571                 :  * Note: This is defined as a macro rather than an inline function to
     572                 :  * avoid including rel.h.
     573                 :  */
     574                 : #define BTreeTupleGetNAtts(itup, rel)   \
     575                 :     ( \
     576                 :         (BTreeTupleIsPivot(itup)) ? \
     577                 :         ( \
     578                 :             ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_OFFSET_MASK \
     579                 :         ) \
     580                 :         : \
     581                 :         IndexRelationGetNumberOfAttributes(rel) \
     582                 :     )
     583                 : 
     584                 : /*
     585                 :  * Set number of key attributes in tuple.
     586                 :  *
     587                 :  * The heap TID tiebreaker attribute bit may also be set here, indicating that
     588                 :  * a heap TID value will be stored at the end of the tuple (i.e. using the
     589                 :  * special pivot tuple representation).
     590                 :  */
     591                 : static inline void
     592 GIC       96252 : BTreeTupleSetNAtts(IndexTuple itup, uint16 nkeyatts, bool heaptid)
     593                 : {
     594 CBC       96252 :     Assert(nkeyatts <= INDEX_MAX_KEYS);
     595 GIC       96252 :     Assert((nkeyatts & BT_STATUS_OFFSET_MASK) == 0);
     596 CBC       96252 :     Assert(!heaptid || nkeyatts > 0);
     597           96252 :     Assert(!BTreeTupleIsPivot(itup) || nkeyatts == 0);
     598 ECB             : 
     599 CBC       96252 :     itup->t_info |= INDEX_ALT_TID_MASK;
     600                 : 
     601           96252 :     if (heaptid)
     602 GIC         516 :         nkeyatts |= BT_PIVOT_HEAP_TID_ATTR;
     603 ECB             : 
     604                 :     /* BT_IS_POSTING bit is deliberately unset here */
     605 GIC       96252 :     ItemPointerSetOffsetNumber(&itup->t_tid, nkeyatts);
     606           96252 :     Assert(BTreeTupleIsPivot(itup));
     607 CBC       96252 : }
     608 ECB             : 
     609                 : /*
     610                 :  * Get/set leaf page's "top parent" link from its high key.  Used during page
     611                 :  * deletion.
     612                 :  *
     613                 :  * Note: Cannot assert that tuple is a pivot tuple.  If we did so then
     614                 :  * !heapkeyspace indexes would exhibit false positive assertion failures.
     615                 :  */
     616                 : static inline BlockNumber
     617 GIC        2755 : BTreeTupleGetTopParent(IndexTuple leafhikey)
     618                 : {
     619 CBC        2755 :     return ItemPointerGetBlockNumberNoCheck(&leafhikey->t_tid);
     620                 : }
     621 ECB             : 
     622                 : static inline void
     623 GIC        3420 : BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)
     624                 : {
     625 CBC        3420 :     ItemPointerSetBlockNumber(&leafhikey->t_tid, blkno);
     626 GIC        3420 :     BTreeTupleSetNAtts(leafhikey, 0, false);
     627 CBC        3420 : }
     628 ECB             : 
     629                 : /*
     630                 :  * Get tiebreaker heap TID attribute, if any.
     631                 :  *
     632                 :  * This returns the first/lowest heap TID in the case of a posting list tuple.
     633                 :  */
     634                 : static inline ItemPointer
     635 GIC    72430312 : BTreeTupleGetHeapTID(IndexTuple itup)
     636                 : {
     637 CBC    72430312 :     if (BTreeTupleIsPivot(itup))
     638                 :     {
     639 ECB             :         /* Pivot tuple heap TID representation? */
     640 GIC    50192192 :         if ((ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) &
     641                 :              BT_PIVOT_HEAP_TID_ATTR) != 0)
     642 CBC      243318 :             return (ItemPointer) ((char *) itup + IndexTupleSize(itup) -
     643                 :                                   sizeof(ItemPointerData));
     644 ECB             : 
     645                 :         /* Heap TID attribute was truncated */
     646 GIC    49948874 :         return NULL;
     647                 :     }
     648 CBC    22238120 :     else if (BTreeTupleIsPosting(itup))
     649 GIC      989727 :         return BTreeTupleGetPosting(itup);
     650 ECB             : 
     651 CBC    21248393 :     return &itup->t_tid;
     652                 : }
     653 ECB             : 
     654                 : /*
     655                 :  * Get maximum heap TID attribute, which could be the only TID in the case of
     656                 :  * a non-pivot tuple that does not have a posting list tuple.
     657                 :  *
     658                 :  * Works with non-pivot tuples only.
     659                 :  */
     660                 : static inline ItemPointer
     661 GIC      170967 : BTreeTupleGetMaxHeapTID(IndexTuple itup)
     662                 : {
     663 CBC      170967 :     Assert(!BTreeTupleIsPivot(itup));
     664                 : 
     665          170967 :     if (BTreeTupleIsPosting(itup))
     666                 :     {
     667          170471 :         uint16      nposting = BTreeTupleGetNPosting(itup);
     668                 : 
     669          170471 :         return BTreeTupleGetPostingN(itup, nposting - 1);
     670                 :     }
     671 ECB             : 
     672 GIC         496 :     return &itup->t_tid;
     673                 : }
     674 ECB             : 
     675                 : /*
     676                 :  *  Operator strategy numbers for B-tree have been moved to access/stratnum.h,
     677                 :  *  because many places need to use them in ScanKeyInit() calls.
     678                 :  *
     679                 :  *  The strategy numbers are chosen so that we can commute them by
     680                 :  *  subtraction, thus:
     681                 :  */
     682                 : #define BTCommuteStrategyNumber(strat)  (BTMaxStrategyNumber + 1 - (strat))
     683                 : 
     684                 : /*
     685                 :  *  When a new operator class is declared, we require that the user
     686                 :  *  supply us with an amproc procedure (BTORDER_PROC) for determining
     687                 :  *  whether, for two keys a and b, a < b, a = b, or a > b.  This routine
     688                 :  *  must return < 0, 0, > 0, respectively, in these three cases.
     689                 :  *
     690                 :  *  To facilitate accelerated sorting, an operator class may choose to
     691                 :  *  offer a second procedure (BTSORTSUPPORT_PROC).  For full details, see
     692                 :  *  src/include/utils/sortsupport.h.
     693                 :  *
     694                 :  *  To support window frames defined by "RANGE offset PRECEDING/FOLLOWING",
     695                 :  *  an operator class may choose to offer a third amproc procedure
     696                 :  *  (BTINRANGE_PROC), independently of whether it offers sortsupport.
     697                 :  *  For full details, see doc/src/sgml/btree.sgml.
     698                 :  *
     699                 :  *  To facilitate B-Tree deduplication, an operator class may choose to
      700                 :  *  offer a fourth amproc procedure (BTEQUALIMAGE_PROC).  For full details,
     701                 :  *  see doc/src/sgml/btree.sgml.
     702                 :  */
     703                 : 
     704                 : #define BTORDER_PROC        1
     705                 : #define BTSORTSUPPORT_PROC  2
     706                 : #define BTINRANGE_PROC      3
     707                 : #define BTEQUALIMAGE_PROC   4
     708                 : #define BTOPTIONS_PROC      5
     709                 : #define BTNProcs            5
     710                 : 
     711                 : /*
     712                 :  *  We need to be able to tell the difference between read and write
     713                 :  *  requests for pages, in order to do locking correctly.
     714                 :  */
     715                 : 
     716                 : #define BT_READ         BUFFER_LOCK_SHARE
     717                 : #define BT_WRITE        BUFFER_LOCK_EXCLUSIVE
     718                 : 
     719                 : /*
     720                 :  * BTStackData -- As we descend a tree, we push the location of pivot
     721                 :  * tuples whose downlink we are about to follow onto a private stack.  If
     722                 :  * we split a leaf, we use this stack to walk back up the tree and insert
     723                 :  * data into its parent page at the correct location.  We also have to
     724                 :  * recursively insert into the grandparent page if and when the parent page
     725                 :  * splits.  Our private stack can become stale due to concurrent page
     726                 :  * splits and page deletions, but it should never give us an irredeemably
     727                 :  * bad picture.
     728                 :  */
     729                 : typedef struct BTStackData
     730                 : {
     731                 :     BlockNumber bts_blkno;
     732                 :     OffsetNumber bts_offset;
     733                 :     struct BTStackData *bts_parent;
     734                 : } BTStackData;
     735                 : 
     736                 : typedef BTStackData *BTStack;
     737                 : 
     738                 : /*
     739                 :  * BTScanInsertData is the btree-private state needed to find an initial
     740                 :  * position for an indexscan, or to insert new tuples -- an "insertion
     741                 :  * scankey" (not to be confused with a search scankey).  It's used to descend
     742                 :  * a B-Tree using _bt_search.
     743                 :  *
     744                 :  * heapkeyspace indicates if we expect all keys in the index to be physically
     745                 :  * unique because heap TID is used as a tiebreaker attribute, and if index may
     746                 :  * have truncated key attributes in pivot tuples.  This is actually a property
     747                 :  * of the index relation itself (not an indexscan).  heapkeyspace indexes are
     748                 :  * indexes whose version is >= version 4.  It's convenient to keep this close
     749                 :  * by, rather than accessing the metapage repeatedly.
     750                 :  *
     751                 :  * allequalimage is set to indicate that deduplication is safe for the index.
     752                 :  * This is also a property of the index relation rather than an indexscan.
     753                 :  *
     754                 :  * anynullkeys indicates if any of the keys had NULL value when scankey was
     755                 :  * built from index tuple (note that already-truncated tuple key attributes
     756                 :  * set NULL as a placeholder key value, which also affects value of
     757                 :  * anynullkeys).  This is a convenience for unique index non-pivot tuple
     758                 :  * insertion, which usually temporarily unsets scantid, but shouldn't iff
     759                 :  * anynullkeys is true.  Value generally matches non-pivot tuple's HasNulls
     760                 :  * bit, but may not when inserting into an INCLUDE index (tuple header value
     761                 :  * is affected by the NULL-ness of both key and non-key attributes).
     762                 :  *
     763                 :  * When nextkey is false (the usual case), _bt_search and _bt_binsrch will
     764                 :  * locate the first item >= scankey.  When nextkey is true, they will locate
     765                 :  * the first item > scan key.
     766                 :  *
     767                 :  * pivotsearch is set to true by callers that want to re-find a leaf page
     768                 :  * using a scankey built from a leaf page's high key.  Most callers set this
     769                 :  * to false.
     770                 :  *
     771                 :  * scantid is the heap TID that is used as a final tiebreaker attribute.  It
     772                 :  * is set to NULL when index scan doesn't need to find a position for a
     773                 :  * specific physical tuple.  Must be set when inserting new tuples into
     774                 :  * heapkeyspace indexes, since every tuple in the tree unambiguously belongs
     775                 :  * in one exact position (it's never set with !heapkeyspace indexes, though).
     776                 :  * Despite the representational difference, nbtree search code considers
     777                 :  * scantid to be just another insertion scankey attribute.
     778                 :  *
     779                 :  * scankeys is an array of scan key entries for attributes that are compared
     780                 :  * before scantid (user-visible attributes).  keysz is the size of the array.
     781                 :  * During insertion, there must be a scan key for every attribute, but when
     782                 :  * starting a regular index scan some can be omitted.  The array is used as a
     783                 :  * flexible array member, though it's sized in a way that makes it possible to
     784                 :  * use stack allocations.  See nbtree/README for full details.
     785                 :  */
     786                 : typedef struct BTScanInsertData
     787                 : {
     788                 :     bool        heapkeyspace;
     789                 :     bool        allequalimage;
     790                 :     bool        anynullkeys;
     791                 :     bool        nextkey;
     792                 :     bool        pivotsearch;
     793                 :     ItemPointer scantid;        /* tiebreaker for scankeys */
     794                 :     int         keysz;          /* Size of scankeys array */
     795                 :     ScanKeyData scankeys[INDEX_MAX_KEYS];   /* Must appear last */
     796                 : } BTScanInsertData;
     797                 : 
     798                 : typedef BTScanInsertData *BTScanInsert;
     799                 : 
     800                 : /*
     801                 :  * BTInsertStateData is a working area used during insertion.
     802                 :  *
     803                 :  * This is filled in after descending the tree to the first leaf page the new
     804                 :  * tuple might belong on.  Tracks the current position while performing
     805                 :  * uniqueness check, before we have determined which exact page to insert
     806                 :  * to.
     807                 :  *
     808                 :  * (This should be private to nbtinsert.c, but it's also used by
     809                 :  * _bt_binsrch_insert)
     810                 :  */
     811                 : typedef struct BTInsertStateData
     812                 : {
     813                 :     IndexTuple  itup;           /* Item we're inserting */
     814                 :     Size        itemsz;         /* Size of itup -- should be MAXALIGN()'d */
     815                 :     BTScanInsert itup_key;      /* Insertion scankey */
     816                 : 
     817                 :     /* Buffer containing leaf page we're likely to insert itup on */
     818                 :     Buffer      buf;
     819                 : 
     820                 :     /*
     821                 :      * Cache of bounds within the current buffer.  Only used for insertions
     822                 :      * where _bt_check_unique is called.  See _bt_binsrch_insert and
     823                 :      * _bt_findinsertloc for details.
     824                 :      */
     825                 :     bool        bounds_valid;
     826                 :     OffsetNumber low;
     827                 :     OffsetNumber stricthigh;
     828                 : 
     829                 :     /*
     830                 :      * if _bt_binsrch_insert found the location inside existing posting list,
     831                 :      * save the position inside the list.  -1 sentinel value indicates overlap
     832                 :      * with an existing posting list tuple that has its LP_DEAD bit set.
     833                 :      */
     834                 :     int         postingoff;
     835                 : } BTInsertStateData;
     836                 : 
     837                 : typedef BTInsertStateData *BTInsertState;
     838                 : 
     839                 : /*
      840                 :  * State used to represent an individual pending tuple during
     841                 :  * deduplication.
     842                 :  */
     843                 : typedef struct BTDedupInterval
     844                 : {
     845                 :     OffsetNumber baseoff;
     846                 :     uint16      nitems;
     847                 : } BTDedupInterval;
     848                 : 
     849                 : /*
     850                 :  * BTDedupStateData is a working area used during deduplication.
     851                 :  *
     852                 :  * The status info fields track the state of a whole-page deduplication pass.
     853                 :  * State about the current pending posting list is also tracked.
     854                 :  *
     855                 :  * A pending posting list is comprised of a contiguous group of equal items
     856                 :  * from the page, starting from page offset number 'baseoff'.  This is the
     857                 :  * offset number of the "base" tuple for new posting list.  'nitems' is the
     858                 :  * current total number of existing items from the page that will be merged to
     859                 :  * make a new posting list tuple, including the base tuple item.  (Existing
     860                 :  * items may themselves be posting list tuples, or regular non-pivot tuples.)
     861                 :  *
     862                 :  * The total size of the existing tuples to be freed when pending posting list
     863                 :  * is processed gets tracked by 'phystupsize'.  This information allows
     864                 :  * deduplication to calculate the space saving for each new posting list
     865                 :  * tuple, and for the entire pass over the page as a whole.
     866                 :  */
     867                 : typedef struct BTDedupStateData
     868                 : {
     869                 :     /* Deduplication status info for entire pass over page */
     870                 :     bool        deduplicate;    /* Still deduplicating page? */
     871                 :     int         nmaxitems;      /* Number of max-sized tuples so far */
     872                 :     Size        maxpostingsize; /* Limit on size of final tuple */
     873                 : 
     874                 :     /* Metadata about base tuple of current pending posting list */
     875                 :     IndexTuple  base;           /* Use to form new posting list */
     876                 :     OffsetNumber baseoff;       /* page offset of base */
     877                 :     Size        basetupsize;    /* base size without original posting list */
     878                 : 
     879                 :     /* Other metadata about pending posting list */
     880                 :     ItemPointer htids;          /* Heap TIDs in pending posting list */
     881                 :     int         nhtids;         /* Number of heap TIDs in htids array */
     882                 :     int         nitems;         /* Number of existing tuples/line pointers */
     883                 :     Size        phystupsize;    /* Includes line pointer overhead */
     884                 : 
     885                 :     /*
     886                 :      * Array of tuples to go on new version of the page.  Contains one entry
     887                 :      * for each group of consecutive items.  Note that existing tuples that
     888                 :      * will not become posting list tuples do not appear in the array (they
     889                 :      * are implicitly unchanged by deduplication pass).
     890                 :      */
     891                 :     int         nintervals;     /* current number of intervals in array */
     892                 :     BTDedupInterval intervals[MaxIndexTuplesPerPage];
     893                 : } BTDedupStateData;
     894                 : 
     895                 : typedef BTDedupStateData *BTDedupState;
     896                 : 
     897                 : /*
     898                 :  * BTVacuumPostingData is state that represents how to VACUUM (or delete) a
     899                 :  * posting list tuple when some (though not all) of its TIDs are to be
     900                 :  * deleted.
     901                 :  *
     902                 :  * Convention is that itup field is the original posting list tuple on input,
     903                 :  * and palloc()'d final tuple used to overwrite existing tuple on output.
     904                 :  */
     905                 : typedef struct BTVacuumPostingData
     906                 : {
     907                 :     /* Tuple that will be/was updated */
     908                 :     IndexTuple  itup;
     909                 :     OffsetNumber updatedoffset;
     910                 : 
     911                 :     /* State needed to describe final itup in WAL */
     912                 :     uint16      ndeletedtids;
     913                 :     uint16      deletetids[FLEXIBLE_ARRAY_MEMBER];
     914                 : } BTVacuumPostingData;
     915                 : 
     916                 : typedef BTVacuumPostingData *BTVacuumPosting;
     917                 : 
     918                 : /*
     919                 :  * BTScanOpaqueData is the btree-private state needed for an indexscan.
     920                 :  * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
     921                 :  * details of the preprocessing), information about the current location
     922                 :  * of the scan, and information about the marked location, if any.  (We use
     923                 :  * BTScanPosData to represent the data needed for each of current and marked
     924                 :  * locations.)  In addition we can remember some known-killed index entries
     925                 :  * that must be marked before we can move off the current page.
     926                 :  *
     927                 :  * Index scans work a page at a time: we pin and read-lock the page, identify
     928                 :  * all the matching items on the page and save them in BTScanPosData, then
     929                 :  * release the read-lock while returning the items to the caller for
     930                 :  * processing.  This approach minimizes lock/unlock traffic.  Note that we
     931                 :  * keep the pin on the index page until the caller is done with all the items
     932                 :  * (this is needed for VACUUM synchronization, see nbtree/README).  When we
     933                 :  * are ready to step to the next page, if the caller has told us any of the
     934                 :  * items were killed, we re-lock the page to mark them killed, then unlock.
     935                 :  * Finally we drop the pin and step to the next page in the appropriate
     936                 :  * direction.
     937                 :  *
     938                 :  * If we are doing an index-only scan, we save the entire IndexTuple for each
     939                 :  * matched item, otherwise only its heap TID and offset.  The IndexTuples go
     940                 :  * into a separate workspace array; each BTScanPosItem stores its tuple's
     941                 :  * offset within that array.  Posting list tuples store a "base" tuple once,
     942                 :  * allowing the same key to be returned for each TID in the posting list
     943                 :  * tuple.
     944                 :  */
     945                 : 
typedef struct BTScanPosItem    /* what we remember about each match */
{
    ItemPointerData heapTid;    /* TID of referenced heap item */
    OffsetNumber indexOffset;   /* index item's location within page */
    LocationIndex tupleOffset;  /* IndexTuple's offset in workspace, if any;
                                 * only used by index-only scans (see
                                 * BTScanPosData.nextTupleOffset) */
} BTScanPosItem;
     952                 : 
/*
 * State for one scanned leaf page: the pinned buffer, its identity in the
 * tree, and the array of matching items harvested from it.  See the comment
 * block above for the overall page-at-a-time scan protocol.
 */
typedef struct BTScanPosData
{
    Buffer      buf;            /* if valid, the buffer is pinned */

    XLogRecPtr  lsn;            /* pos in the WAL stream when page was read */
    BlockNumber currPage;       /* page referenced by items array */
    BlockNumber nextPage;       /* page's right link when we scanned it */

    /*
     * moreLeft and moreRight track whether we think there may be matching
     * index entries to the left and right of the current page, respectively.
     * We can clear the appropriate one of these flags when _bt_checkkeys()
     * returns continuescan = false.
     */
    bool        moreLeft;
    bool        moreRight;

    /*
     * If we are doing an index-only scan, nextTupleOffset is the first free
     * location in the associated tuple storage workspace.
     */
    int         nextTupleOffset;

    /*
     * The items array is always ordered in index order (ie, increasing
     * indexoffset).  When scanning backwards it is convenient to fill the
     * array back-to-front, so we start at the last slot and fill downwards.
     * Hence we need both a first-valid-entry and a last-valid-entry counter.
     * itemIndex is a cursor showing which entry was last returned to caller.
     */
    int         firstItem;      /* first valid index in items[] */
    int         lastItem;       /* last valid index in items[] */
    int         itemIndex;      /* current index in items[] */

    BTScanPosItem items[MaxTIDsPerBTreePage];   /* MUST BE LAST */
} BTScanPosData;

typedef BTScanPosData *BTScanPos;
     991                 : 
/*
 * BTScanPosIsPinned is an expression (comma form) so it can appear in
 * conditions; its AssertMacro enforces the invariant that buf may only be
 * valid (pinned) while currPage identifies a real page.  The Unpin macros
 * are statements.  BTScanPosUnpin assumes the pin is held; use
 * BTScanPosUnpinIfPinned when that is not certain.
 */
#define BTScanPosIsPinned(scanpos) \
( \
    AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
                !BufferIsValid((scanpos).buf)), \
    BufferIsValid((scanpos).buf) \
)
#define BTScanPosUnpin(scanpos) \
    do { \
        ReleaseBuffer((scanpos).buf); \
        (scanpos).buf = InvalidBuffer; \
    } while (0)
#define BTScanPosUnpinIfPinned(scanpos) \
    do { \
        if (BTScanPosIsPinned(scanpos)) \
            BTScanPosUnpin(scanpos); \
    } while (0)
    1008                 : 
/*
 * A position is "valid" when currPage is set, independently of whether the
 * buffer pin is still held (the pin can be dropped while the saved items
 * are consumed).  BTScanPosInvalidate resets all position fields; note that
 * it simply overwrites buf without releasing it, so any pin must already
 * have been dropped (e.g. via BTScanPosUnpinIfPinned).
 */
#define BTScanPosIsValid(scanpos) \
( \
    AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
                !BufferIsValid((scanpos).buf)), \
    BlockNumberIsValid((scanpos).currPage) \
)
#define BTScanPosInvalidate(scanpos) \
    do { \
        (scanpos).currPage = InvalidBlockNumber; \
        (scanpos).nextPage = InvalidBlockNumber; \
        (scanpos).buf = InvalidBuffer; \
        (scanpos).lsn = InvalidXLogRecPtr; \
        (scanpos).nextTupleOffset = 0; \
    } while (0)
    1023                 : 
/*
 * We need one of these for each equality-type SK_SEARCHARRAY scan key.
 * The scan steps through elem_values one element at a time; mark_elem
 * supports mark/restore (see _bt_mark_array_keys/_bt_restore_array_keys).
 */
typedef struct BTArrayKeyInfo
{
    int         scan_key;       /* index of associated key in arrayKeyData */
    int         cur_elem;       /* index of current element in elem_values */
    int         mark_elem;      /* index of marked element in elem_values */
    int         num_elems;      /* number of elems in current array value */
    Datum      *elem_values;    /* array of num_elems Datums */
} BTArrayKeyInfo;
    1033                 : 
/*
 * Private working state for an entire btree index scan (presumably hung off
 * the scan descriptor's opaque pointer -- the "modified copy of
 * scan->keyData" comment below implies an IndexScanDesc caller).
 */
typedef struct BTScanOpaqueData
{
    /* these fields are set by _bt_preprocess_keys(): */
    bool        qual_ok;        /* false if qual can never be satisfied */
    int         numberOfKeys;   /* number of preprocessed scan keys */
    ScanKey     keyData;        /* array of preprocessed scan keys */

    /* workspace for SK_SEARCHARRAY support */
    ScanKey     arrayKeyData;   /* modified copy of scan->keyData */
    int         numArrayKeys;   /* number of equality-type array keys (-1 if
                                 * there are any unsatisfiable array keys) */
    int         arrayKeyCount;  /* count indicating number of array scan keys
                                 * processed */
    BTArrayKeyInfo *arrayKeys;  /* info about each equality-type array key */
    MemoryContext arrayContext; /* scan-lifespan context for array data */

    /* info about killed items if any (killedItems is NULL if never used) */
    int        *killedItems;    /* currPos.items indexes of killed items */
    int         numKilled;      /* number of currently stored items */

    /*
     * If we are doing an index-only scan, these are the tuple storage
     * workspaces for the currPos and markPos respectively.  Each is of size
     * BLCKSZ, so it can hold as much as a full page's worth of tuples.
     */
    char       *currTuples;     /* tuple storage for currPos */
    char       *markTuples;     /* tuple storage for markPos */

    /*
     * If the marked position is on the same page as current position, we
     * don't use markPos, but just keep the marked itemIndex in markItemIndex
     * (all the rest of currPos is valid for the mark position). Hence, to
     * determine if there is a mark, first look at markItemIndex, then at
     * markPos.
     */
    int         markItemIndex;  /* itemIndex, or -1 if not valid */

    /* keep these last in struct for efficiency */
    BTScanPosData currPos;      /* current position data */
    BTScanPosData markPos;      /* marked position, if any */
} BTScanOpaqueData;

typedef BTScanOpaqueData *BTScanOpaque;
    1077                 : 
    1078                 : /*
    1079                 :  * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
    1080                 :  * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
    1081                 :  * index's indoption[] array entry for the index attribute.
    1082                 :  */
    1083                 : #define SK_BT_REQFWD    0x00010000  /* required to continue forward scan */
    1084                 : #define SK_BT_REQBKWD   0x00020000  /* required to continue backward scan */
    1085                 : #define SK_BT_INDOPTION_SHIFT  24   /* must clear the above bits */
    1086                 : #define SK_BT_DESC          (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
    1087                 : #define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
    1088                 : 
/*
 * Parsed btree relation options (the struct behind rd_options for a btree
 * index; built by btoptions(), accessed via the BTGet* macros below).
 */
typedef struct BTOptions
{
    int32       varlena_header_;    /* varlena header (do not touch directly!) */
    int         fillfactor;     /* page fill factor in percent (0..100) */
    float8      vacuum_cleanup_index_scale_factor;  /* deprecated */
    bool        deduplicate_items;  /* Try to deduplicate items? */
} BTOptions;
    1096                 : 
/*
 * Accessors for BTOptions.  Each asserts that the relation really is a
 * btree index, then falls back to a default (BTREE_DEFAULT_FILLFACTOR,
 * resp. true for deduplication) when no reloptions are set.  Note that
 * BTGetFillFactor evaluates its argument more than once.
 */
#define BTGetFillFactor(relation) \
    (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
                 relation->rd_rel->relam == BTREE_AM_OID), \
     (relation)->rd_options ? \
     ((BTOptions *) (relation)->rd_options)->fillfactor : \
     BTREE_DEFAULT_FILLFACTOR)
#define BTGetTargetPageFreeSpace(relation) \
    (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100)
#define BTGetDeduplicateItems(relation) \
    (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
                 relation->rd_rel->relam == BTREE_AM_OID), \
    ((relation)->rd_options ? \
     ((BTOptions *) (relation)->rd_options)->deduplicate_items : true))
    1110                 : 
    1111                 : /*
    1112                 :  * Constant definition for progress reporting.  Phase numbers must match
    1113                 :  * btbuildphasename.
    1114                 :  */
    1115                 : /* PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE is 1 (see progress.h) */
    1116                 : #define PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN       2
    1117                 : #define PROGRESS_BTREE_PHASE_PERFORMSORT_1              3
    1118                 : #define PROGRESS_BTREE_PHASE_PERFORMSORT_2              4
    1119                 : #define PROGRESS_BTREE_PHASE_LEAF_LOAD                  5
    1120                 : 
    1121                 : /*
    1122                 :  * external entry points for btree, in nbtree.c
    1123                 :  */
    1124                 : extern void btbuildempty(Relation index);
    1125                 : extern bool btinsert(Relation rel, Datum *values, bool *isnull,
    1126                 :                      ItemPointer ht_ctid, Relation heapRel,
    1127                 :                      IndexUniqueCheck checkUnique,
    1128                 :                      bool indexUnchanged,
    1129                 :                      struct IndexInfo *indexInfo);
    1130                 : extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys);
    1131                 : extern Size btestimateparallelscan(void);
    1132                 : extern void btinitparallelscan(void *target);
    1133                 : extern bool btgettuple(IndexScanDesc scan, ScanDirection dir);
    1134                 : extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
    1135                 : extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
    1136                 :                      ScanKey orderbys, int norderbys);
    1137                 : extern void btparallelrescan(IndexScanDesc scan);
    1138                 : extern void btendscan(IndexScanDesc scan);
    1139                 : extern void btmarkpos(IndexScanDesc scan);
    1140                 : extern void btrestrpos(IndexScanDesc scan);
    1141                 : extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
    1142                 :                                            IndexBulkDeleteResult *stats,
    1143                 :                                            IndexBulkDeleteCallback callback,
    1144                 :                                            void *callback_state);
    1145                 : extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
    1146                 :                                               IndexBulkDeleteResult *stats);
    1147                 : extern bool btcanreturn(Relation index, int attno);
    1148                 : 
    1149                 : /*
    1150                 :  * prototypes for internal functions in nbtree.c
    1151                 :  */
    1152                 : extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno);
    1153                 : extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page);
    1154                 : extern void _bt_parallel_done(IndexScanDesc scan);
    1155                 : extern void _bt_parallel_advance_array_keys(IndexScanDesc scan);
    1156                 : 
    1157                 : /*
    1158                 :  * prototypes for functions in nbtdedup.c
    1159                 :  */
    1160                 : extern void _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel,
    1161                 :                            IndexTuple newitem, Size newitemsz,
    1162                 :                            bool bottomupdedup);
    1163                 : extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
    1164                 :                                  Size newitemsz);
    1165                 : extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
    1166                 :                                     OffsetNumber baseoff);
    1167                 : extern bool _bt_dedup_save_htid(BTDedupState state, IndexTuple itup);
    1168                 : extern Size _bt_dedup_finish_pending(Page newpage, BTDedupState state);
    1169                 : extern IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids,
    1170                 :                                    int nhtids);
    1171                 : extern void _bt_update_posting(BTVacuumPosting vacposting);
    1172                 : extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting,
    1173                 :                                    int postingoff);
    1174                 : 
    1175                 : /*
    1176                 :  * prototypes for functions in nbtinsert.c
    1177                 :  */
    1178                 : extern bool _bt_doinsert(Relation rel, IndexTuple itup,
    1179                 :                          IndexUniqueCheck checkUnique, bool indexUnchanged,
    1180                 :                          Relation heapRel);
    1181                 : extern void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf,
    1182                 :                              BTStack stack);
    1183                 : extern Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack,
    1184                 :                               BlockNumber child);
    1185                 : 
    1186                 : /*
    1187                 :  * prototypes for functions in nbtsplitloc.c
    1188                 :  */
    1189                 : extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
    1190                 :                                      OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
    1191                 :                                      bool *newitemonleft);
    1192                 : 
    1193                 : /*
    1194                 :  * prototypes for functions in nbtpage.c
    1195                 :  */
    1196                 : extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
    1197                 :                              bool allequalimage);
    1198                 : extern bool _bt_vacuum_needs_cleanup(Relation rel, Relation heaprel);
    1199                 : extern void _bt_set_cleanup_info(Relation rel, Relation heaprel,
    1200                 :                                  BlockNumber num_delpages);
    1201                 : extern void _bt_upgrademetapage(Page page);
    1202                 : extern Buffer _bt_getroot(Relation rel, Relation heaprel, int access);
    1203                 : extern Buffer _bt_gettrueroot(Relation rel, Relation heaprel);
    1204                 : extern int  _bt_getrootheight(Relation rel, Relation heaprel);
    1205                 : extern void _bt_metaversion(Relation rel, Relation heaprel, bool *heapkeyspace,
    1206                 :                             bool *allequalimage);
    1207                 : extern void _bt_checkpage(Relation rel, Buffer buf);
    1208                 : extern Buffer _bt_getbuf(Relation rel, Relation heaprel, BlockNumber blkno,
    1209                 :                          int access);
    1210                 : extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
    1211                 :                                BlockNumber blkno, int access);
    1212                 : extern void _bt_relbuf(Relation rel, Buffer buf);
    1213                 : extern void _bt_lockbuf(Relation rel, Buffer buf, int access);
    1214                 : extern void _bt_unlockbuf(Relation rel, Buffer buf);
    1215                 : extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
    1216                 : extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
    1217                 : extern void _bt_pageinit(Page page, Size size);
    1218                 : extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
    1219                 :                                 OffsetNumber *deletable, int ndeletable,
    1220                 :                                 BTVacuumPosting *updatable, int nupdatable);
    1221                 : extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
    1222                 :                                       Relation heapRel,
    1223                 :                                       TM_IndexDeleteOp *delstate);
    1224                 : extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate);
    1225                 : extern void _bt_pendingfsm_init(Relation rel, BTVacState *vstate,
    1226                 :                                 bool cleanuponly);
    1227                 : extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate);
    1228                 : 
    1229                 : /*
    1230                 :  * prototypes for functions in nbtsearch.c
    1231                 :  */
    1232                 : extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key,
    1233                 :                           Buffer *bufP, int access, Snapshot snapshot);
    1234                 : extern Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
    1235                 :                             Buffer buf, bool forupdate, BTStack stack,
    1236                 :                             int access, Snapshot snapshot);
    1237                 : extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
    1238                 : extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
    1239                 : extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
    1240                 : extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
    1241                 : extern Buffer _bt_get_endpoint(Relation rel, Relation heaprel, uint32 level,
    1242                 :                                bool rightmost, Snapshot snapshot);
    1243                 : 
    1244                 : /*
    1245                 :  * prototypes for functions in nbtutils.c
    1246                 :  */
    1247                 : extern BTScanInsert _bt_mkscankey(Relation rel, Relation heaprel, IndexTuple itup);
    1248                 : extern void _bt_freestack(BTStack stack);
    1249                 : extern void _bt_preprocess_array_keys(IndexScanDesc scan);
    1250                 : extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
    1251                 : extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
    1252                 : extern void _bt_mark_array_keys(IndexScanDesc scan);
    1253                 : extern void _bt_restore_array_keys(IndexScanDesc scan);
    1254                 : extern void _bt_preprocess_keys(IndexScanDesc scan);
    1255                 : extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
    1256                 :                           int tupnatts, ScanDirection dir, bool *continuescan);
    1257                 : extern void _bt_killitems(IndexScanDesc scan);
    1258                 : extern BTCycleId _bt_vacuum_cycleid(Relation rel);
    1259                 : extern BTCycleId _bt_start_vacuum(Relation rel);
    1260                 : extern void _bt_end_vacuum(Relation rel);
    1261                 : extern void _bt_end_vacuum_callback(int code, Datum arg);
    1262                 : extern Size BTreeShmemSize(void);
    1263                 : extern void BTreeShmemInit(void);
    1264                 : extern bytea *btoptions(Datum reloptions, bool validate);
    1265                 : extern bool btproperty(Oid index_oid, int attno,
    1266                 :                        IndexAMProperty prop, const char *propname,
    1267                 :                        bool *res, bool *isnull);
    1268                 : extern char *btbuildphasename(int64 phasenum);
    1269                 : extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft,
    1270                 :                                IndexTuple firstright, BTScanInsert itup_key);
    1271                 : extern int  _bt_keep_natts_fast(Relation rel, IndexTuple lastleft,
    1272                 :                                 IndexTuple firstright);
    1273                 : extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
    1274                 :                             OffsetNumber offnum);
    1275                 : extern void _bt_check_third_page(Relation rel, Relation heap,
    1276                 :                                  bool needheaptidspace, Page page, IndexTuple newtup);
    1277                 : extern bool _bt_allequalimage(Relation rel, bool debugmessage);
    1278                 : 
    1279                 : /*
    1280                 :  * prototypes for functions in nbtvalidate.c
    1281                 :  */
    1282                 : extern bool btvalidate(Oid opclassoid);
    1283                 : extern void btadjustmembers(Oid opfamilyoid,
    1284                 :                             Oid opclassoid,
    1285                 :                             List *operators,
    1286                 :                             List *functions);
    1287                 : 
    1288                 : /*
    1289                 :  * prototypes for functions in nbtsort.c
    1290                 :  */
    1291                 : extern IndexBuildResult *btbuild(Relation heap, Relation index,
    1292                 :                                  struct IndexInfo *indexInfo);
    1293                 : extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
    1294                 : 
    1295                 : #endif                          /* NBTREE_H */
        

Generated by: LCOV version v1.16-55-g56c0a2a