Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pg_stat_statements.c
4 : * Track statement planning and execution times as well as resource
5 : * usage across a whole database cluster.
6 : *
7 : * Execution costs are totaled for each distinct source query, and kept in
8 : * a shared hashtable. (We track only as many distinct queries as will fit
9 : * in the designated amount of shared memory.)
10 : *
11 : * Starting in Postgres 9.2, this module normalized query entries. As of
12 : * Postgres 14, the normalization is done by the core if compute_query_id is
13 : * enabled, or optionally by third-party modules.
14 : *
15 : * To facilitate presenting entries to users, we create "representative" query
16 : * strings in which constants are replaced with parameter symbols ($n), to
17 : * make it clearer what a normalized entry can represent. To save on shared
18 : * memory, and to avoid having to truncate oversized query strings, we store
19 : * these strings in a temporary external query-texts file. Offsets into this
20 : * file are kept in shared memory.
21 : *
22 : * Note about locking issues: to create or delete an entry in the shared
23 : * hashtable, one must hold pgss->lock exclusively. Modifying any field
24 : * in an entry except the counters requires the same. To look up an entry,
25 : * one must hold the lock shared. To read or update the counters within
26 : * an entry, one must hold the lock shared or exclusive (so the entry doesn't
27 : * disappear!) and also take the entry's mutex spinlock.
28 : * The shared state variable pgss->extent (the next free spot in the external
29 : * query-text file) should be accessed only while holding either the
30 : * pgss->mutex spinlock, or exclusive lock on pgss->lock. We use the mutex to
31 : * allow reserving file space while holding only shared lock on pgss->lock.
32 : * Rewriting the entire external query-text file, eg for garbage collection,
33 : * requires holding pgss->lock exclusively; this allows individual entries
34 : * in the file to be read or written while holding only shared lock.
35 : *
36 : *
37 : * Copyright (c) 2008-2023, PostgreSQL Global Development Group
38 : *
39 : * IDENTIFICATION
40 : * contrib/pg_stat_statements/pg_stat_statements.c
41 : *
42 : *-------------------------------------------------------------------------
43 : */
44 : #include "postgres.h"
45 :
46 : #include <math.h>
47 : #include <sys/stat.h>
48 : #include <unistd.h>
49 :
50 : #include "access/parallel.h"
51 : #include "catalog/pg_authid.h"
52 : #include "common/hashfn.h"
53 : #include "executor/instrument.h"
54 : #include "funcapi.h"
55 : #include "jit/jit.h"
56 : #include "mb/pg_wchar.h"
57 : #include "miscadmin.h"
58 : #include "nodes/queryjumble.h"
59 : #include "optimizer/planner.h"
60 : #include "parser/analyze.h"
61 : #include "parser/parsetree.h"
62 : #include "parser/scanner.h"
63 : #include "parser/scansup.h"
64 : #include "pgstat.h"
65 : #include "storage/fd.h"
66 : #include "storage/ipc.h"
67 : #include "storage/lwlock.h"
68 : #include "storage/shmem.h"
69 : #include "storage/spin.h"
70 : #include "tcop/utility.h"
71 : #include "utils/acl.h"
72 : #include "utils/builtins.h"
73 : #include "utils/memutils.h"
74 : #include "utils/timestamp.h"
75 :
5208 tgl 76 CBC 4 : PG_MODULE_MAGIC;
77 :
78 : /* Location of permanent stats file (valid when database is shut down) */
79 : #define PGSS_DUMP_FILE PGSTAT_STAT_PERMANENT_DIRECTORY "/pg_stat_statements.stat"
80 :
81 : /*
82 : * Location of external query text file. pgss_shmem_startup() unlinks and
 * recreates it, and pgss_shmem_shutdown() unlinks it again, so it is only
 * expected to exist while the server is running.
83 : */
84 : #define PGSS_TEXT_FILE PG_STAT_TMP_DIR "/pgss_query_texts.stat"
85 :
86 : /* Magic number identifying the stats file format; it is the first word
 * written to PGSS_DUMP_FILE and is compared when the file is loaded */
87 : static const uint32 PGSS_FILE_HEADER = 0x20220408;
88 :
89 : /* PostgreSQL major version number, changes in which invalidate all entries;
 * stored right after the magic number in PGSS_DUMP_FILE */
90 : static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100;
91 :
92 : /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */
93 : #define USAGE_EXEC(duration) (1.0)
94 : #define USAGE_INIT (1.0) /* including initial planning */
95 : #define ASSUMED_MEDIAN_INIT (10.0) /* initial assumed median usage */
96 : #define ASSUMED_LENGTH_INIT 1024 /* initial assumed mean query length */
97 : #define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */
98 : #define STICKY_DECREASE_FACTOR (0.50) /* factor for sticky entries */
99 : #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */
/*
 * An entry is "sticky" while it has recorded neither a planning call nor an
 * execution call. The argument is a Counters value (not a pointer); it is
 * expanded without parentheses, so pass a plain lvalue expression.
 */
100 : #define IS_STICKY(c) ((c.calls[PGSS_PLAN] + c.calls[PGSS_EXEC]) == 0)
101 :
102 : /*
103 : * Utility statements that pgss_ProcessUtility and pgss_post_parse_analyze
104 : * ignores.
105 : */
106 : #define PGSS_HANDLED_UTILITY(n) (!IsA(n, ExecuteStmt) && \
107 : !IsA(n, PrepareStmt) && \
108 : !IsA(n, DeallocateStmt))
109 :
110 : /*
111 : * Extension version number, for supporting older extension versions' objects
 *
 * A value of this type is passed as api_version to
 * pg_stat_statements_internal(). Members are declared in ascending version
 * order starting at zero.
112 : */
113 : typedef enum pgssVersion
114 : {
115 : PGSS_V1_0 = 0,
116 : PGSS_V1_1,
117 : PGSS_V1_2,
118 : PGSS_V1_3,
119 : PGSS_V1_8,
120 : PGSS_V1_9,
121 : PGSS_V1_10
122 : } pgssVersion;
123 :
/*
 * Kind of statistics handed to pgss_store(): planning or execution.
 * PGSS_INVALID is what pgss_post_parse_analyze() passes, together with zeroed
 * counters, when it calls pgss_store() ahead of any planning or execution.
 */
124 : typedef enum pgssStoreKind
125 : {
126 : PGSS_INVALID = -1,
127 :
128 : /*
129 : * PGSS_PLAN and PGSS_EXEC must be respectively 0 and 1 as they're used to
130 : * reference the underlying values in the arrays in the Counters struct,
131 : * and this order is required in pg_stat_statements_internal().
132 : */
133 : PGSS_PLAN = 0,
134 : PGSS_EXEC,
135 :
136 : PGSS_NUMKIND /* Must be last value of this enum */
137 : } pgssStoreKind;
138 :
139 : /*
140 : * Hashtable key that defines the identity of a hashtable entry. We separate
141 : * queries by user and by database even if they are otherwise identical.
 * The toplevel flag is part of the key as well, so a query run at top level
 * and the same query run nested get separate entries.
142 : *
143 : * If you add a new key to this struct, make sure to teach pgss_store() to
144 : * zero the padding bytes. Otherwise, things will break, because pgss_hash is
145 : * created using HASH_BLOBS, and thus tag_hash is used to hash this.
146 :
147 : */
148 : typedef struct pgssHashKey
149 : {
150 : Oid userid; /* user OID */
151 : Oid dbid; /* database OID */
152 : uint64 queryid; /* query identifier */
153 : bool toplevel; /* query executed at top level */
154 : } pgssHashKey;
155 :
156 : /*
157 : * The actual stats counters kept within pgssEntry.
 *
 * Members declared as arrays of PGSS_NUMKIND elements are indexed by
 * pgssStoreKind: [PGSS_PLAN] holds the planning figure and [PGSS_EXEC] the
 * execution figure. All other members are single totals.
158 : */
159 : typedef struct Counters
160 : {
161 : int64 calls[PGSS_NUMKIND]; /* # of times planned/executed */
162 : double total_time[PGSS_NUMKIND]; /* total planning/execution time,
163 : * in msec */
164 : double min_time[PGSS_NUMKIND]; /* minimum planning/execution time in
165 : * msec */
166 : double max_time[PGSS_NUMKIND]; /* maximum planning/execution time in
167 : * msec */
168 : double mean_time[PGSS_NUMKIND]; /* mean planning/execution time in
169 : * msec */
170 : double sum_var_time[PGSS_NUMKIND]; /* sum of variances in
171 : * planning/execution time in msec */
172 : int64 rows; /* total # of retrieved or affected rows */
173 : int64 shared_blks_hit; /* # of shared buffer hits */
174 : int64 shared_blks_read; /* # of shared disk blocks read */
175 : int64 shared_blks_dirtied; /* # of shared disk blocks dirtied */
176 : int64 shared_blks_written; /* # of shared disk blocks written */
177 : int64 local_blks_hit; /* # of local buffer hits */
178 : int64 local_blks_read; /* # of local disk blocks read */
179 : int64 local_blks_dirtied; /* # of local disk blocks dirtied */
180 : int64 local_blks_written; /* # of local disk blocks written */
181 : int64 temp_blks_read; /* # of temp blocks read */
182 : int64 temp_blks_written; /* # of temp blocks written */
183 : double blk_read_time; /* time spent reading blocks, in msec */
184 : double blk_write_time; /* time spent writing blocks, in msec */
185 : double temp_blk_read_time; /* time spent reading temp blocks, in msec */
186 : double temp_blk_write_time; /* time spent writing temp blocks, in
187 : * msec */
188 : double usage; /* usage factor */
189 : int64 wal_records; /* # of WAL records generated */
190 : int64 wal_fpi; /* # of WAL full page images generated */
191 : uint64 wal_bytes; /* total amount of WAL generated in bytes */
192 : int64 jit_functions; /* total number of JIT functions emitted */
193 : double jit_generation_time; /* total time to generate jit code */
194 : int64 jit_inlining_count; /* number of times inlining time has been
195 : * > 0 */
196 : double jit_inlining_time; /* total time to inline jit code */
197 : int64 jit_optimization_count; /* number of times optimization time
198 : * has been > 0 */
199 : double jit_optimization_time; /* total time to optimize jit code */
200 : int64 jit_emission_count; /* number of times emission time has been
201 : * > 0 */
202 : double jit_emission_time; /* total time to emit jit code */
203 : } Counters;
204 :
205 : /*
206 : * Global statistics for pg_stat_statements
 *
 * Kept inside pgssSharedState. pgss_shmem_shutdown() writes this struct
 * after the per-entry data in the dump file and pgss_shmem_startup() reads
 * it back from the same position.
207 : */
208 : typedef struct pgssGlobalStats
209 : {
210 : int64 dealloc; /* # of times entries were deallocated */
211 : TimestampTz stats_reset; /* timestamp with all stats reset; set to the
 * current time when the shared state is
 * first initialized */
212 : } pgssGlobalStats;
213 :
214 : /*
215 : * Statistics per statement
216 : *
217 : * Note: in event of a failure in garbage collection of the query text file,
218 : * we reset query_offset to zero and query_len to -1. This will be seen as
219 : * an invalid state by qtext_fetch().
 *
 * Note: pgss_shmem_shutdown() writes each entry to the dump file as a raw
 * sizeof(pgssEntry) image, and pgss_shmem_startup() reads it back the same
 * way, so a change to this layout changes the dump file format.
220 : */
221 : typedef struct pgssEntry
222 : {
223 : pgssHashKey key; /* hash key of entry - MUST BE FIRST */
224 : Counters counters; /* the statistics for this query */
225 : Size query_offset; /* query text offset in external file */
226 : int query_len; /* # of valid bytes in query string, or -1 */
227 : int encoding; /* query text encoding */
228 : slock_t mutex; /* protects the counters only */
229 : } pgssEntry;
230 :
231 : /*
232 : * Global shared state
 *
 * A single instance is obtained with ShmemInitStruct() in
 * pgss_shmem_startup(), which also gives every member its initial value
 * the first time the struct is created.
233 : */
234 : typedef struct pgssSharedState
235 : {
236 : LWLock *lock; /* protects hashtable search/modification */
237 : double cur_median_usage; /* current median usage in hashtable */
238 : Size mean_query_len; /* current mean entry text length */
239 : slock_t mutex; /* protects following fields only: */
240 : Size extent; /* current extent of query file */
241 : int n_writers; /* number of active writers to query file */
242 : int gc_count; /* query file garbage collection cycle count */
243 : pgssGlobalStats stats; /* global statistics for pgss */
244 : } pgssSharedState;
245 :
246 : /*---- Local variables ----*/
247 :
248 : /* Current nesting depth of ExecutorRun+ProcessUtility calls */
249 : static int exec_nested_level = 0;
250 :
251 : /* Current nesting depth of planner calls */
252 : static int plan_nested_level = 0;
253 :
254 : /* Saved hook values in case of unload; _PG_init() fills these in before
 * installing our own hooks, and our hooks call them when non-NULL */
255 : static shmem_request_hook_type prev_shmem_request_hook = NULL;
256 : static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
257 : static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL;
258 : static planner_hook_type prev_planner_hook = NULL;
259 : static ExecutorStart_hook_type prev_ExecutorStart = NULL;
260 : static ExecutorRun_hook_type prev_ExecutorRun = NULL;
261 : static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
262 : static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
263 : static ProcessUtility_hook_type prev_ProcessUtility = NULL;
264 :
265 : /* Links to shared memory state; both are assigned in pgss_shmem_startup()
 * and are tested against NULL before use elsewhere in this file */
266 : static pgssSharedState *pgss = NULL;
267 : static HTAB *pgss_hash = NULL;
268 :
269 : /*---- GUC variables ----*/
270 :
/* Allowed values of pg_stat_statements.track (stored in pgss_track) */
271 : typedef enum
272 : {
273 : PGSS_TRACK_NONE, /* track no statements */
274 : PGSS_TRACK_TOP, /* only top level statements */
275 : PGSS_TRACK_ALL /* all statements, including nested ones */
276 : } PGSSTrackLevel;
277 :
/* Name-to-value table handed to DefineCustomEnumVariable() in _PG_init();
 * the all-NULL row terminates the list */
278 : static const struct config_enum_entry track_options[] =
279 : {
280 : {"none", PGSS_TRACK_NONE, false},
281 : {"top", PGSS_TRACK_TOP, false},
282 : {"all", PGSS_TRACK_ALL, false},
283 : {NULL, 0, false}
284 : };
285 :
/* Backing variables for the GUCs; the initializers below match the boot
 * values passed to the DefineCustom*Variable() calls in _PG_init() */
286 : static int pgss_max = 5000; /* max # statements to track */
287 : static int pgss_track = PGSS_TRACK_TOP; /* tracking level */
288 : static bool pgss_track_utility = true; /* whether to track utility commands */
289 : static bool pgss_track_planning = false; /* whether to track planning
290 : * duration */
291 : static bool pgss_save = true; /* whether to save stats across shutdown */
292 :
293 :
/*
 * True when a statement at nesting depth "level" should be tracked: never
 * in a parallel worker; otherwise always when pgss_track is PGSS_TRACK_ALL,
 * and only at depth 0 when it is PGSS_TRACK_TOP.
 */
294 : #define pgss_enabled(level) \
295 : (!IsParallelWorker() && \
296 : (pgss_track == PGSS_TRACK_ALL || \
297 : (pgss_track == PGSS_TRACK_TOP && (level) == 0)))
298 :
/*
 * Increment pgss->gc_count while holding the pgss->mutex spinlock. The
 * shared state is accessed through a volatile-qualified pointer.
 */
299 : #define record_gc_qtexts() \
300 : do { \
301 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; \
302 : SpinLockAcquire(&s->mutex); \
303 : s->gc_count++; \
304 : SpinLockRelease(&s->mutex); \
305 : } while(0)
306 :
307 : /*---- Function declarations ----*/
308 :
/* SQL-callable entry points, one per supported extension API version */
309 5 : PG_FUNCTION_INFO_V1(pg_stat_statements_reset);
1549 akapila 310 GBC 12 : PG_FUNCTION_INFO_V1(pg_stat_statements_reset_1_7);
3359 tgl 311 LBC 0 : PG_FUNCTION_INFO_V1(pg_stat_statements_1_2);
2935 andrew 312 CBC 5 : PG_FUNCTION_INFO_V1(pg_stat_statements_1_3);
1102 fujii 313 4 : PG_FUNCTION_INFO_V1(pg_stat_statements_1_8);
731 magnus 314 5 : PG_FUNCTION_INFO_V1(pg_stat_statements_1_9);
366 michael 315 GBC 14 : PG_FUNCTION_INFO_V1(pg_stat_statements_1_10);
5208 tgl 316 LBC 0 : PG_FUNCTION_INFO_V1(pg_stat_statements);
864 fujii 317 GIC 5 : PG_FUNCTION_INFO_V1(pg_stat_statements_info);
318 :
/* Hook implementations installed by _PG_init(), plus the shutdown callback
 * that pgss_shmem_startup() registers with on_shmem_exit() */
319 : static void pgss_shmem_request(void);
320 : static void pgss_shmem_startup(void);
321 : static void pgss_shmem_shutdown(int code, Datum arg);
322 : static void pgss_post_parse_analyze(ParseState *pstate, Query *query,
323 : JumbleState *jstate);
324 : static PlannedStmt *pgss_planner(Query *parse,
325 : const char *query_string,
326 : int cursorOptions,
327 : ParamListInfo boundParams);
328 : static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags);
329 : static void pgss_ExecutorRun(QueryDesc *queryDesc,
330 : ScanDirection direction,
331 : uint64 count, bool execute_once);
332 : static void pgss_ExecutorFinish(QueryDesc *queryDesc);
333 : static void pgss_ExecutorEnd(QueryDesc *queryDesc);
334 : static void pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
335 : bool readOnlyTree,
336 : ProcessUtilityContext context, ParamListInfo params,
337 : QueryEnvironment *queryEnv,
338 : DestReceiver *dest, QueryCompletion *qc);
/* Internal helpers: statistics storage, hash entry management and the
 * external query-text file */
339 : static void pgss_store(const char *query, uint64 queryId,
340 : int query_location, int query_len,
341 : pgssStoreKind kind,
342 : double total_time, uint64 rows,
343 : const BufferUsage *bufusage,
344 : const WalUsage *walusage,
345 : const struct JitInstrumentation *jitusage,
346 : JumbleState *jstate);
347 : static void pg_stat_statements_internal(FunctionCallInfo fcinfo,
348 : pgssVersion api_version,
349 : bool showtext);
350 : static Size pgss_memsize(void);
351 : static pgssEntry *entry_alloc(pgssHashKey *key, Size query_offset, int query_len,
352 : int encoding, bool sticky);
353 : static void entry_dealloc(void);
354 : static bool qtext_store(const char *query, int query_len,
355 : Size *query_offset, int *gc_count);
356 : static char *qtext_load_file(Size *buffer_size);
357 : static char *qtext_fetch(Size query_offset, int query_len,
358 : char *buffer, Size buffer_size);
359 : static bool need_gc_qtexts(void);
360 : static void gc_qtexts(void);
361 : static void entry_reset(Oid userid, Oid dbid, uint64 queryid);
362 : static char *generate_normalized_query(JumbleState *jstate, const char *query,
363 : int query_loc, int *query_len_p);
364 : static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
365 : int query_loc);
366 : static int comp_location(const void *a, const void *b);
367 :
368 :
369 : /*
370 : * Module load callback
 *
 * Returns immediately unless the library is being loaded through
 * shared_preload_libraries. Otherwise it, in order: requests query ID
 * computation, defines the five pg_stat_statements.* GUCs, reserves the
 * "pg_stat_statements" GUC prefix, and saves each existing hook value before
 * installing this module's hook in its place.
371 : */
5208 tgl 372 ECB : void
5208 tgl 373 GIC 4 : _PG_init(void)
374 : {
375 : /*
376 : * In order to create our shared memory area, we have to be loaded via
377 : * shared_preload_libraries. If not, fall out without hooking into any of
378 : * the main system. (We don't throw error here because it seems useful to
379 : * allow the pg_stat_statements functions to be created even when the
380 : * module isn't active. The functions must protect themselves against
381 : * being called then, however.)
5208 tgl 382 ECB : */
5208 tgl 383 CBC 4 : if (!process_shared_preload_libraries_in_progress)
5208 tgl 384 GIC 1 : return;
385 :
386 : /*
387 : * Inform the postmaster that we want to enable query_id calculation if
388 : * compute_query_id is set to auto.
694 alvherre 389 ECB : */
694 alvherre 390 GIC 3 : EnableQueryId();
391 :
392 : /*
393 : * Define (or redefine) custom GUC variables.
 *
 * pg_stat_statements.max is PGC_POSTMASTER; its value (pgss_max) is what
 * pgss_shmem_startup() passes to ShmemInitHash() as the table size. The
 * track* settings are PGC_SUSET and save is PGC_SIGHUP.
5208 tgl 394 ECB : */
5208 tgl 395 GIC 3 : DefineCustomIntVariable("pg_stat_statements.max",
396 : "Sets the maximum number of statements tracked by pg_stat_statements.",
397 : NULL,
398 : &pgss_max,
399 : 5000,
400 : 100,
401 : INT_MAX / 2,
402 : PGC_POSTMASTER,
403 : 0,
404 : NULL,
405 : NULL,
406 : NULL);
5208 tgl 407 ECB :
5208 tgl 408 GIC 3 : DefineCustomEnumVariable("pg_stat_statements.track",
409 : "Selects which statements are tracked by pg_stat_statements.",
410 : NULL,
411 : &pgss_track,
412 : PGSS_TRACK_TOP,
413 : track_options,
414 : PGC_SUSET,
415 : 0,
416 : NULL,
417 : NULL,
418 : NULL);
5208 tgl 419 ECB :
4863 tgl 420 GIC 3 : DefineCustomBoolVariable("pg_stat_statements.track_utility",
421 : "Selects whether utility commands are tracked by pg_stat_statements.",
422 : NULL,
423 : &pgss_track_utility,
424 : true,
425 : PGC_SUSET,
426 : 0,
427 : NULL,
428 : NULL,
429 : NULL);
4863 tgl 430 ECB :
1102 fujii 431 GIC 3 : DefineCustomBoolVariable("pg_stat_statements.track_planning",
432 : "Selects whether planning duration is tracked by pg_stat_statements.",
433 : NULL,
434 : &pgss_track_planning,
435 : false,
436 : PGC_SUSET,
437 : 0,
438 : NULL,
439 : NULL,
440 : NULL);
1102 fujii 441 ECB :
5208 tgl 442 GIC 3 : DefineCustomBoolVariable("pg_stat_statements.save",
443 : "Save pg_stat_statements statistics across server shutdowns.",
444 : NULL,
445 : &pgss_save,
446 : true,
447 : PGC_SIGHUP,
448 : 0,
449 : NULL,
450 : NULL,
451 : NULL);
5208 tgl 452 ECB :
412 tgl 453 GIC 3 : MarkGUCPrefixReserved("pg_stat_statements");
454 :
455 : /*
456 : * Install hooks.
 *
 * Each pair below first remembers whatever hook was already installed and
 * then points the global hook at our implementation.
5208 tgl 457 ECB : */
331 rhaas 458 CBC 3 : prev_shmem_request_hook = shmem_request_hook;
459 3 : shmem_request_hook = pgss_shmem_request;
5208 tgl 460 3 : prev_shmem_startup_hook = shmem_startup_hook;
461 3 : shmem_startup_hook = pgss_shmem_startup;
4029 462 3 : prev_post_parse_analyze_hook = post_parse_analyze_hook;
463 3 : post_parse_analyze_hook = pgss_post_parse_analyze;
1102 fujii 464 3 : prev_planner_hook = planner_hook;
465 3 : planner_hook = pgss_planner;
5208 tgl 466 3 : prev_ExecutorStart = ExecutorStart_hook;
467 3 : ExecutorStart_hook = pgss_ExecutorStart;
468 3 : prev_ExecutorRun = ExecutorRun_hook;
469 3 : ExecutorRun_hook = pgss_ExecutorRun;
4424 470 3 : prev_ExecutorFinish = ExecutorFinish_hook;
471 3 : ExecutorFinish_hook = pgss_ExecutorFinish;
5208 472 3 : prev_ExecutorEnd = ExecutorEnd_hook;
473 3 : ExecutorEnd_hook = pgss_ExecutorEnd;
4863 474 3 : prev_ProcessUtility = ProcessUtility_hook;
4863 tgl 475 GIC 3 : ProcessUtility_hook = pgss_ProcessUtility;
476 : }
477 :
478 : /*
479 : * shmem_request hook: request additional shared resources. We'll allocate or
480 : * attach to the shared resources in pgss_shmem_startup().
 *
 * Calls any previously installed shmem_request hook first, then asks for
 * pgss_memsize() bytes of add-in shared memory and one LWLock in the
 * "pg_stat_statements" named tranche.
481 : */
331 rhaas 482 ECB : static void
331 rhaas 483 GIC 3 : pgss_shmem_request(void)
331 rhaas 484 ECB : {
331 rhaas 485 GBC 3 : if (prev_shmem_request_hook)
331 rhaas 486 UIC 0 : prev_shmem_request_hook();
331 rhaas 487 ECB :
331 rhaas 488 CBC 3 : RequestAddinShmemSpace(pgss_memsize());
489 3 : RequestNamedLWLockTranche("pg_stat_statements", 1);
331 rhaas 490 GIC 3 : }
491 :
492 : /*
493 : * shmem_startup hook: allocate or attach to shared memory,
494 : * then load any pre-existing statistics from file.
495 : * Also create and load the query-texts file, which is expected to exist
496 : * (even if empty) while the module is enabled.
 *
 * Layout of PGSS_DUMP_FILE as read here (and as written by
 * pgss_shmem_shutdown()):
 *   uint32 header         must equal PGSS_FILE_HEADER
 *   int32  pgver          must equal PGSS_PG_MAJOR_VERSION
 *   int32  num            number of entries that follow
 *   num times:  a pgssEntry image, then query_len + 1 bytes of query text
 *   pgssGlobalStats       global statistics
 *
 * Any read, data or write problem is reported at LOG level only; the dump
 * file is then unlinked and the function returns with whatever entries had
 * been loaded so far.
497 : */
5208 tgl 498 ECB : static void
5208 tgl 499 GIC 3 : pgss_shmem_startup(void)
500 : {
501 : bool found;
5208 tgl 502 ECB : HASHCTL info;
3359 tgl 503 CBC 3 : FILE *file = NULL;
3359 tgl 504 GIC 3 : FILE *qfile = NULL;
505 : uint32 header;
506 : int32 num;
507 : int32 pgver;
508 : int32 i;
5208 tgl 509 ECB : int buffer_size;
5208 tgl 510 GIC 3 : char *buffer = NULL;
5208 tgl 511 ECB :
5208 tgl 512 GBC 3 : if (prev_shmem_startup_hook)
5208 tgl 513 UIC 0 : prev_shmem_startup_hook();
514 :
5208 tgl 515 ECB : /* reset in case this is a restart within the postmaster */
5208 tgl 516 CBC 3 : pgss = NULL;
5208 tgl 517 GIC 3 : pgss_hash = NULL;
518 :
519 : /*
520 : * Create or attach to the shared memory state, including hash table
5208 tgl 521 ECB : */
5208 tgl 522 GIC 3 : LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
5208 tgl 523 ECB :
5208 tgl 524 GIC 3 : pgss = ShmemInitStruct("pg_stat_statements",
525 : sizeof(pgssSharedState),
526 : &found);
5208 tgl 527 ECB :
5208 tgl 528 GIC 3 : if (!found)
529 : {
5208 tgl 530 ECB : /* First time through ... */
2621 rhaas 531 CBC 3 : pgss->lock = &(GetNamedLWLockTranche("pg_stat_statements"))->lock;
4018 tgl 532 3 : pgss->cur_median_usage = ASSUMED_MEDIAN_INIT;
3359 533 3 : pgss->mean_query_len = ASSUMED_LENGTH_INIT;
534 3 : SpinLockInit(&pgss->mutex);
535 3 : pgss->extent = 0;
536 3 : pgss->n_writers = 0;
537 3 : pgss->gc_count = 0;
864 fujii 538 3 : pgss->stats.dealloc = 0;
842 fujii 539 GIC 3 : pgss->stats.stats_reset = GetCurrentTimestamp();
540 : }
5208 tgl 541 ECB :
/* pgss_max is passed as both size arguments of ShmemInitHash() */
5208 tgl 542 CBC 3 : info.keysize = sizeof(pgssHashKey);
3359 543 3 : info.entrysize = sizeof(pgssEntry);
5208 tgl 544 GIC 3 : pgss_hash = ShmemInitHash("pg_stat_statements hash",
545 : pgss_max, pgss_max,
546 : &info,
547 : HASH_ELEM | HASH_BLOBS);
5208 tgl 548 ECB :
5208 tgl 549 GIC 3 : LWLockRelease(AddinShmemInitLock);
550 :
551 : /*
552 : * If we're in the postmaster (or a standalone backend...), set up a shmem
553 : * exit hook to dump the statistics to disk.
5208 tgl 554 ECB : */
5208 tgl 555 CBC 3 : if (!IsUnderPostmaster)
5208 tgl 556 GIC 3 : on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
557 :
558 : /*
559 : * Done if some other process already completed our initialization.
5208 tgl 560 ECB : */
3359 tgl 561 CBC 3 : if (found)
5208 tgl 562 GIC 3 : return;
563 :
564 : /*
565 : * Note: we don't bother with locks here, because there should be no other
566 : * processes running when this code is reached.
567 : */
568 :
3359 tgl 569 ECB : /* Unlink query text file possibly left over from crash */
3359 tgl 570 GIC 3 : unlink(PGSS_TEXT_FILE);
571 :
3359 tgl 572 ECB : /* Allocate new query text temp file */
3359 tgl 573 CBC 3 : qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
3359 tgl 574 GBC 3 : if (qfile == NULL)
3359 tgl 575 UIC 0 : goto write_error;
576 :
577 : /*
578 : * If we were told not to load old statistics, we're done. (Note we do
579 : * not try to unlink any old dump file in this case. This seems a bit
580 : * questionable but it's the historical behavior.)
3359 tgl 581 ECB : */
3359 tgl 582 GIC 3 : if (!pgss_save)
3359 tgl 583 EUB : {
3359 tgl 584 UBC 0 : FreeFile(qfile);
3359 tgl 585 UIC 0 : return;
586 : }
587 :
588 : /*
589 : * Attempt to load old statistics from the dump file.
3359 tgl 590 ECB : */
5208 tgl 591 CBC 3 : file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R);
5208 tgl 592 GIC 3 : if (file == NULL)
5208 tgl 593 ECB : {
3359 tgl 594 GBC 3 : if (errno != ENOENT)
3359 tgl 595 UIC 0 : goto read_error;
3359 tgl 596 ECB : /* No existing persisted stats file, so we're done */
3359 tgl 597 CBC 3 : FreeFile(qfile);
3359 tgl 598 GIC 3 : return;
599 : }
5208 tgl 600 EUB :
/* Scratch buffer for one query text; grown on demand in the loop below */
3359 tgl 601 UBC 0 : buffer_size = 2048;
5208 tgl 602 UIC 0 : buffer = (char *) palloc(buffer_size);
5208 tgl 603 EUB :
5208 tgl 604 UBC 0 : if (fread(&header, sizeof(uint32), 1, file) != 1 ||
3409 fujii 605 0 : fread(&pgver, sizeof(uint32), 1, file) != 1 ||
5208 tgl 606 0 : fread(&num, sizeof(int32), 1, file) != 1)
3359 tgl 607 UIC 0 : goto read_error;
3359 tgl 608 EUB :
3359 tgl 609 UBC 0 : if (header != PGSS_FILE_HEADER ||
610 0 : pgver != PGSS_PG_MAJOR_VERSION)
3359 tgl 611 UIC 0 : goto data_error;
5208 tgl 612 EUB :
5208 tgl 613 UIC 0 : for (i = 0; i < num; i++)
614 : {
615 : pgssEntry temp;
616 : pgssEntry *entry;
617 : Size query_offset;
5208 tgl 618 EUB :
3359 tgl 619 UBC 0 : if (fread(&temp, sizeof(pgssEntry), 1, file) != 1)
3359 tgl 620 UIC 0 : goto read_error;
621 :
5208 tgl 622 EUB : /* Encoding is the only field we can easily sanity-check */
3359 tgl 623 UBC 0 : if (!PG_VALID_BE_ENCODING(temp.encoding))
3359 tgl 624 UIC 0 : goto data_error;
625 :
/*
 * NOTE(review): temp.query_len comes straight from the file and is not
 * checked for being negative here. With query_len == -1 the fread below
 * requests zero bytes and the terminator store would land at buffer[-1].
 * Confirm whether a corrupt dump file needs to be guarded against.
 */
3359 tgl 626 EUB : /* Resize buffer as needed */
4029 tgl 627 UIC 0 : if (temp.query_len >= buffer_size)
5208 tgl 628 EUB : {
3359 tgl 629 UBC 0 : buffer_size = Max(buffer_size * 2, temp.query_len + 1);
3359 tgl 630 UIC 0 : buffer = repalloc(buffer, buffer_size);
631 : }
5208 tgl 632 EUB :
3359 tgl 633 UBC 0 : if (fread(buffer, 1, temp.query_len + 1, file) != temp.query_len + 1)
3359 tgl 634 UIC 0 : goto read_error;
635 :
3359 tgl 636 EUB : /* Should have a trailing null, but let's make sure */
4029 tgl 637 UIC 0 : buffer[temp.query_len] = '\0';
638 :
4029 tgl 639 EUB : /* Skip loading "sticky" entries */
1102 fujii 640 UBC 0 : if (IS_STICKY(temp.counters))
4029 tgl 641 UIC 0 : continue;
642 :
3359 tgl 643 EUB : /* Store the query text */
/* The text goes at the current end of the new text file; extent then
 * advances past the text and its terminating null */
3359 tgl 644 UBC 0 : query_offset = pgss->extent;
645 0 : if (fwrite(buffer, 1, temp.query_len + 1, qfile) != temp.query_len + 1)
646 0 : goto write_error;
3359 tgl 647 UIC 0 : pgss->extent += temp.query_len + 1;
648 :
5208 tgl 649 EUB : /* make the hashtable entry (discards old entries if too many) */
3359 tgl 650 UIC 0 : entry = entry_alloc(&temp.key, query_offset, temp.query_len,
651 : temp.encoding,
652 : false);
653 :
5208 tgl 654 EUB : /* copy in the actual stats */
5208 tgl 655 UIC 0 : entry->counters = temp.counters;
656 : }
657 :
864 fujii 658 EUB : /* Read global statistics for pg_stat_statements */
864 fujii 659 UBC 0 : if (fread(&pgss->stats, sizeof(pgssGlobalStats), 1, file) != 1)
864 fujii 660 UIC 0 : goto read_error;
864 fujii 661 EUB :
5208 tgl 662 UBC 0 : pfree(buffer);
663 0 : FreeFile(file);
3359 tgl 664 UIC 0 : FreeFile(qfile);
665 :
666 : /*
667 : * Remove the persisted stats file so it's not included in
668 : * backups/replication standbys, etc. A new file will be written on next
669 : * shutdown.
670 : *
671 : * Note: it's okay if the PGSS_TEXT_FILE is included in a basebackup,
672 : * because we remove that file on startup; it acts inversely to
673 : * PGSS_DUMP_FILE, in that it is only supposed to be around when the
674 : * server is running, whereas PGSS_DUMP_FILE is only supposed to be around
675 : * when the server is not running. Leaving the file creates no danger of
676 : * a newly restored database having a spurious record of execution costs,
677 : * which is what we're really concerned about here.
3969 magnus 678 EUB : */
3969 magnus 679 UIC 0 : unlink(PGSS_DUMP_FILE);
3969 magnus 680 EUB :
5208 tgl 681 UIC 0 : return;
5208 tgl 682 EUB :
/*
 * Error exits. Each label logs its own message; read_error and data_error
 * jump to "fail", while write_error falls through into it.
 */
3359 tgl 683 UBC 0 : read_error:
5208 tgl 684 UIC 0 : ereport(LOG,
685 : (errcode_for_file_access(),
686 : errmsg("could not read file \"%s\": %m",
5208 tgl 687 EUB : PGSS_DUMP_FILE)));
3359 tgl 688 UBC 0 : goto fail;
689 0 : data_error:
3359 tgl 690 UIC 0 : ereport(LOG,
691 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
692 : errmsg("ignoring invalid data in file \"%s\"",
3359 tgl 693 EUB : PGSS_DUMP_FILE)));
3359 tgl 694 UBC 0 : goto fail;
695 0 : write_error:
3359 tgl 696 UIC 0 : ereport(LOG,
697 : (errcode_for_file_access(),
698 : errmsg("could not write file \"%s\": %m",
3359 tgl 699 EUB : PGSS_TEXT_FILE)));
3359 tgl 700 UBC 0 : fail:
5208 701 0 : if (buffer)
702 0 : pfree(buffer);
703 0 : if (file)
704 0 : FreeFile(file);
3359 705 0 : if (qfile)
3359 tgl 706 UIC 0 : FreeFile(qfile);
5208 tgl 707 EUB : /* If possible, throw away the bogus file; ignore any error */
5208 tgl 708 UIC 0 : unlink(PGSS_DUMP_FILE);
709 :
710 : /*
711 : * Don't unlink PGSS_TEXT_FILE here; it should always be around while the
712 : * server is running with pg_stat_statements enabled
713 : */
714 : }
715 :
716 : /*
717 : * shmem_shutdown hook: Dump statistics into file.
718 : *
719 : * Note: we don't bother with acquiring lock, because there should be no
720 : * other processes running when this is called.
 *
 * code is the exit code handed to the on_shmem_exit callback; nothing is
 * dumped when it is nonzero. arg is unused.
 *
 * The data is written to PGSS_DUMP_FILE ".tmp" and renamed into place with
 * durable_rename() only after the temp file was closed successfully. On any
 * failure the problem is logged at LOG level and both the temp file and
 * PGSS_TEXT_FILE are unlinked.
721 : */
5208 tgl 722 ECB : static void
5208 tgl 723 GIC 3 : pgss_shmem_shutdown(int code, Datum arg)
724 : {
5050 bruce 725 ECB : FILE *file;
3359 tgl 726 CBC 3 : char *qbuffer = NULL;
3359 tgl 727 GIC 3 : Size qbuffer_size = 0;
728 : HASH_SEQ_STATUS hash_seq;
729 : int32 num_entries;
730 : pgssEntry *entry;
731 :
5208 tgl 732 ECB : /* Don't try to dump during a crash. */
5208 tgl 733 CBC 3 : if (code)
5208 tgl 734 GIC 3 : return;
735 :
5208 tgl 736 ECB : /* Safety check ... shouldn't get here unless shmem is set up. */
5208 tgl 737 GBC 3 : if (!pgss || !pgss_hash)
5208 tgl 738 UIC 0 : return;
739 :
5208 tgl 740 ECB : /* Don't dump if told not to. */
5208 tgl 741 GBC 3 : if (!pgss_save)
5208 tgl 742 UIC 0 : return;
5208 tgl 743 ECB :
3969 magnus 744 CBC 3 : file = AllocateFile(PGSS_DUMP_FILE ".tmp", PG_BINARY_W);
5208 tgl 745 GBC 3 : if (file == NULL)
5208 tgl 746 UIC 0 : goto error;
5208 tgl 747 ECB :
/* File header: magic number, major version, then the entry count */
5208 tgl 748 GBC 3 : if (fwrite(&PGSS_FILE_HEADER, sizeof(uint32), 1, file) != 1)
5208 tgl 749 LBC 0 : goto error;
3409 fujii 750 GBC 3 : if (fwrite(&PGSS_PG_MAJOR_VERSION, sizeof(uint32), 1, file) != 1)
3409 fujii 751 LBC 0 : goto error;
5208 tgl 752 CBC 3 : num_entries = hash_get_num_entries(pgss_hash);
5208 tgl 753 GBC 3 : if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
5208 tgl 754 UIC 0 : goto error;
5208 tgl 755 ECB :
/* Pull the whole query-text file into memory; released with free() below */
3359 tgl 756 CBC 3 : qbuffer = qtext_load_file(&qbuffer_size);
3359 tgl 757 GBC 3 : if (qbuffer == NULL)
3359 tgl 758 UIC 0 : goto error;
759 :
760 : /*
761 : * When serializing to disk, we store query texts immediately after their
762 : * entry data. Any orphaned query texts are thereby excluded.
 *
 * NOTE(review): num_entries written above counts every hash entry, while
 * this loop skips entries whose text cannot be fetched, so the stored
 * count can exceed the number of entries actually dumped. Confirm that
 * the loader's behavior in that case is acceptable.
3359 tgl 763 ECB : */
5208 tgl 764 CBC 3 : hash_seq_init(&hash_seq, pgss_hash);
5208 tgl 765 GIC 24493 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
5208 tgl 766 ECB : {
4029 tgl 767 CBC 24490 : int len = entry->query_len;
3359 tgl 768 GIC 24490 : char *qstr = qtext_fetch(entry->query_offset, len,
769 : qbuffer, qbuffer_size);
5208 tgl 770 ECB :
3359 tgl 771 GBC 24490 : if (qstr == NULL)
3359 tgl 772 UIC 0 : continue; /* Ignore any entries with bogus texts */
3359 tgl 773 ECB :
3359 tgl 774 CBC 24490 : if (fwrite(entry, sizeof(pgssEntry), 1, file) != 1 ||
3359 tgl 775 GIC 24490 : fwrite(qstr, 1, len + 1, file) != len + 1)
776 : {
3359 tgl 777 EUB : /* note: we assume hash_seq_term won't change errno */
3359 tgl 778 UBC 0 : hash_seq_term(&hash_seq);
5208 tgl 779 UIC 0 : goto error;
780 : }
781 : }
782 :
864 fujii 783 ECB : /* Dump global statistics for pg_stat_statements */
864 fujii 784 GBC 3 : if (fwrite(&pgss->stats, sizeof(pgssGlobalStats), 1, file) != 1)
864 fujii 785 UIC 0 : goto error;
864 fujii 786 ECB :
3359 tgl 787 CBC 3 : free(qbuffer);
3359 tgl 788 GIC 3 : qbuffer = NULL;
3359 tgl 789 ECB :
/* A failed close counts as a write failure; file is cleared first so the
 * error path does not try to close it a second time */
5208 tgl 790 GIC 3 : if (FreeFile(file))
5208 tgl 791 EUB : {
5208 tgl 792 UBC 0 : file = NULL;
5208 tgl 793 UIC 0 : goto error;
794 : }
795 :
796 : /*
797 : * Rename file into place, so we atomically replace any old one.
3969 magnus 798 ECB : */
2587 andres 799 GIC 3 : (void) durable_rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE, LOG);
800 :
3359 tgl 801 ECB : /* Unlink query-texts file; it's not needed while shutdown */
3359 tgl 802 GIC 3 : unlink(PGSS_TEXT_FILE);
3359 tgl 803 ECB :
5208 tgl 804 GIC 3 : return;
5208 tgl 805 EUB :
/* Common failure exit: log, release what we hold, remove partial output */
5208 tgl 806 UBC 0 : error:
5208 tgl 807 UIC 0 : ereport(LOG,
808 : (errcode_for_file_access(),
809 : errmsg("could not write file \"%s\": %m",
3955 bruce 810 EUB : PGSS_DUMP_FILE ".tmp")));
297 peter 811 UNC 0 : free(qbuffer);
5208 tgl 812 UBC 0 : if (file)
813 0 : FreeFile(file);
3969 magnus 814 UIC 0 : unlink(PGSS_DUMP_FILE ".tmp");
3359 tgl 815 0 : unlink(PGSS_TEXT_FILE);
816 : }
817 :
818 : /*
819 : * Post-parse-analysis hook: mark query with a queryId
4029 tgl 820 ECB : */
821 : static void
732 bruce 822 CBC 60765 : pgss_post_parse_analyze(ParseState *pstate, Query *query, JumbleState *jstate)
4029 tgl 823 EUB : {
3275 tgl 824 GIC 60765 : if (prev_post_parse_analyze_hook)
732 bruce 825 UIC 0 : prev_post_parse_analyze_hook(pstate, query, jstate);
4029 tgl 826 ECB :
827 : /* Safety check... */
1102 fujii 828 GIC 60765 : if (!pgss || !pgss_hash || !pgss_enabled(exec_nested_level))
4029 tgl 829 11440 : return;
830 :
831 : /*
832 : * Clear queryId for prepared statements related utility, as those will
833 : * inherit from the underlying statement's one (except DEALLOCATE which is
732 bruce 834 ECB : * entirely untracked).
835 : */
4029 tgl 836 CBC 49325 : if (query->utilityStmt)
837 : {
732 bruce 838 23019 : if (pgss_track_utility && !PGSS_HANDLED_UTILITY(query->utilityStmt))
839 : {
840 1481 : query->queryId = UINT64CONST(0);
32 michael 841 GNC 1481 : return;
842 : }
843 : }
844 :
845 : /*
846 : * If query jumbling were able to identify any ignorable constants, we
847 : * immediately create a hash table entry for the query, so that we can
848 : * record the normalized form of the query string. If there were no such
849 : * constants, the normalized string would be the same as the query text
850 : * anyway, so there's no need for an early entry.
851 : */
732 bruce 852 CBC 47844 : if (jstate && jstate->clocations_count > 0)
4029 tgl 853 23708 : pgss_store(pstate->p_sourcetext,
854 : query->queryId,
855 : query->stmt_location,
856 : query->stmt_len,
857 : PGSS_INVALID,
858 : 0,
859 : 0,
860 : NULL,
861 : NULL,
862 : NULL,
863 : jstate);
864 : }
865 :
866 : /*
867 : * Planner hook: forward to regular planner, but measure planning time
868 : * if needed.
869 : */
870 : static PlannedStmt *
1102 fujii 871 37043 : pgss_planner(Query *parse,
872 : const char *query_string,
873 : int cursorOptions,
874 : ParamListInfo boundParams)
875 : {
876 : PlannedStmt *result;
877 :
878 : /*
879 : * We can't process the query if no query_string is provided, as
880 : * pgss_store needs it. We also ignore query without queryid, as it would
881 : * be treated as a utility statement, which may not be the case.
882 : *
883 : * Note that planner_hook can be called from the planner itself, so we
884 : * have a specific nesting level for the planner. However, utility
885 : * commands containing optimizable statements can also call the planner,
886 : * same for regular DML (for instance for underlying foreign key queries).
887 : * So testing the planner nesting level only is not enough to detect real
888 : * top level planner call.
889 : */
890 37043 : if (pgss_enabled(plan_nested_level + exec_nested_level)
891 26463 : && pgss_track_planning && query_string
892 59 : && parse->queryId != UINT64CONST(0))
893 58 : {
894 : instr_time start;
895 : instr_time duration;
896 : BufferUsage bufusage_start,
897 : bufusage;
898 : WalUsage walusage_start,
899 : walusage;
900 :
901 : /* We need to track buffer usage as the planner can access them. */
902 58 : bufusage_start = pgBufferUsage;
903 :
904 : /*
905 : * Similarly the planner could write some WAL records in some cases
906 : * (e.g. setting a hint bit with those being WAL-logged)
907 : */
1099 akapila 908 58 : walusage_start = pgWalUsage;
1102 fujii 909 58 : INSTR_TIME_SET_CURRENT(start);
910 :
911 58 : plan_nested_level++;
912 58 : PG_TRY();
913 : {
914 58 : if (prev_planner_hook)
1102 fujii 915 UBC 0 : result = prev_planner_hook(parse, query_string, cursorOptions,
916 : boundParams);
917 : else
1102 fujii 918 CBC 58 : result = standard_planner(parse, query_string, cursorOptions,
919 : boundParams);
920 : }
1102 fujii 921 UBC 0 : PG_FINALLY();
922 : {
1102 fujii 923 CBC 58 : plan_nested_level--;
924 : }
925 58 : PG_END_TRY();
926 :
927 58 : INSTR_TIME_SET_CURRENT(duration);
928 58 : INSTR_TIME_SUBTRACT(duration, start);
929 :
930 : /* calc differences of buffer counters. */
931 58 : memset(&bufusage, 0, sizeof(BufferUsage));
932 58 : BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
933 :
934 : /* calc differences of WAL counters. */
1099 akapila 935 58 : memset(&walusage, 0, sizeof(WalUsage));
936 58 : WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
937 :
1102 fujii 938 58 : pgss_store(query_string,
939 : parse->queryId,
940 : parse->stmt_location,
941 : parse->stmt_len,
942 : PGSS_PLAN,
943 58 : INSTR_TIME_GET_MILLISEC(duration),
944 : 0,
945 : &bufusage,
946 : &walusage,
947 : NULL,
948 : NULL);
949 : }
950 : else
951 : {
952 36985 : if (prev_planner_hook)
1102 fujii 953 UBC 0 : result = prev_planner_hook(parse, query_string, cursorOptions,
954 : boundParams);
955 : else
1102 fujii 956 CBC 36985 : result = standard_planner(parse, query_string, cursorOptions,
957 : boundParams);
958 : }
959 :
960 36532 : return result;
961 : }
962 :
963 : /*
964 : * ExecutorStart hook: start up tracking if needed
965 : */
966 : static void
5208 tgl 967 43793 : pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
968 : {
969 43793 : if (prev_ExecutorStart)
5208 tgl 970 UBC 0 : prev_ExecutorStart(queryDesc, eflags);
971 : else
5208 tgl 972 CBC 43793 : standard_ExecutorStart(queryDesc, eflags);
973 :
974 : /*
975 : * If query has queryId zero, don't track it. This prevents double
976 : * counting of optimizable statements that are directly contained in
977 : * utility statements.
978 : */
1102 fujii 979 43530 : if (pgss_enabled(exec_nested_level) && queryDesc->plannedstmt->queryId != UINT64CONST(0))
980 : {
981 : /*
982 : * Set up to track total elapsed time in ExecutorRun. Make sure the
983 : * space is allocated in the per-query context so it will go away at
984 : * ExecutorEnd.
985 : */
5208 tgl 986 26765 : if (queryDesc->totaltime == NULL)
987 : {
988 : MemoryContext oldcxt;
989 :
990 26765 : oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
697 efujita 991 26765 : queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
5208 tgl 992 26765 : MemoryContextSwitchTo(oldcxt);
993 : }
994 : }
995 43530 : }
996 :
997 : /*
998 : * ExecutorRun hook: all we need do is track nesting depth
999 : */
1000 : static void
2208 rhaas 1001 42792 : pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
1002 : bool execute_once)
1003 : {
1102 fujii 1004 42792 : exec_nested_level++;
5208 tgl 1005 42792 : PG_TRY();
1006 : {
1007 42792 : if (prev_ExecutorRun)
2208 rhaas 1008 UBC 0 : prev_ExecutorRun(queryDesc, direction, count, execute_once);
1009 : else
2208 rhaas 1010 CBC 42792 : standard_ExecutorRun(queryDesc, direction, count, execute_once);
1011 : }
1255 peter 1012 3113 : PG_FINALLY();
1013 : {
1102 fujii 1014 42792 : exec_nested_level--;
1015 : }
5208 tgl 1016 42792 : PG_END_TRY();
4424 1017 39679 : }
1018 :
1019 : /*
1020 : * ExecutorFinish hook: all we need do is track nesting depth
1021 : */
1022 : static void
1023 38050 : pgss_ExecutorFinish(QueryDesc *queryDesc)
1024 : {
1102 fujii 1025 38050 : exec_nested_level++;
4424 tgl 1026 38050 : PG_TRY();
1027 : {
1028 38050 : if (prev_ExecutorFinish)
4424 tgl 1029 UBC 0 : prev_ExecutorFinish(queryDesc);
1030 : else
4424 tgl 1031 CBC 38050 : standard_ExecutorFinish(queryDesc);
1032 : }
1255 peter 1033 131 : PG_FINALLY();
1034 : {
1102 fujii 1035 38050 : exec_nested_level--;
1036 : }
4424 tgl 1037 38050 : PG_END_TRY();
5208 1038 37919 : }
1039 :
1040 : /*
1041 : * ExecutorEnd hook: store results if needed
1042 : */
1043 : static void
1044 40246 : pgss_ExecutorEnd(QueryDesc *queryDesc)
1045 : {
2006 rhaas 1046 40246 : uint64 queryId = queryDesc->plannedstmt->queryId;
1047 :
1102 fujii 1048 40246 : if (queryId != UINT64CONST(0) && queryDesc->totaltime &&
1049 25681 : pgss_enabled(exec_nested_level))
1050 : {
1051 : /*
1052 : * Make sure stats accumulation is done. (Note: it's okay if several
1053 : * levels of hook all do this.)
1054 : */
5208 tgl 1055 25681 : InstrEndLoop(queryDesc->totaltime);
1056 :
1057 25583 : pgss_store(queryDesc->sourceText,
1058 : queryId,
2276 1059 25681 : queryDesc->plannedstmt->stmt_location,
1060 25681 : queryDesc->plannedstmt->stmt_len,
1061 : PGSS_EXEC,
2118 1062 25681 : queryDesc->totaltime->total * 1000.0, /* convert to msec */
3 michael 1063 GNC 25681 : queryDesc->estate->es_total_processed,
4029 tgl 1064 CBC 25681 : &queryDesc->totaltime->bufusage,
1099 akapila 1065 25681 : &queryDesc->totaltime->walusage,
366 magnus 1066 25681 : queryDesc->estate->es_jit ? &queryDesc->estate->es_jit->instr : NULL,
1067 : NULL);
1068 : }
1069 :
5208 tgl 1070 40246 : if (prev_ExecutorEnd)
5208 tgl 1071 UBC 0 : prev_ExecutorEnd(queryDesc);
1072 : else
5208 tgl 1073 CBC 40246 : standard_ExecutorEnd(queryDesc);
1074 40246 : }
1075 :
1076 : /*
1077 : * ProcessUtility hook
1078 : */
1079 : static void
2276 1080 27689 : pgss_ProcessUtility(PlannedStmt *pstmt, const char *queryString,
1081 : bool readOnlyTree,
1082 : ProcessUtilityContext context,
1083 : ParamListInfo params, QueryEnvironment *queryEnv,
1084 : DestReceiver *dest, QueryCompletion *qc)
1085 : {
1086 27689 : Node *parsetree = pstmt->utilityStmt;
732 bruce 1087 27689 : uint64 saved_queryId = pstmt->queryId;
159 tgl 1088 27689 : int saved_stmt_location = pstmt->stmt_location;
1089 27689 : int saved_stmt_len = pstmt->stmt_len;
1090 :
1091 : /*
1092 : * Force utility statements to get queryId zero. We do this even in cases
1093 : * where the statement contains an optimizable statement for which a
1094 : * queryId could be derived (such as EXPLAIN or DECLARE CURSOR). For such
1095 : * cases, runtime control will first go through ProcessUtility and then
1096 : * the executor, and we don't want the executor hooks to do anything,
1097 : * since we are already measuring the statement's costs at the utility
1098 : * level.
1099 : *
1100 : * Note that this is only done if pg_stat_statements is enabled and
1101 : * configured to track utility statements, in the unlikely possibility
1102 : * that user configured another extension to handle utility statements
1103 : * only.
1104 : */
732 bruce 1105 27689 : if (pgss_enabled(exec_nested_level) && pgss_track_utility)
1106 22979 : pstmt->queryId = UINT64CONST(0);
1107 :
1108 : /*
1109 : * If it's an EXECUTE statement, we don't track it and don't increment the
1110 : * nesting level. This allows the cycles to be charged to the underlying
1111 : * PREPARE instead (by the Executor hooks), which is much more useful.
1112 : *
1113 : * We also don't track execution of PREPARE. If we did, we would get one
1114 : * hash table entry for the PREPARE (with hash calculated from the query
1115 : * string), and then a different one with the same query string (but hash
1116 : * calculated from the query tree) would be used to accumulate costs of
1117 : * ensuing EXECUTEs. This would be confusing, and inconsistent with other
1118 : * cases where planning time is not included at all.
1119 : *
1120 : * Likewise, we don't track execution of DEALLOCATE.
1121 : */
1102 fujii 1122 27689 : if (pgss_track_utility && pgss_enabled(exec_nested_level) &&
732 bruce 1123 22979 : PGSS_HANDLED_UTILITY(parsetree))
4863 tgl 1124 19496 : {
1125 : instr_time start;
1126 : instr_time duration;
1127 : uint64 rows;
1128 : BufferUsage bufusage_start,
1129 : bufusage;
1130 : WalUsage walusage_start,
1131 : walusage;
1132 :
4030 rhaas 1133 21499 : bufusage_start = pgBufferUsage;
1099 akapila 1134 21499 : walusage_start = pgWalUsage;
4863 tgl 1135 21499 : INSTR_TIME_SET_CURRENT(start);
1136 :
1102 fujii 1137 21499 : exec_nested_level++;
4863 tgl 1138 21499 : PG_TRY();
1139 : {
1140 21499 : if (prev_ProcessUtility)
660 tgl 1141 UBC 0 : prev_ProcessUtility(pstmt, queryString, readOnlyTree,
1142 : context, params, queryEnv,
1143 : dest, qc);
1144 : else
660 tgl 1145 CBC 21499 : standard_ProcessUtility(pstmt, queryString, readOnlyTree,
1146 : context, params, queryEnv,
1147 : dest, qc);
1148 : }
1255 peter 1149 2003 : PG_FINALLY();
1150 : {
1102 fujii 1151 21499 : exec_nested_level--;
1152 : }
4863 tgl 1153 21499 : PG_END_TRY();
1154 :
1155 : /*
1156 : * CAUTION: do not access the *pstmt data structure again below here.
1157 : * If it was a ROLLBACK or similar, that data structure may have been
1158 : * freed. We must copy everything we still need into local variables,
1159 : * which we did above.
1160 : *
1161 : * For the same reason, we can't risk restoring pstmt->queryId to its
1162 : * former value, which'd otherwise be a good idea.
1163 : */
1164 :
1165 19496 : INSTR_TIME_SET_CURRENT(duration);
1166 19496 : INSTR_TIME_SUBTRACT(duration, start);
1167 :
1168 : /*
1169 : * Track the total number of rows retrieved or affected by the utility
1170 : * statements of COPY, FETCH, CREATE TABLE AS, CREATE MATERIALIZED
1171 : * VIEW, REFRESH MATERIALIZED VIEW and SELECT INTO.
1172 : */
984 fujii 1173 19496 : rows = (qc && (qc->commandTag == CMDTAG_COPY ||
1174 18217 : qc->commandTag == CMDTAG_FETCH ||
878 1175 17960 : qc->commandTag == CMDTAG_SELECT ||
1176 17788 : qc->commandTag == CMDTAG_REFRESH_MATERIALIZED_VIEW)) ?
984 1177 38992 : qc->nprocessed : 0;
1178 :
1179 : /* calc differences of buffer counters. */
1105 1180 19496 : memset(&bufusage, 0, sizeof(BufferUsage));
1181 19496 : BufferUsageAccumDiff(&bufusage, &pgBufferUsage, &bufusage_start);
1182 :
1183 : /* calc differences of WAL counters. */
1099 akapila 1184 19496 : memset(&walusage, 0, sizeof(WalUsage));
1185 19496 : WalUsageAccumDiff(&walusage, &pgWalUsage, &walusage_start);
1186 :
4029 tgl 1187 19496 : pgss_store(queryString,
1188 : saved_queryId,
1189 : saved_stmt_location,
1190 : saved_stmt_len,
1191 : PGSS_EXEC,
3998 1192 19496 : INSTR_TIME_GET_MILLISEC(duration),
1193 : rows,
1194 : &bufusage,
1195 : &walusage,
1196 : NULL,
1197 : NULL);
1198 : }
1199 : else
1200 : {
4863 1201 6190 : if (prev_ProcessUtility)
660 tgl 1202 UBC 0 : prev_ProcessUtility(pstmt, queryString, readOnlyTree,
1203 : context, params, queryEnv,
1204 : dest, qc);
1205 : else
660 tgl 1206 CBC 6190 : standard_ProcessUtility(pstmt, queryString, readOnlyTree,
1207 : context, params, queryEnv,
1208 : dest, qc);
1209 : }
4863 1210 25591 : }
1211 :
1212 : /*
1213 : * Store some statistics for a statement.
1214 : *
1215 : * If jstate is not NULL then we're trying to create an entry for which
1216 : * we have no statistics as yet; we just want to record the normalized
1217 : * query string. total_time, rows, bufusage and walusage are ignored in this
1218 : * case.
1219 : *
1220 : * If kind is PGSS_PLAN or PGSS_EXEC, its value is used as the array position
1221 : * for the arrays in the Counters field.
1222 : */
1223 : static void
2006 rhaas 1224 68943 : pgss_store(const char *query, uint64 queryId,
1225 : int query_location, int query_len,
1226 : pgssStoreKind kind,
1227 : double total_time, uint64 rows,
1228 : const BufferUsage *bufusage,
1229 : const WalUsage *walusage,
1230 : const struct JitInstrumentation *jitusage,
1231 : JumbleState *jstate)
1232 : {
1233 : pgssHashKey key;
1234 : pgssEntry *entry;
4029 tgl 1235 68943 : char *norm_query = NULL;
3359 1236 68943 : int encoding = GetDatabaseEncoding();
1237 :
5208 1238 68943 : Assert(query != NULL);
1239 :
1240 : /* Safety check... */
1241 68943 : if (!pgss || !pgss_hash)
5208 tgl 1242 UBC 0 : return;
1243 :
1244 : /*
1245 : * Nothing to do if compute_query_id isn't enabled and no other module
1246 : * computed a query identifier.
1247 : */
732 bruce 1248 CBC 68943 : if (queryId == UINT64CONST(0))
732 bruce 1249 UBC 0 : return;
1250 :
1251 : /*
1252 : * Confine our attention to the relevant part of the string, if the query
1253 : * is a portion of a multi-statement source string, and update query
1254 : * location and length if needed.
1255 : */
732 bruce 1256 CBC 68943 : query = CleanQuerytext(query, &query_location, &query_len);
1257 :
1258 : /* Set up key for hashtable search */
1259 :
1260 : /* memset() is required when pgssHashKey is without padding only */
731 magnus 1261 68943 : memset(&key, 0, sizeof(pgssHashKey));
1262 :
5208 tgl 1263 68943 : key.userid = GetUserId();
1264 68943 : key.dbid = MyDatabaseId;
4029 1265 68943 : key.queryid = queryId;
731 magnus 1266 68943 : key.toplevel = (exec_nested_level == 0);
1267 :
1268 : /* Lookup the hash table entry with shared lock. */
5208 tgl 1269 68943 : LWLockAcquire(pgss->lock, LW_SHARED);
1270 :
1271 68943 : entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
1272 :
1273 : /* Create new entry, if not present */
1274 68943 : if (!entry)
1275 : {
1276 : Size query_offset;
1277 : int gc_count;
1278 : bool stored;
1279 : bool do_gc;
1280 :
1281 : /*
1282 : * Create a new, normalized query string if caller asked. We don't
1283 : * need to hold the lock while doing this work. (Note: in any case,
1284 : * it's possible that someone else creates a duplicate hashtable entry
1285 : * in the interval where we don't hold the lock below. That case is
1286 : * handled by entry_alloc.)
1287 : */
4029 1288 24795 : if (jstate)
1289 : {
3359 1290 8841 : LWLockRelease(pgss->lock);
4029 1291 8841 : norm_query = generate_normalized_query(jstate, query,
1292 : query_location,
1293 : &query_len);
3359 1294 8841 : LWLockAcquire(pgss->lock, LW_SHARED);
1295 : }
1296 :
1297 : /* Append new query text to file with only shared lock held */
1298 24795 : stored = qtext_store(norm_query ? norm_query : query, query_len,
1299 : &query_offset, &gc_count);
1300 :
1301 : /*
1302 : * Determine whether we need to garbage collect external query texts
1303 : * while the shared lock is still held. This micro-optimization
1304 : * avoids taking the time to decide this while holding exclusive lock.
1305 : */
1306 24795 : do_gc = need_gc_qtexts();
1307 :
1308 : /* Need exclusive lock to make a new hashtable entry - promote */
1309 24795 : LWLockRelease(pgss->lock);
1310 24795 : LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
1311 :
1312 : /*
1313 : * A garbage collection may have occurred while we weren't holding the
1314 : * lock. In the unlikely event that this happens, the query text we
1315 : * stored above will have been garbage collected, so write it again.
1316 : * This should be infrequent enough that doing it while holding
1317 : * exclusive lock isn't a performance problem.
1318 : */
1319 24795 : if (!stored || pgss->gc_count != gc_count)
3359 tgl 1320 UBC 0 : stored = qtext_store(norm_query ? norm_query : query, query_len,
1321 : &query_offset, NULL);
1322 :
1323 : /* If we failed to write to the text file, give up */
3359 tgl 1324 CBC 24795 : if (!stored)
3359 tgl 1325 UBC 0 : goto done;
1326 :
1327 : /* OK to create a new hashtable entry */
3359 tgl 1328 CBC 24795 : entry = entry_alloc(&key, query_offset, query_len, encoding,
1329 : jstate != NULL);
1330 :
1331 : /* If needed, perform garbage collection while exclusive lock held */
1332 24795 : if (do_gc)
3359 tgl 1333 UBC 0 : gc_qtexts();
1334 : }
1335 :
1336 : /* Increment the counts, except when jstate is not NULL */
4017 tgl 1337 CBC 68943 : if (!jstate)
1338 : {
1339 : /*
1340 : * Grab the spinlock while updating the counters (see comment about
1341 : * locking rules at the head of the file)
1342 : */
5208 1343 45235 : volatile pgssEntry *e = (volatile pgssEntry *) entry;
1344 :
1102 fujii 1345 45235 : Assert(kind == PGSS_PLAN || kind == PGSS_EXEC);
1346 :
5208 tgl 1347 45235 : SpinLockAcquire(&e->mutex);
1348 :
1349 : /* "Unstick" entry if it was previously sticky */
1102 fujii 1350 45235 : if (IS_STICKY(e->counters))
4017 tgl 1351 24232 : e->counters.usage = USAGE_INIT;
1352 :
1102 fujii 1353 45235 : e->counters.calls[kind] += 1;
1354 45235 : e->counters.total_time[kind] += total_time;
1355 :
1356 45235 : if (e->counters.calls[kind] == 1)
1357 : {
1358 24274 : e->counters.min_time[kind] = total_time;
1359 24274 : e->counters.max_time[kind] = total_time;
1360 24274 : e->counters.mean_time[kind] = total_time;
1361 : }
1362 : else
1363 : {
1364 : /*
1365 : * Welford's method for accurately computing variance. See
1366 : * <http://www.johndcook.com/blog/standard_deviation/>
1367 : */
1368 20961 : double old_mean = e->counters.mean_time[kind];
1369 :
1370 20961 : e->counters.mean_time[kind] +=
1371 20961 : (total_time - old_mean) / e->counters.calls[kind];
1372 20961 : e->counters.sum_var_time[kind] +=
1373 20961 : (total_time - old_mean) * (total_time - e->counters.mean_time[kind]);
1374 :
1375 : /* calculate min and max time */
1376 20961 : if (e->counters.min_time[kind] > total_time)
1377 5572 : e->counters.min_time[kind] = total_time;
1378 20961 : if (e->counters.max_time[kind] < total_time)
1379 2417 : e->counters.max_time[kind] = total_time;
1380 : }
5208 tgl 1381 45235 : e->counters.rows += rows;
4839 itagaki.takahiro 1382 45235 : e->counters.shared_blks_hit += bufusage->shared_blks_hit;
1383 45235 : e->counters.shared_blks_read += bufusage->shared_blks_read;
4064 rhaas 1384 45235 : e->counters.shared_blks_dirtied += bufusage->shared_blks_dirtied;
4839 itagaki.takahiro 1385 45235 : e->counters.shared_blks_written += bufusage->shared_blks_written;
1386 45235 : e->counters.local_blks_hit += bufusage->local_blks_hit;
1387 45235 : e->counters.local_blks_read += bufusage->local_blks_read;
4064 rhaas 1388 45235 : e->counters.local_blks_dirtied += bufusage->local_blks_dirtied;
4839 itagaki.takahiro 1389 45235 : e->counters.local_blks_written += bufusage->local_blks_written;
1390 45235 : e->counters.temp_blks_read += bufusage->temp_blks_read;
1391 45235 : e->counters.temp_blks_written += bufusage->temp_blks_written;
3997 tgl 1392 45235 : e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time);
1393 45235 : e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time);
366 michael 1394 45235 : e->counters.temp_blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_read_time);
1395 45235 : e->counters.temp_blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->temp_blk_write_time);
3998 tgl 1396 45235 : e->counters.usage += USAGE_EXEC(total_time);
1099 akapila 1397 45235 : e->counters.wal_records += walusage->wal_records;
1069 1398 45235 : e->counters.wal_fpi += walusage->wal_fpi;
1099 1399 45235 : e->counters.wal_bytes += walusage->wal_bytes;
366 magnus 1400 45235 : if (jitusage)
1401 : {
1402 98 : e->counters.jit_functions += jitusage->created_functions;
1403 98 : e->counters.jit_generation_time += INSTR_TIME_GET_MILLISEC(jitusage->generation_counter);
1404 :
1405 98 : if (INSTR_TIME_GET_MILLISEC(jitusage->inlining_counter))
1406 68 : e->counters.jit_inlining_count++;
1407 98 : e->counters.jit_inlining_time += INSTR_TIME_GET_MILLISEC(jitusage->inlining_counter);
1408 :
1409 98 : if (INSTR_TIME_GET_MILLISEC(jitusage->optimization_counter))
1410 96 : e->counters.jit_optimization_count++;
1411 98 : e->counters.jit_optimization_time += INSTR_TIME_GET_MILLISEC(jitusage->optimization_counter);
1412 :
1413 98 : if (INSTR_TIME_GET_MILLISEC(jitusage->emission_counter))
1414 96 : e->counters.jit_emission_count++;
1415 98 : e->counters.jit_emission_time += INSTR_TIME_GET_MILLISEC(jitusage->emission_counter);
1416 : }
1417 :
5208 tgl 1418 45235 : SpinLockRelease(&e->mutex);
1419 : }
1420 :
3359 1421 23708 : done:
5208 1422 68943 : LWLockRelease(pgss->lock);
1423 :
1424 : /* We postpone this clean-up until we're out of the lock */
4029 1425 68943 : if (norm_query)
1426 8841 : pfree(norm_query);
1427 : }
1428 :
1429 : /*
1430 : * Reset statement statistics corresponding to userid, dbid, and queryid.
1431 : */
1432 : Datum
1549 akapila 1433 40 : pg_stat_statements_reset_1_7(PG_FUNCTION_ARGS)
1434 : {
1435 : Oid userid;
1436 : Oid dbid;
1437 : uint64 queryid;
1438 :
1439 40 : userid = PG_GETARG_OID(0);
1440 40 : dbid = PG_GETARG_OID(1);
1441 40 : queryid = (uint64) PG_GETARG_INT64(2);
1442 :
1443 40 : entry_reset(userid, dbid, queryid);
1444 :
1445 40 : PG_RETURN_VOID();
1446 : }
1447 :
1448 : /*
1449 : * Reset statement statistics.
1450 : */
1451 : Datum
5208 tgl 1452 1 : pg_stat_statements_reset(PG_FUNCTION_ARGS)
1453 : {
1549 akapila 1454 1 : entry_reset(0, 0, 0);
1455 :
5208 tgl 1456 1 : PG_RETURN_VOID();
1457 : }
1458 :
/*
 * Number of output arguments (columns) returned by pg_stat_statements() for
 * each supported API version.
 */
#define PG_STAT_STATEMENTS_COLS_V1_0	14
#define PG_STAT_STATEMENTS_COLS_V1_1	18
#define PG_STAT_STATEMENTS_COLS_V1_2	19
#define PG_STAT_STATEMENTS_COLS_V1_3	23
#define PG_STAT_STATEMENTS_COLS_V1_8	32
#define PG_STAT_STATEMENTS_COLS_V1_9	33
#define PG_STAT_STATEMENTS_COLS_V1_10	43

/* Largest of the per-version counts above. */
#define PG_STAT_STATEMENTS_COLS			PG_STAT_STATEMENTS_COLS_V1_10
1468 :
1469 : /*
1470 : * Retrieve statement statistics.
1471 : *
1472 : * The SQL API of this function has changed multiple times, and will likely
1473 : * do so again in future. To support the case where a newer version of this
1474 : * loadable module is being used with an old SQL declaration of the function,
1475 : * we continue to support the older API versions. For 1.2 and later, the
1476 : * expected API version is identified by embedding it in the C name of the
1477 : * function. Unfortunately we weren't bright enough to do that for 1.1.
1478 : */
1479 : Datum
366 michael 1480 41 : pg_stat_statements_1_10(PG_FUNCTION_ARGS)
1481 : {
1482 41 : bool showtext = PG_GETARG_BOOL(0);
1483 :
1484 41 : pg_stat_statements_internal(fcinfo, PGSS_V1_10, showtext);
1485 :
1486 41 : return (Datum) 0;
1487 : }
1488 :
1489 : Datum
731 magnus 1490 1 : pg_stat_statements_1_9(PG_FUNCTION_ARGS)
1491 : {
1492 1 : bool showtext = PG_GETARG_BOOL(0);
1493 :
1494 1 : pg_stat_statements_internal(fcinfo, PGSS_V1_9, showtext);
1495 :
1496 1 : return (Datum) 0;
1497 : }
1498 :
1499 : Datum
1102 fujii 1500 UBC 0 : pg_stat_statements_1_8(PG_FUNCTION_ARGS)
1501 : {
1502 0 : bool showtext = PG_GETARG_BOOL(0);
1503 :
1504 0 : pg_stat_statements_internal(fcinfo, PGSS_V1_8, showtext);
1505 :
1506 0 : return (Datum) 0;
1507 : }
1508 :
1509 : Datum
2935 andrew 1510 CBC 1 : pg_stat_statements_1_3(PG_FUNCTION_ARGS)
1511 : {
1512 1 : bool showtext = PG_GETARG_BOOL(0);
1513 :
1514 1 : pg_stat_statements_internal(fcinfo, PGSS_V1_3, showtext);
1515 :
1516 1 : return (Datum) 0;
1517 : }
1518 :
1519 : Datum
3359 tgl 1520 UBC 0 : pg_stat_statements_1_2(PG_FUNCTION_ARGS)
1521 : {
1522 0 : bool showtext = PG_GETARG_BOOL(0);
1523 :
1524 0 : pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext);
1525 :
1526 0 : return (Datum) 0;
1527 : }
1528 :
1529 : /*
1530 : * Legacy entry point for pg_stat_statements() API versions 1.0 and 1.1.
1531 : * This can be removed someday, perhaps.
1532 : */
1533 : Datum
5208 1534 0 : pg_stat_statements(PG_FUNCTION_ARGS)
1535 : {
1536 : /* If it's really API 1.1, we'll figure that out below */
3359 1537 0 : pg_stat_statements_internal(fcinfo, PGSS_V1_0, true);
1538 :
1539 0 : return (Datum) 0;
1540 : }
1541 :
1542 : /* Common code for all versions of pg_stat_statements() */
1543 : static void
3359 tgl 1544 CBC 43 : pg_stat_statements_internal(FunctionCallInfo fcinfo,
1545 : pgssVersion api_version,
1546 : bool showtext)
1547 : {
5050 bruce 1548 43 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1549 43 : Oid userid = GetUserId();
2201 simon 1550 43 : bool is_allowed_role = false;
3359 tgl 1551 43 : char *qbuffer = NULL;
1552 43 : Size qbuffer_size = 0;
1553 43 : Size extent = 0;
1554 43 : int gc_count = 0;
1555 : HASH_SEQ_STATUS hash_seq;
1556 : pgssEntry *entry;
1557 :
1558 : /*
1559 : * Superusers or roles with the privileges of pg_read_all_stats members
1560 : * are allowed
1561 : */
377 mail 1562 43 : is_allowed_role = has_privs_of_role(userid, ROLE_PG_READ_ALL_STATS);
1563 :
1564 : /* hash table must exist already */
5208 tgl 1565 43 : if (!pgss || !pgss_hash)
5208 tgl 1566 UBC 0 : ereport(ERROR,
1567 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1568 : errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
1569 :
173 michael 1570 CBC 43 : InitMaterializedSRF(fcinfo, 0);
1571 :
1572 : /*
1573 : * Check we have the expected number of output arguments. Aside from
1574 : * being a good safety check, we need a kluge here to detect API version
1575 : * 1.1, which was wedged into the code in an ill-considered way.
1576 : */
397 1577 43 : switch (rsinfo->setDesc->natts)
1578 : {
3409 fujii 1579 UBC 0 : case PG_STAT_STATEMENTS_COLS_V1_0:
3359 tgl 1580 0 : if (api_version != PGSS_V1_0)
1581 0 : elog(ERROR, "incorrect number of output arguments");
3409 fujii 1582 0 : break;
1583 0 : case PG_STAT_STATEMENTS_COLS_V1_1:
1584 : /* pg_stat_statements() should have told us 1.0 */
3359 tgl 1585 0 : if (api_version != PGSS_V1_0)
1586 0 : elog(ERROR, "incorrect number of output arguments");
1587 0 : api_version = PGSS_V1_1;
3409 fujii 1588 0 : break;
3359 tgl 1589 0 : case PG_STAT_STATEMENTS_COLS_V1_2:
1590 0 : if (api_version != PGSS_V1_2)
1591 0 : elog(ERROR, "incorrect number of output arguments");
3409 fujii 1592 0 : break;
2935 andrew 1593 CBC 1 : case PG_STAT_STATEMENTS_COLS_V1_3:
1594 1 : if (api_version != PGSS_V1_3)
2935 andrew 1595 UBC 0 : elog(ERROR, "incorrect number of output arguments");
2935 andrew 1596 CBC 1 : break;
1102 fujii 1597 UBC 0 : case PG_STAT_STATEMENTS_COLS_V1_8:
1598 0 : if (api_version != PGSS_V1_8)
1599 0 : elog(ERROR, "incorrect number of output arguments");
1600 0 : break;
731 magnus 1601 CBC 1 : case PG_STAT_STATEMENTS_COLS_V1_9:
1602 1 : if (api_version != PGSS_V1_9)
731 magnus 1603 UBC 0 : elog(ERROR, "incorrect number of output arguments");
731 magnus 1604 CBC 1 : break;
366 michael 1605 41 : case PG_STAT_STATEMENTS_COLS_V1_10:
1606 41 : if (api_version != PGSS_V1_10)
366 michael 1607 UBC 0 : elog(ERROR, "incorrect number of output arguments");
366 michael 1608 CBC 41 : break;
3409 fujii 1609 UBC 0 : default:
3359 tgl 1610 0 : elog(ERROR, "incorrect number of output arguments");
1611 : }
1612 :
1613 : /*
1614 : * We'd like to load the query text file (if needed) while not holding any
1615 : * lock on pgss->lock. In the worst case we'll have to do this again
1616 : * after we have the lock, but it's unlikely enough to make this a win
1617 : * despite occasional duplicated work. We need to reload if anybody
1618 : * writes to the file (either a retail qtext_store(), or a garbage
1619 : * collection) between this point and where we've gotten shared lock. If
1620 : * a qtext_store is actually in progress when we look, we might as well
1621 : * skip the speculative load entirely.
1622 : */
3359 tgl 1623 CBC 43 : if (showtext)
1624 : {
1625 : int n_writers;
1626 :
1627 : /* Take the mutex so we can examine variables */
1628 : {
1629 43 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
1630 :
1631 43 : SpinLockAcquire(&s->mutex);
1632 43 : extent = s->extent;
1633 43 : n_writers = s->n_writers;
1634 43 : gc_count = s->gc_count;
1635 43 : SpinLockRelease(&s->mutex);
1636 : }
1637 :
1638 : /* No point in loading file now if there are active writers */
1639 43 : if (n_writers == 0)
1640 43 : qbuffer = qtext_load_file(&qbuffer_size);
1641 : }
1642 :
1643 : /*
1644 : * Get shared lock, load or reload the query text file if we must, and
1645 : * iterate over the hashtable entries.
1646 : *
1647 : * With a large hash table, we might be holding the lock rather longer
1648 : * than one could wish. However, this only blocks creation of new hash
1649 : * table entries, and the larger the hash table the less likely that is to
1650 : * be needed. So we can hope this is okay. Perhaps someday we'll decide
1651 : * we need to partition the hash table to limit the time spent holding any
1652 : * one lock.
1653 : */
5208 1654 43 : LWLockAcquire(pgss->lock, LW_SHARED);
1655 :
3359 1656 43 : if (showtext)
1657 : {
1658 : /*
1659 : * Here it is safe to examine extent and gc_count without taking the
1660 : * mutex. Note that although other processes might change
1661 : * pgss->extent just after we look at it, the strings they then write
1662 : * into the file cannot yet be referenced in the hashtable, so we
1663 : * don't care whether we see them or not.
1664 : *
1665 : * If qtext_load_file fails, we just press on; we'll return NULL for
1666 : * every query text.
1667 : */
1668 43 : if (qbuffer == NULL ||
1669 43 : pgss->extent != extent ||
1670 43 : pgss->gc_count != gc_count)
1671 : {
297 peter 1672 UNC 0 : free(qbuffer);
3359 tgl 1673 UIC 0 : qbuffer = qtext_load_file(&qbuffer_size);
1674 : }
1675 : }
3359 tgl 1676 ECB :
5208 tgl 1677 CBC 43 : hash_seq_init(&hash_seq, pgss_hash);
5208 tgl 1678 GIC 24097 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
1679 : {
1680 : Datum values[PG_STAT_STATEMENTS_COLS];
5208 tgl 1681 ECB : bool nulls[PG_STAT_STATEMENTS_COLS];
5208 tgl 1682 GIC 24054 : int i = 0;
1683 : Counters tmp;
2934 tgl 1684 ECB : double stddev;
3409 magnus 1685 GIC 24054 : int64 queryid = entry->key.queryid;
5208 tgl 1686 ECB :
5208 tgl 1687 CBC 24054 : memset(values, 0, sizeof(values));
5208 tgl 1688 GIC 24054 : memset(nulls, 0, sizeof(nulls));
5208 tgl 1689 ECB :
5208 tgl 1690 CBC 24054 : values[i++] = ObjectIdGetDatum(entry->key.userid);
1691 24054 : values[i++] = ObjectIdGetDatum(entry->key.dbid);
731 magnus 1692 24054 : if (api_version >= PGSS_V1_9)
731 magnus 1693 GIC 24044 : values[i++] = BoolGetDatum(entry->key.toplevel);
5208 tgl 1694 ECB :
2201 simon 1695 GIC 24054 : if (is_allowed_role || entry->key.userid == userid)
5208 tgl 1696 ECB : {
3359 tgl 1697 CBC 24054 : if (api_version >= PGSS_V1_2)
3409 magnus 1698 GIC 24054 : values[i++] = Int64GetDatumFast(queryid);
3409 fujii 1699 ECB :
3359 tgl 1700 GIC 24054 : if (showtext)
3359 tgl 1701 ECB : {
3359 tgl 1702 GIC 24054 : char *qstr = qtext_fetch(entry->query_offset,
1703 : entry->query_len,
1704 : qbuffer,
1705 : qbuffer_size);
3359 tgl 1706 ECB :
3359 tgl 1707 GIC 24054 : if (qstr)
1708 : {
1709 : char *enc;
3359 tgl 1710 ECB :
3332 tgl 1711 GIC 24054 : enc = pg_any_to_server(qstr,
1712 : entry->query_len,
1713 : entry->encoding);
3359 tgl 1714 ECB :
3359 tgl 1715 GIC 24054 : values[i++] = CStringGetTextDatum(enc);
3359 tgl 1716 ECB :
3359 tgl 1717 GBC 24054 : if (enc != qstr)
3359 tgl 1718 UIC 0 : pfree(enc);
1719 : }
1720 : else
1721 : {
3359 tgl 1722 EUB : /* Just return a null if we fail to find the text */
3359 tgl 1723 UIC 0 : nulls[i++] = true;
1724 : }
1725 : }
1726 : else
1727 : {
3359 tgl 1728 EUB : /* Query text not requested */
3359 tgl 1729 UIC 0 : nulls[i++] = true;
1730 : }
1731 : }
1732 : else
1733 : {
3359 tgl 1734 EUB : /* Don't show queryid */
3359 tgl 1735 UBC 0 : if (api_version >= PGSS_V1_2)
3409 fujii 1736 UIC 0 : nulls[i++] = true;
1737 :
1738 : /*
1739 : * Don't show query text, but hint as to the reason for not doing
1740 : * so if it was requested
3359 tgl 1741 EUB : */
3359 tgl 1742 UBC 0 : if (showtext)
3359 tgl 1743 UIC 0 : values[i++] = CStringGetTextDatum("<insufficient privilege>");
3359 tgl 1744 EUB : else
3359 tgl 1745 UIC 0 : nulls[i++] = true;
1746 : }
1747 :
1748 : /* copy counters to a local variable to keep locking time short */
5208 tgl 1749 ECB : {
5208 tgl 1750 GIC 24054 : volatile pgssEntry *e = (volatile pgssEntry *) entry;
5208 tgl 1751 ECB :
5208 tgl 1752 CBC 24054 : SpinLockAcquire(&e->mutex);
1753 24054 : tmp = e->counters;
5208 tgl 1754 GIC 24054 : SpinLockRelease(&e->mutex);
1755 : }
1756 :
4029 tgl 1757 ECB : /* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
1102 fujii 1758 CBC 24054 : if (IS_STICKY(tmp))
4029 tgl 1759 GIC 571 : continue;
1760 :
1102 fujii 1761 ECB : /* Note that we rely on PGSS_PLAN being 0 and PGSS_EXEC being 1. */
1102 fujii 1762 GIC 70449 : for (int kind = 0; kind < PGSS_NUMKIND; kind++)
2935 andrew 1763 ECB : {
1102 fujii 1764 GIC 46966 : if (kind == PGSS_EXEC || api_version >= PGSS_V1_8)
1102 fujii 1765 ECB : {
1102 fujii 1766 CBC 46957 : values[i++] = Int64GetDatumFast(tmp.calls[kind]);
1102 fujii 1767 GIC 46957 : values[i++] = Float8GetDatumFast(tmp.total_time[kind]);
1768 : }
2878 bruce 1769 ECB :
1102 fujii 1770 GIC 46966 : if ((kind == PGSS_EXEC && api_version >= PGSS_V1_3) ||
1771 : api_version >= PGSS_V1_8)
1102 fujii 1772 ECB : {
1102 fujii 1773 CBC 46957 : values[i++] = Float8GetDatumFast(tmp.min_time[kind]);
1774 46957 : values[i++] = Float8GetDatumFast(tmp.max_time[kind]);
1102 fujii 1775 GIC 46957 : values[i++] = Float8GetDatumFast(tmp.mean_time[kind]);
1776 :
1777 : /*
1778 : * Note we are calculating the population variance here, not
1779 : * the sample variance, as we have data for the whole
1780 : * population, so Bessel's correction is not used, and we
1781 : * don't divide by tmp.calls - 1.
1102 fujii 1782 ECB : */
1102 fujii 1783 CBC 46957 : if (tmp.calls[kind] > 1)
1102 fujii 1784 GIC 4161 : stddev = sqrt(tmp.sum_var_time[kind] / tmp.calls[kind]);
1102 fujii 1785 ECB : else
1102 fujii 1786 CBC 42796 : stddev = 0.0;
1102 fujii 1787 GIC 46957 : values[i++] = Float8GetDatumFast(stddev);
1788 : }
2935 andrew 1789 ECB : }
5208 tgl 1790 CBC 23483 : values[i++] = Int64GetDatumFast(tmp.rows);
4839 itagaki.takahiro 1791 23483 : values[i++] = Int64GetDatumFast(tmp.shared_blks_hit);
1792 23483 : values[i++] = Int64GetDatumFast(tmp.shared_blks_read);
3359 tgl 1793 23483 : if (api_version >= PGSS_V1_1)
4064 rhaas 1794 23483 : values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied);
4839 itagaki.takahiro 1795 23483 : values[i++] = Int64GetDatumFast(tmp.shared_blks_written);
1796 23483 : values[i++] = Int64GetDatumFast(tmp.local_blks_hit);
1797 23483 : values[i++] = Int64GetDatumFast(tmp.local_blks_read);
3359 tgl 1798 23483 : if (api_version >= PGSS_V1_1)
4064 rhaas 1799 23483 : values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied);
4839 itagaki.takahiro 1800 23483 : values[i++] = Int64GetDatumFast(tmp.local_blks_written);
1801 23483 : values[i++] = Int64GetDatumFast(tmp.temp_blks_read);
1802 23483 : values[i++] = Int64GetDatumFast(tmp.temp_blks_written);
3359 tgl 1803 GIC 23483 : if (api_version >= PGSS_V1_1)
4030 rhaas 1804 ECB : {
3997 tgl 1805 CBC 23483 : values[i++] = Float8GetDatumFast(tmp.blk_read_time);
3997 tgl 1806 GIC 23483 : values[i++] = Float8GetDatumFast(tmp.blk_write_time);
4030 rhaas 1807 ECB : }
366 michael 1808 GIC 23483 : if (api_version >= PGSS_V1_10)
366 michael 1809 ECB : {
366 michael 1810 CBC 23462 : values[i++] = Float8GetDatumFast(tmp.temp_blk_read_time);
366 michael 1811 GIC 23462 : values[i++] = Float8GetDatumFast(tmp.temp_blk_write_time);
366 michael 1812 ECB : }
1099 akapila 1813 GIC 23483 : if (api_version >= PGSS_V1_8)
1814 : {
1815 : char buf[256];
1816 : Datum wal_bytes;
1099 akapila 1817 ECB :
1099 akapila 1818 CBC 23474 : values[i++] = Int64GetDatumFast(tmp.wal_records);
1069 akapila 1819 GIC 23474 : values[i++] = Int64GetDatumFast(tmp.wal_fpi);
1099 akapila 1820 ECB :
1099 akapila 1821 GIC 23474 : snprintf(buf, sizeof buf, UINT64_FORMAT, tmp.wal_bytes);
1822 :
1099 akapila 1823 ECB : /* Convert to numeric. */
1099 akapila 1824 GIC 23474 : wal_bytes = DirectFunctionCall3(numeric_in,
1825 : CStringGetDatum(buf),
1826 : ObjectIdGetDatum(0),
1099 akapila 1827 ECB : Int32GetDatum(-1));
1099 akapila 1828 GIC 23474 : values[i++] = wal_bytes;
1099 akapila 1829 ECB : }
366 magnus 1830 GIC 23483 : if (api_version >= PGSS_V1_10)
366 magnus 1831 ECB : {
366 magnus 1832 CBC 23462 : values[i++] = Int64GetDatumFast(tmp.jit_functions);
1833 23462 : values[i++] = Float8GetDatumFast(tmp.jit_generation_time);
1834 23462 : values[i++] = Int64GetDatumFast(tmp.jit_inlining_count);
1835 23462 : values[i++] = Float8GetDatumFast(tmp.jit_inlining_time);
1836 23462 : values[i++] = Int64GetDatumFast(tmp.jit_optimization_count);
1837 23462 : values[i++] = Float8GetDatumFast(tmp.jit_optimization_time);
1838 23462 : values[i++] = Int64GetDatumFast(tmp.jit_emission_count);
366 magnus 1839 GIC 23462 : values[i++] = Float8GetDatumFast(tmp.jit_emission_time);
1840 : }
5208 tgl 1841 ECB :
3359 tgl 1842 GIC 23483 : Assert(i == (api_version == PGSS_V1_0 ? PG_STAT_STATEMENTS_COLS_V1_0 :
1843 : api_version == PGSS_V1_1 ? PG_STAT_STATEMENTS_COLS_V1_1 :
1844 : api_version == PGSS_V1_2 ? PG_STAT_STATEMENTS_COLS_V1_2 :
1845 : api_version == PGSS_V1_3 ? PG_STAT_STATEMENTS_COLS_V1_3 :
1846 : api_version == PGSS_V1_8 ? PG_STAT_STATEMENTS_COLS_V1_8 :
1847 : api_version == PGSS_V1_9 ? PG_STAT_STATEMENTS_COLS_V1_9 :
1848 : api_version == PGSS_V1_10 ? PG_STAT_STATEMENTS_COLS_V1_10 :
1849 : -1 /* fail if you forget to update this assert */ ));
5208 tgl 1850 ECB :
397 michael 1851 GIC 23483 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
1852 : }
5208 tgl 1853 ECB :
5208 tgl 1854 GIC 43 : LWLockRelease(pgss->lock);
5208 tgl 1855 ECB :
297 peter 1856 GNC 43 : free(qbuffer);
5208 tgl 1857 GIC 43 : }
1858 :
1859 : /* Number of output arguments (columns) for pg_stat_statements_info */
1860 : #define PG_STAT_STATEMENTS_INFO_COLS 2
1861 :
1862 : /*
1863 : * Return statistics of pg_stat_statements.
864 fujii 1864 ECB : */
1865 : Datum
864 fujii 1866 GIC 1 : pg_stat_statements_info(PG_FUNCTION_ARGS)
1867 : {
864 fujii 1868 ECB : pgssGlobalStats stats;
842 1869 : TupleDesc tupdesc;
267 peter 1870 GNC 1 : Datum values[PG_STAT_STATEMENTS_INFO_COLS] = {0};
1871 1 : bool nulls[PG_STAT_STATEMENTS_INFO_COLS] = {0};
842 fujii 1872 EUB :
801 michael 1873 GIC 1 : if (!pgss || !pgss_hash)
801 michael 1874 UIC 0 : ereport(ERROR,
1875 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1876 : errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
801 michael 1877 ECB :
842 fujii 1878 EUB : /* Build a tuple descriptor for our result type */
842 fujii 1879 GIC 1 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
842 fujii 1880 UIC 0 : elog(ERROR, "return type must be a row type");
1881 :
864 fujii 1882 ECB : /* Read global statistics for pg_stat_statements */
1883 : {
864 fujii 1884 GIC 1 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
1885 :
864 fujii 1886 CBC 1 : SpinLockAcquire(&s->mutex);
1887 1 : stats = s->stats;
864 fujii 1888 GIC 1 : SpinLockRelease(&s->mutex);
864 fujii 1889 ECB : }
1890 :
842 fujii 1891 GIC 1 : values[0] = Int64GetDatum(stats.dealloc);
1892 1 : values[1] = TimestampTzGetDatum(stats.stats_reset);
1893 :
1894 1 : PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
1895 : }
864 fujii 1896 ECB :
1897 : /*
1898 : * Estimate shared memory space needed.
1899 : */
5208 tgl 1900 : static Size
5208 tgl 1901 CBC 3 : pgss_memsize(void)
1902 : {
5050 bruce 1903 ECB : Size size;
1904 :
5208 tgl 1905 GIC 3 : size = MAXALIGN(sizeof(pgssSharedState));
3359 1906 3 : size = add_size(size, hash_estimate_size(pgss_max, sizeof(pgssEntry)));
1907 :
5208 1908 3 : return size;
1909 : }
1910 :
1911 : /*
1912 : * Allocate a new hashtable entry.
1913 : * caller must hold an exclusive lock on pgss->lock
1914 : *
1915 : * "query" need not be null-terminated; we rely on query_len instead
1916 : *
1917 : * If "sticky" is true, make the new entry artificially sticky so that it will
1918 : * probably still be there when the query finishes execution. We do this by
1919 : * giving it a median usage value rather than the normal value. (Strictly
1920 : * speaking, query strings are normalized on a best effort basis, though it
1921 : * would be difficult to demonstrate this even under artificial conditions.)
1922 : *
1923 : * Note: despite needing exclusive lock, it's not an error for the target
3260 bruce 1924 ECB : * entry to already exist. This is because pgss_store releases and
1925 : * reacquires lock after failing to find a match; so someone else could
1926 : * have made the entry while we waited to get exclusive lock.
1927 : */
1928 : static pgssEntry *
3359 tgl 1929 GIC 24795 : entry_alloc(pgssHashKey *key, Size query_offset, int query_len, int encoding,
1930 : bool sticky)
5208 tgl 1931 ECB : {
5208 tgl 1932 EUB : pgssEntry *entry;
1933 : bool found;
1934 :
5208 tgl 1935 ECB : /* Make space if needed */
5208 tgl 1936 GIC 24795 : while (hash_get_num_entries(pgss_hash) >= pgss_max)
5208 tgl 1937 LBC 0 : entry_dealloc();
1938 :
1939 : /* Find or create an entry with desired hash code */
5208 tgl 1940 GIC 24795 : entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER, &found);
1941 :
5208 tgl 1942 CBC 24795 : if (!found)
1943 : {
5208 tgl 1944 ECB : /* New entry, initialize it */
1945 :
1946 : /* reset the statistics */
5208 tgl 1947 GIC 24794 : memset(&entry->counters, 0, sizeof(Counters));
4017 tgl 1948 ECB : /* set the appropriate initial usage count */
4017 tgl 1949 CBC 24794 : entry->counters.usage = sticky ? pgss->cur_median_usage : USAGE_INIT;
5208 tgl 1950 ECB : /* re-initialize the mutex each time ... we assume no one using it */
5208 tgl 1951 CBC 24794 : SpinLockInit(&entry->mutex);
1952 : /* ... and don't forget the query text metadata */
3359 tgl 1953 GIC 24794 : Assert(query_len >= 0);
3359 tgl 1954 CBC 24794 : entry->query_offset = query_offset;
4029 tgl 1955 GIC 24794 : entry->query_len = query_len;
3359 1956 24794 : entry->encoding = encoding;
1957 : }
1958 :
5208 1959 24795 : return entry;
1960 : }
5208 tgl 1961 EUB :
1962 : /*
1963 : * qsort comparator for sorting into increasing usage order
1964 : */
1965 : static int
5208 tgl 1966 UBC 0 : entry_cmp(const void *lhs, const void *rhs)
5208 tgl 1967 EUB : {
4029 tgl 1968 UBC 0 : double l_usage = (*(pgssEntry *const *) lhs)->counters.usage;
1969 0 : double r_usage = (*(pgssEntry *const *) rhs)->counters.usage;
1970 :
5208 1971 0 : if (l_usage < r_usage)
5208 tgl 1972 UIC 0 : return -1;
1973 0 : else if (l_usage > r_usage)
1974 0 : return +1;
1975 : else
1976 0 : return 0;
1977 : }
1978 :
1979 : /*
2744 tgl 1980 EUB : * Deallocate least-used entries.
1981 : *
1982 : * Caller must hold an exclusive lock on pgss->lock.
1983 : */
1984 : static void
5208 tgl 1985 UIC 0 : entry_dealloc(void)
1986 : {
1987 : HASH_SEQ_STATUS hash_seq;
1988 : pgssEntry **entries;
1989 : pgssEntry *entry;
1990 : int nvictims;
1991 : int i;
1992 : Size tottextlen;
1993 : int nvalidtexts;
1994 :
1995 : /*
1996 : * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them.
1997 : * While we're scanning the table, apply the decay factor to the usage
1998 : * values, and update the mean query length.
1999 : *
2000 : * Note that the mean query length is almost immediately obsolete, since
2001 : * we compute it before not after discarding the least-used entries.
2744 tgl 2002 EUB : * Hopefully, that doesn't affect the mean too much; it doesn't seem worth
2003 : * making two passes to get a more current result. Likewise, the new
2004 : * cur_median_usage includes the entries we're about to zap.
4018 2005 : */
5208 2006 :
5208 tgl 2007 UIC 0 : entries = palloc(hash_get_num_entries(pgss_hash) * sizeof(pgssEntry *));
5208 tgl 2008 EUB :
5208 tgl 2009 UBC 0 : i = 0;
2744 tgl 2010 UIC 0 : tottextlen = 0;
2744 tgl 2011 UBC 0 : nvalidtexts = 0;
2012 :
5208 2013 0 : hash_seq_init(&hash_seq, pgss_hash);
2014 0 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
2015 : {
2016 0 : entries[i++] = entry;
2017 : /* "Sticky" entries get a different usage decay rate. */
1102 fujii 2018 0 : if (IS_STICKY(entry->counters))
4018 tgl 2019 UIC 0 : entry->counters.usage *= STICKY_DECREASE_FACTOR;
4018 tgl 2020 EUB : else
4018 tgl 2021 UBC 0 : entry->counters.usage *= USAGE_DECREASE_FACTOR;
2022 : /* In the mean length computation, ignore dropped texts. */
2744 tgl 2023 UIC 0 : if (entry->query_len >= 0)
2024 : {
2025 0 : tottextlen += entry->query_len + 1;
2744 tgl 2026 UBC 0 : nvalidtexts++;
2027 : }
2028 : }
5208 tgl 2029 EUB :
2744 2030 : /* Sort into increasing order by usage */
5208 tgl 2031 UIC 0 : qsort(entries, i, sizeof(pgssEntry *), entry_cmp);
4018 tgl 2032 EUB :
2744 2033 : /* Record the (approximate) median usage */
4018 tgl 2034 UIC 0 : if (i > 0)
4018 tgl 2035 UBC 0 : pgss->cur_median_usage = entries[i / 2]->counters.usage;
2036 : /* Record the mean query length */
2744 tgl 2037 UIC 0 : if (nvalidtexts > 0)
2744 tgl 2038 UBC 0 : pgss->mean_query_len = tottextlen / nvalidtexts;
2744 tgl 2039 EUB : else
2744 tgl 2040 UIC 0 : pgss->mean_query_len = ASSUMED_LENGTH_INIT;
4018 tgl 2041 EUB :
2042 : /* Now zap an appropriate fraction of lowest-usage entries */
5208 tgl 2043 UBC 0 : nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100);
5208 tgl 2044 UIC 0 : nvictims = Min(nvictims, i);
2045 :
5208 tgl 2046 UBC 0 : for (i = 0; i < nvictims; i++)
2047 : {
5208 tgl 2048 UIC 0 : hash_search(pgss_hash, &entries[i]->key, HASH_REMOVE, NULL);
2049 : }
5208 tgl 2050 EUB :
5208 tgl 2051 UIC 0 : pfree(entries);
864 fujii 2052 EUB :
2053 : /* Increment the number of times entries are deallocated */
2054 : {
864 fujii 2055 UIC 0 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
864 fujii 2056 EUB :
864 fujii 2057 UIC 0 : SpinLockAcquire(&s->mutex);
2058 0 : s->stats.dealloc += 1;
2059 0 : SpinLockRelease(&s->mutex);
2060 : }
5208 tgl 2061 0 : }
2062 :
2063 : /*
2064 : * Given a query string (not necessarily null-terminated), allocate a new
2065 : * entry in the external query text file and store the string there.
2066 : *
2067 : * If successful, returns true, and stores the new entry's offset in the file
2068 : * into *query_offset. Also, if gc_count isn't NULL, *gc_count is set to the
2069 : * number of garbage collections that have occurred so far.
2070 : *
2071 : * On failure, returns false.
2072 : *
2073 : * At least a shared lock on pgss->lock must be held by the caller, so as
2074 : * to prevent a concurrent garbage collection. Share-lock-holding callers
3359 tgl 2075 ECB : * should pass a gc_count pointer to obtain the number of garbage collections,
2076 : * so that they can recheck the count after obtaining exclusive lock to
2077 : * detect whether a garbage collection occurred (and removed this entry).
2078 : */
2079 : static bool
3359 tgl 2080 GIC 24795 : qtext_store(const char *query, int query_len,
2081 : Size *query_offset, int *gc_count)
2082 : {
2083 : Size off;
2084 : int fd;
2085 :
3359 tgl 2086 ECB : /*
2087 : * We use a spinlock to protect extent/n_writers/gc_count, so that
2088 : * multiple processes may execute this function concurrently.
2089 : */
2090 : {
3359 tgl 2091 CBC 24795 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
3359 tgl 2092 ECB :
3359 tgl 2093 CBC 24795 : SpinLockAcquire(&s->mutex);
2094 24795 : off = s->extent;
3359 tgl 2095 GIC 24795 : s->extent += query_len + 1;
2096 24795 : s->n_writers++;
3359 tgl 2097 CBC 24795 : if (gc_count)
3359 tgl 2098 GIC 24795 : *gc_count = s->gc_count;
2099 24795 : SpinLockRelease(&s->mutex);
2100 : }
2101 :
2102 24795 : *query_offset = off;
2103 :
250 tgl 2104 ECB : /*
2105 : * Don't allow the file to grow larger than what qtext_load_file can
250 tgl 2106 EUB : * (theoretically) handle. This has been seen to be reachable on 32-bit
2107 : * platforms.
2108 : */
250 tgl 2109 GIC 24795 : if (unlikely(query_len >= MaxAllocHugeSize - off))
2110 : {
250 tgl 2111 UIC 0 : errno = EFBIG; /* not quite right, but it'll do */
250 tgl 2112 LBC 0 : fd = -1;
2113 0 : goto error;
250 tgl 2114 EUB : }
2115 :
3359 tgl 2116 ECB : /* Now write the data into the successfully-reserved part of the file */
2024 peter_e 2117 GBC 24795 : fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY);
3359 tgl 2118 CBC 24795 : if (fd < 0)
3359 tgl 2119 UBC 0 : goto error;
2120 :
192 tmunro 2121 CBC 24795 : if (pg_pwrite(fd, query, query_len, off) != query_len)
3359 tgl 2122 UIC 0 : goto error;
192 tmunro 2123 GIC 24795 : if (pg_pwrite(fd, "\0", 1, off + query_len) != 1)
3359 tgl 2124 UIC 0 : goto error;
3359 tgl 2125 ECB :
3359 tgl 2126 GIC 24795 : CloseTransientFile(fd);
3359 tgl 2127 ECB :
2128 : /* Mark our write complete */
2129 : {
3359 tgl 2130 GIC 24795 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2131 :
3359 tgl 2132 CBC 24795 : SpinLockAcquire(&s->mutex);
3359 tgl 2133 GIC 24795 : s->n_writers--;
3359 tgl 2134 GBC 24795 : SpinLockRelease(&s->mutex);
3359 tgl 2135 EUB : }
2136 :
3359 tgl 2137 GIC 24795 : return true;
2138 :
3359 tgl 2139 UIC 0 : error:
3359 tgl 2140 UBC 0 : ereport(LOG,
3359 tgl 2141 EUB : (errcode_for_file_access(),
2142 : errmsg("could not write file \"%s\": %m",
2143 : PGSS_TEXT_FILE)));
2144 :
3359 tgl 2145 UBC 0 : if (fd >= 0)
3359 tgl 2146 UIC 0 : CloseTransientFile(fd);
3359 tgl 2147 EUB :
2148 : /* Mark our write complete */
2149 : {
3359 tgl 2150 UIC 0 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2151 :
3359 tgl 2152 UBC 0 : SpinLockAcquire(&s->mutex);
3359 tgl 2153 UIC 0 : s->n_writers--;
2154 0 : SpinLockRelease(&s->mutex);
2155 : }
2156 :
2157 0 : return false;
2158 : }
2159 :
2160 : /*
2161 : * Read the external query text file into a malloc'd buffer.
2162 : *
2163 : * Returns NULL (without throwing an error) if unable to read, eg
2164 : * file not there or insufficient memory.
2165 : *
2166 : * On success, the buffer size is also returned into *buffer_size.
3359 tgl 2167 ECB : *
2168 : * This can be called without any lock on pgss->lock, but in that case
2169 : * the caller is responsible for verifying that the result is sane.
2170 : */
2171 : static char *
3359 tgl 2172 GIC 46 : qtext_load_file(Size *buffer_size)
2173 : {
3359 tgl 2174 ECB : char *buf;
2175 : int fd;
2176 : struct stat stat;
525 tgl 2177 EUB : Size nread;
3359 2178 :
2024 peter_e 2179 GIC 46 : fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDONLY | PG_BINARY);
3359 tgl 2180 46 : if (fd < 0)
2181 : {
3359 tgl 2182 UBC 0 : if (errno != ENOENT)
3359 tgl 2183 UIC 0 : ereport(LOG,
2184 : (errcode_for_file_access(),
2185 : errmsg("could not read file \"%s\": %m",
2118 tgl 2186 ECB : PGSS_TEXT_FILE)));
3359 tgl 2187 UIC 0 : return NULL;
3359 tgl 2188 EUB : }
2189 :
2190 : /* Get file length */
3359 tgl 2191 GIC 46 : if (fstat(fd, &stat))
3359 tgl 2192 EUB : {
3359 tgl 2193 UBC 0 : ereport(LOG,
2194 : (errcode_for_file_access(),
2195 : errmsg("could not stat file \"%s\": %m",
2196 : PGSS_TEXT_FILE)));
3359 tgl 2197 LBC 0 : CloseTransientFile(fd);
2198 0 : return NULL;
2199 : }
3359 tgl 2200 EUB :
3359 tgl 2201 ECB : /* Allocate buffer; beware that off_t might be wider than size_t */
2744 tgl 2202 GIC 46 : if (stat.st_size <= MaxAllocHugeSize)
3359 tgl 2203 GBC 46 : buf = (char *) malloc(stat.st_size);
2204 : else
3359 tgl 2205 UIC 0 : buf = NULL;
3359 tgl 2206 GIC 46 : if (buf == NULL)
2207 : {
3359 tgl 2208 UBC 0 : ereport(LOG,
3359 tgl 2209 EUB : (errcode(ERRCODE_OUT_OF_MEMORY),
2210 : errmsg("out of memory"),
2211 : errdetail("Could not allocate enough memory to read file \"%s\".",
2212 : PGSS_TEXT_FILE)));
3359 tgl 2213 UIC 0 : CloseTransientFile(fd);
2214 0 : return NULL;
2215 : }
2216 :
3359 tgl 2217 ECB : /*
525 2218 : * OK, slurp in the file. Windows fails if we try to read more than
2219 : * INT_MAX bytes at once, and other platforms might not like that either,
2220 : * so read a very large file in 1GB segments.
2221 : */
525 tgl 2222 GIC 46 : nread = 0;
2223 91 : while (nread < stat.st_size)
2224 : {
2225 45 : int toread = Min(1024 * 1024 * 1024, stat.st_size - nread);
2226 :
2227 : /*
2228 : * If we get a short read and errno doesn't get set, the reason is
525 tgl 2229 ECB : * probably that garbage collection truncated the file since we did
2230 : * the fstat(), so we don't log a complaint --- but we don't return
2231 : * the data, either, since it's most likely corrupt due to concurrent
525 tgl 2232 EUB : * writes from garbage collection.
2233 : */
525 tgl 2234 GIC 45 : errno = 0;
2235 45 : if (read(fd, buf + nread, toread) != toread)
2236 : {
525 tgl 2237 UBC 0 : if (errno)
2238 0 : ereport(LOG,
525 tgl 2239 EUB : (errcode_for_file_access(),
2240 : errmsg("could not read file \"%s\": %m",
525 tgl 2241 ECB : PGSS_TEXT_FILE)));
525 tgl 2242 UIC 0 : free(buf);
2243 0 : CloseTransientFile(fd);
525 tgl 2244 LBC 0 : return NULL;
525 tgl 2245 EUB : }
525 tgl 2246 GIC 45 : nread += toread;
2247 : }
2248 :
1373 peter 2249 CBC 46 : if (CloseTransientFile(fd) != 0)
1492 michael 2250 LBC 0 : ereport(LOG,
2251 : (errcode_for_file_access(),
2252 : errmsg("could not close file \"%s\": %m", PGSS_TEXT_FILE)));
2253 :
525 tgl 2254 GIC 46 : *buffer_size = nread;
3359 2255 46 : return buf;
2256 : }
2257 :
2258 : /*
2259 : * Locate a query text in the file image previously read by qtext_load_file().
3359 tgl 2260 ECB : *
2261 : * We validate the given offset/length, and return NULL if bogus. Otherwise,
2262 : * the result points to a null-terminated string within the buffer.
2263 : */
2264 : static char *
3359 tgl 2265 GBC 48544 : qtext_fetch(Size query_offset, int query_len,
2266 : char *buffer, Size buffer_size)
3359 tgl 2267 ECB : {
2268 : /* File read failed? */
3359 tgl 2269 GBC 48544 : if (buffer == NULL)
3359 tgl 2270 UIC 0 : return NULL;
3359 tgl 2271 ECB : /* Bogus offset/length? */
3359 tgl 2272 GBC 48544 : if (query_len < 0 ||
3359 tgl 2273 GIC 48544 : query_offset + query_len >= buffer_size)
3359 tgl 2274 LBC 0 : return NULL;
2275 : /* As a further sanity check, make sure there's a trailing null */
3359 tgl 2276 GIC 48544 : if (buffer[query_offset + query_len] != '\0')
3359 tgl 2277 UIC 0 : return NULL;
2278 : /* Looks OK */
3359 tgl 2279 GIC 48544 : return buffer + query_offset;
2280 : }
2281 :
2282 : /*
3359 tgl 2283 ECB : * Do we need to garbage-collect the external query text file?
2284 : *
2285 : * Caller should hold at least a shared lock on pgss->lock.
2286 : */
2287 : static bool
3359 tgl 2288 GIC 24795 : need_gc_qtexts(void)
3359 tgl 2289 ECB : {
2290 : Size extent;
2291 :
2292 : /* Read shared extent pointer */
2293 : {
3359 tgl 2294 GIC 24795 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
2295 :
2296 24795 : SpinLockAcquire(&s->mutex);
2297 24795 : extent = s->extent;
2298 24795 : SpinLockRelease(&s->mutex);
2299 : }
2300 :
2301 : /*
2302 : * Don't proceed if file does not exceed 512 bytes per possible entry.
250 tgl 2303 ECB : *
2304 : * Here and in the next test, 32-bit machines have overflow hazards if
2305 : * pgss_max and/or mean_query_len are large. Force the multiplications
2306 : * and comparisons to be done in uint64 arithmetic to forestall trouble.
2307 : */
250 tgl 2308 GIC 24795 : if ((uint64) extent < (uint64) 512 * pgss_max)
3359 2309 24795 : return false;
2310 :
2311 : /*
2312 : * Don't proceed if file is less than about 50% bloat. Nothing can or
3359 tgl 2313 EUB : * should be done in the event of unusually large query texts accounting
2314 : * for file's large size. We go to the trouble of maintaining the mean
2315 : * query length in order to prevent garbage collection from thrashing
2316 : * uselessly.
2317 : */
250 tgl 2318 UIC 0 : if ((uint64) extent < (uint64) pgss->mean_query_len * pgss_max * 2)
3359 2319 0 : return false;
2320 :
2321 0 : return true;
2322 : }
2323 :
2324 : /*
2325 : * Garbage-collect orphaned query texts in external file.
2326 : *
2327 : * This won't be called often in the typical case, since it's likely that
2328 : * there won't be too much churn, and besides, a similar compaction process
2329 : * occurs when serializing to disk at shutdown or as part of resetting.
2330 : * Despite this, it seems prudent to plan for the edge case where the file
2331 : * becomes unreasonably large, with no other method of compaction likely to
2332 : * occur in the foreseeable future.
2333 : *
2334 : * The caller must hold an exclusive lock on pgss->lock.
2335 : *
2744 tgl 2336 EUB : * At the first sign of trouble we unlink the query text file to get a clean
2337 : * slate (although existing statistics are retained), rather than risk
2338 : * thrashing by allowing the same problem case to recur indefinitely.
2339 : */
3359 2340 : static void
3359 tgl 2341 UIC 0 : gc_qtexts(void)
2342 : {
2343 : char *qbuffer;
2344 : Size qbuffer_size;
2744 2345 0 : FILE *qfile = NULL;
2346 : HASH_SEQ_STATUS hash_seq;
2347 : pgssEntry *entry;
2348 : Size extent;
2349 : int nentries;
2350 :
3359 tgl 2351 EUB : /*
2352 : * When called from pgss_store, some other session might have proceeded
2353 : * with garbage collection in the no-lock-held interim of lock strength
2354 : * escalation. Check once more that this is actually necessary.
2355 : */
3359 tgl 2356 UIC 0 : if (!need_gc_qtexts())
2357 0 : return;
2358 :
2359 : /*
2360 : * Load the old texts file. If we fail (out of memory, for instance),
2744 tgl 2361 EUB : * invalidate query texts. Hopefully this is rare. It might seem better
2362 : * to leave things alone on an OOM failure, but the problem is that the
2363 : * file is only going to get bigger; hoping for a future non-OOM result is
2364 : * risky and can easily lead to complete denial of service.
2365 : */
3359 tgl 2366 UIC 0 : qbuffer = qtext_load_file(&qbuffer_size);
2367 0 : if (qbuffer == NULL)
2744 2368 0 : goto gc_fail;
2369 :
2370 : /*
3359 tgl 2371 EUB : * We overwrite the query texts file in place, so as to reduce the risk of
2372 : * an out-of-disk-space failure. Since the file is guaranteed not to get
2373 : * larger, this should always work on traditional filesystems; though we
2374 : * could still lose on copy-on-write filesystems.
2375 : */
3359 tgl 2376 UIC 0 : qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
2377 0 : if (qfile == NULL)
3359 tgl 2378 EUB : {
3359 tgl 2379 UIC 0 : ereport(LOG,
2380 : (errcode_for_file_access(),
1721 michael 2381 EUB : errmsg("could not write file \"%s\": %m",
3359 tgl 2382 : PGSS_TEXT_FILE)));
3359 tgl 2383 UIC 0 : goto gc_fail;
3359 tgl 2384 EUB : }
2385 :
3359 tgl 2386 UIC 0 : extent = 0;
3359 tgl 2387 UBC 0 : nentries = 0;
3359 tgl 2388 EUB :
3359 tgl 2389 UIC 0 : hash_seq_init(&hash_seq, pgss_hash);
2390 0 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
2391 : {
2392 0 : int query_len = entry->query_len;
3359 tgl 2393 UBC 0 : char *qry = qtext_fetch(entry->query_offset,
2394 : query_len,
2395 : qbuffer,
3359 tgl 2396 EUB : qbuffer_size);
2397 :
3359 tgl 2398 UIC 0 : if (qry == NULL)
3359 tgl 2399 EUB : {
2400 : /* Trouble ... drop the text */
3359 tgl 2401 UIC 0 : entry->query_offset = 0;
3359 tgl 2402 UBC 0 : entry->query_len = -1;
2403 : /* entry will not be counted in mean query length computation */
2404 0 : continue;
2405 : }
2406 :
3359 tgl 2407 UIC 0 : if (fwrite(qry, 1, query_len + 1, qfile) != query_len + 1)
3359 tgl 2408 EUB : {
3359 tgl 2409 UBC 0 : ereport(LOG,
2410 : (errcode_for_file_access(),
2411 : errmsg("could not write file \"%s\": %m",
2118 tgl 2412 EUB : PGSS_TEXT_FILE)));
3359 tgl 2413 UBC 0 : hash_seq_term(&hash_seq);
2414 0 : goto gc_fail;
2415 : }
2416 :
3359 tgl 2417 UIC 0 : entry->query_offset = extent;
2418 0 : extent += query_len + 1;
2419 0 : nentries++;
2420 : }
3359 tgl 2421 EUB :
2422 : /*
2423 : * Truncate away any now-unused space. If this fails for some odd reason,
2424 : * we log it, but there's no need to fail.
2425 : */
3359 tgl 2426 UIC 0 : if (ftruncate(fileno(qfile), extent) != 0)
3359 tgl 2427 UBC 0 : ereport(LOG,
2428 : (errcode_for_file_access(),
1721 michael 2429 EUB : errmsg("could not truncate file \"%s\": %m",
2430 : PGSS_TEXT_FILE)));
2431 :
3359 tgl 2432 UIC 0 : if (FreeFile(qfile))
3359 tgl 2433 EUB : {
3359 tgl 2434 UBC 0 : ereport(LOG,
2435 : (errcode_for_file_access(),
2436 : errmsg("could not write file \"%s\": %m",
3359 tgl 2437 EUB : PGSS_TEXT_FILE)));
3359 tgl 2438 UIC 0 : qfile = NULL;
2439 0 : goto gc_fail;
2440 : }
3359 tgl 2441 EUB :
3359 tgl 2442 UIC 0 : elog(DEBUG1, "pgss gc of queries file shrunk size from %zu to %zu",
2443 : pgss->extent, extent);
2444 :
2445 : /* Reset the shared extent pointer */
2446 0 : pgss->extent = extent;
3359 tgl 2447 EUB :
2448 : /*
2449 : * Also update the mean query length, to be sure that need_gc_qtexts()
2450 : * won't still think we have a problem.
2451 : */
3359 tgl 2452 UBC 0 : if (nentries > 0)
3359 tgl 2453 UIC 0 : pgss->mean_query_len = extent / nentries;
2454 : else
2455 0 : pgss->mean_query_len = ASSUMED_LENGTH_INIT;
2456 :
2457 0 : free(qbuffer);
2458 :
2459 : /*
2460 : * OK, count a garbage collection cycle. (Note: even though we have
3359 tgl 2461 EUB : * exclusive lock on pgss->lock, we must take pgss->mutex for this, since
2462 : * other processes may examine gc_count while holding only the mutex.
2463 : * Also, we have to advance the count *after* we've rewritten the file,
2464 : * else other processes might not realize they read a stale file.)
2465 : */
3359 tgl 2466 UIC 0 : record_gc_qtexts();
3359 tgl 2467 EUB :
3359 tgl 2468 UBC 0 : return;
3359 tgl 2469 EUB :
3359 tgl 2470 UIC 0 : gc_fail:
2471 : /* clean up resources */
2472 0 : if (qfile)
2473 0 : FreeFile(qfile);
297 peter 2474 UNC 0 : free(qbuffer);
3359 tgl 2475 EUB :
2476 : /*
2477 : * Since the contents of the external file are now uncertain, mark all
2478 : * hashtable entries as having invalid texts.
2479 : */
3359 tgl 2480 UIC 0 : hash_seq_init(&hash_seq, pgss_hash);
2481 0 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
2482 : {
2483 0 : entry->query_offset = 0;
3359 tgl 2484 UBC 0 : entry->query_len = -1;
3359 tgl 2485 EUB : }
2486 :
2744 2487 : /*
2488 : * Destroy the query text file and create a new, empty one
2489 : */
2744 tgl 2490 UIC 0 : (void) unlink(PGSS_TEXT_FILE);
2491 0 : qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
2744 tgl 2492 UBC 0 : if (qfile == NULL)
2744 tgl 2493 UIC 0 : ereport(LOG,
2494 : (errcode_for_file_access(),
1721 michael 2495 EUB : errmsg("could not recreate file \"%s\": %m",
2496 : PGSS_TEXT_FILE)));
2497 : else
2744 tgl 2498 UBC 0 : FreeFile(qfile);
2499 :
2500 : /* Reset the shared extent pointer */
2744 tgl 2501 UIC 0 : pgss->extent = 0;
2502 :
2503 : /* Reset mean_query_len to match the new state */
2504 0 : pgss->mean_query_len = ASSUMED_LENGTH_INIT;
2505 :
2506 : /*
2507 : * Bump the GC count even though we failed.
2508 : *
2509 : * This is needed to make concurrent readers of file without any lock on
2510 : * pgss->lock notice existence of new version of file. Once readers
2744 tgl 2511 EUB : * subsequently observe a change in GC count with pgss->lock held, that
2512 : * forces a safe reopen of file. Writers also require that we bump here,
2513 : * of course. (As required by locking protocol, readers and writers don't
2514 : * trust earlier file contents until gc_count is found unchanged after
2515 : * pgss->lock acquired in shared or exclusive mode respectively.)
2516 : */
3359 tgl 2517 UIC 0 : record_gc_qtexts();
3359 tgl 2518 ECB : }
2519 :
2520 : /*
2521 : * Release entries corresponding to parameters passed.
2522 : */
2523 : static void
1549 akapila 2524 CBC 41 : entry_reset(Oid userid, Oid dbid, uint64 queryid)
2525 : {
2526 : HASH_SEQ_STATUS hash_seq;
5050 bruce 2527 ECB : pgssEntry *entry;
3359 tgl 2528 EUB : FILE *qfile;
2529 : long num_entries;
1549 akapila 2530 GIC 41 : long num_remove = 0;
2531 : pgssHashKey key;
1549 akapila 2532 ECB :
1549 akapila 2533 CBC 41 : if (!pgss || !pgss_hash)
1549 akapila 2534 UIC 0 : ereport(ERROR,
1549 akapila 2535 ECB : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2536 : errmsg("pg_stat_statements must be loaded via shared_preload_libraries")));
2537 :
5208 tgl 2538 CBC 41 : LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
1549 akapila 2539 41 : num_entries = hash_get_num_entries(pgss_hash);
5208 tgl 2540 ECB :
1549 akapila 2541 CBC 41 : if (userid != 0 && dbid != 0 && queryid != UINT64CONST(0))
2542 : {
2543 : /* If all the parameters are available, use the fast path. */
731 magnus 2544 1 : memset(&key, 0, sizeof(pgssHashKey));
1549 akapila 2545 1 : key.userid = userid;
2546 1 : key.dbid = dbid;
1549 akapila 2547 GBC 1 : key.queryid = queryid;
2548 :
2549 : /* Remove the key if it exists, starting with the top-level entry */
731 magnus 2550 CBC 1 : key.toplevel = false;
731 magnus 2551 GIC 1 : entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_REMOVE, NULL);
2552 1 : if (entry) /* found */
731 magnus 2553 LBC 0 : num_remove++;
731 magnus 2554 ECB :
2555 : /* Also remove entries for top level statements */
731 magnus 2556 GIC 1 : key.toplevel = true;
731 magnus 2557 ECB :
2558 : /* Remove the key if exists */
1549 akapila 2559 GIC 1 : entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_REMOVE, NULL);
1549 akapila 2560 CBC 1 : if (entry) /* found */
2561 1 : num_remove++;
2562 : }
2563 40 : else if (userid != 0 || dbid != 0 || queryid != UINT64CONST(0))
5208 tgl 2564 ECB : {
1549 akapila 2565 : /* Remove entries corresponding to valid parameters. */
1549 akapila 2566 GIC 3 : hash_seq_init(&hash_seq, pgss_hash);
1549 akapila 2567 CBC 39 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
1549 akapila 2568 ECB : {
1549 akapila 2569 GIC 36 : if ((!userid || entry->key.userid == userid) &&
2570 26 : (!dbid || entry->key.dbid == dbid) &&
2571 24 : (!queryid || entry->key.queryid == queryid))
2572 : {
2573 4 : hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
2574 4 : num_remove++;
1549 akapila 2575 ECB : }
2576 : }
2577 : }
2578 : else
2579 : {
2580 : /* Remove all entries. */
1549 akapila 2581 GIC 37 : hash_seq_init(&hash_seq, pgss_hash);
2582 336 : while ((entry = hash_seq_search(&hash_seq)) != NULL)
2583 : {
1549 akapila 2584 CBC 299 : hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
2585 299 : num_remove++;
2586 : }
2587 : }
2588 :
2589 : /* All entries are removed? */
1549 akapila 2590 GIC 41 : if (num_entries != num_remove)
2591 4 : goto release_lock;
1549 akapila 2592 ECB :
842 fujii 2593 : /*
2594 : * Reset global statistics for pg_stat_statements since all entries are
2595 : * removed.
2596 : */
2597 : {
842 fujii 2598 CBC 37 : volatile pgssSharedState *s = (volatile pgssSharedState *) pgss;
842 fujii 2599 GIC 37 : TimestampTz stats_reset = GetCurrentTimestamp();
2600 :
2601 37 : SpinLockAcquire(&s->mutex);
2602 37 : s->stats.dealloc = 0;
2603 37 : s->stats.stats_reset = stats_reset;
2604 37 : SpinLockRelease(&s->mutex);
842 fujii 2605 ECB : }
2606 :
2607 : /*
3359 tgl 2608 EUB : * Write new empty query file, perhaps even creating a new one to recover
2609 : * if the file was missing.
2610 : */
3359 tgl 2611 GIC 37 : qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W);
3359 tgl 2612 GBC 37 : if (qfile == NULL)
2613 : {
3359 tgl 2614 UIC 0 : ereport(LOG,
2615 : (errcode_for_file_access(),
1721 michael 2616 ECB : errmsg("could not create file \"%s\": %m",
3359 tgl 2617 EUB : PGSS_TEXT_FILE)));
3359 tgl 2618 UIC 0 : goto done;
2619 : }
2620 :
2621 : /* If ftruncate fails, log it, but it's not a fatal problem */
3359 tgl 2622 CBC 37 : if (ftruncate(fileno(qfile), 0) != 0)
3359 tgl 2623 UIC 0 : ereport(LOG,
3359 tgl 2624 ECB : (errcode_for_file_access(),
1721 michael 2625 : errmsg("could not truncate file \"%s\": %m",
2626 : PGSS_TEXT_FILE)));
3359 tgl 2627 :
3359 tgl 2628 GIC 37 : FreeFile(qfile);
3359 tgl 2629 ECB :
3359 tgl 2630 CBC 37 : done:
2631 37 : pgss->extent = 0;
2632 : /* This counts as a query text garbage collection for our purposes */
3359 tgl 2633 GIC 37 : record_gc_qtexts();
2634 :
1549 akapila 2635 41 : release_lock:
5208 tgl 2636 41 : LWLockRelease(pgss->lock);
2637 41 : }
2638 :
2639 : /*
2640 : * Generate a normalized version of the query string that will be used to
2641 : * represent all similar queries.
2642 : *
2643 : * Note that the normalized representation may well vary depending on
2644 : * just which "equivalent" query is used to create the hashtable entry.
2645 : * We assume this is OK.
2646 : *
2647 : * If query_loc > 0, then "query" has been advanced by that much compared to
2648 : * the original string start, so we need to translate the provided locations
2649 : * to compensate. (This lets us avoid re-scanning statements before the one
2650 : * of interest, so it's worth doing.)
2651 : *
2652 : * *query_len_p contains the input string length, and is updated with
2204 tgl 2653 ECB : * the result string length on exit. The resulting string might be longer
2654 : * or shorter depending on what happens with replacement of constants.
2655 : *
2656 : * Returns a palloc'd string.
4029 2657 : */
2658 : static char *
732 bruce 2659 GIC 8841 : generate_normalized_query(JumbleState *jstate, const char *query,
2660 : int query_loc, int *query_len_p)
4029 tgl 2661 ECB : {
2662 : char *norm_query;
4029 tgl 2663 CBC 8841 : int query_len = *query_len_p;
4029 tgl 2664 ECB : int i,
2665 : norm_query_buflen, /* Space allowed for norm_query */
2666 : len_to_wrt, /* Length (in bytes) to write */
4029 tgl 2667 GIC 8841 : quer_loc = 0, /* Source query byte location */
2668 8841 : n_quer_loc = 0, /* Normalized query byte location */
2669 8841 : last_off = 0, /* Offset from start for previous tok */
2118 tgl 2670 CBC 8841 : last_tok_len = 0; /* Length (in bytes) of that tok */
2671 :
2672 : /*
2673 : * Get constants' lengths (core system only gives us locations). Note
2674 : * this also ensures the items are sorted by location.
2675 : */
2276 tgl 2676 GIC 8841 : fill_in_constant_lengths(jstate, query, query_loc);
2677 :
2678 : /*
2204 tgl 2679 ECB : * Allow for $n symbols to be longer than the constants they replace.
2680 : * Constants must take at least one byte in text form, while a $n symbol
2681 : * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We
2682 : * could refine that limit based on the max value of n for the current
2683 : * query, but it hardly seems worth any extra effort to do so.
2684 : */
2204 tgl 2685 GIC 8841 : norm_query_buflen = query_len + jstate->clocations_count * 10;
2686 :
2687 : /* Allocate result buffer */
2688 8841 : norm_query = palloc(norm_query_buflen + 1);
4029 tgl 2689 ECB :
4029 tgl 2690 GIC 36766 : for (i = 0; i < jstate->clocations_count; i++)
4029 tgl 2691 ECB : {
2692 : int off, /* Offset from start for cur tok */
2693 : tok_len; /* Length (in bytes) of that tok */
2694 :
4029 tgl 2695 CBC 27925 : off = jstate->clocations[i].location;
2276 tgl 2696 ECB : /* Adjust recorded location if we're dealing with partial string */
2276 tgl 2697 GIC 27925 : off -= query_loc;
2698 :
4029 tgl 2699 CBC 27925 : tok_len = jstate->clocations[i].length;
4029 tgl 2700 ECB :
4029 tgl 2701 GIC 27925 : if (tok_len < 0)
4029 tgl 2702 CBC 160 : continue; /* ignore any duplicates */
4029 tgl 2703 ECB :
3358 2704 : /* Copy next chunk (what precedes the next constant) */
4029 tgl 2705 GIC 27765 : len_to_wrt = off - last_off;
2706 27765 : len_to_wrt -= last_tok_len;
4029 tgl 2707 ECB :
4029 tgl 2708 CBC 27765 : Assert(len_to_wrt >= 0);
4029 tgl 2709 GIC 27765 : memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
4029 tgl 2710 CBC 27765 : n_quer_loc += len_to_wrt;
4029 tgl 2711 ECB :
2204 2712 : /* And insert a param symbol in place of the constant token */
2204 tgl 2713 GIC 55530 : n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d",
2714 27765 : i + 1 + jstate->highest_extern_param_id);
2715 :
4029 2716 27765 : quer_loc = off + tok_len;
2717 27765 : last_off = off;
2718 27765 : last_tok_len = tok_len;
4029 tgl 2719 ECB : }
2720 :
2721 : /*
2722 : * We've copied up until the last ignorable constant. Copy over the
3359 2723 : * remaining bytes of the original query string.
2724 : */
4029 tgl 2725 CBC 8841 : len_to_wrt = query_len - quer_loc;
4029 tgl 2726 ECB :
4029 tgl 2727 GIC 8841 : Assert(len_to_wrt >= 0);
4029 tgl 2728 CBC 8841 : memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
2729 8841 : n_quer_loc += len_to_wrt;
2730 :
2204 tgl 2731 GIC 8841 : Assert(n_quer_loc <= norm_query_buflen);
3359 2732 8841 : norm_query[n_quer_loc] = '\0';
2733 :
2734 8841 : *query_len_p = n_quer_loc;
4029 2735 8841 : return norm_query;
2736 : }
2737 :
2738 : /*
2739 : * Given a valid SQL string and an array of constant-location records,
2740 : * fill in the textual lengths of those constants.
2741 : *
2742 : * The constants may use any allowed constant syntax, such as float literals,
2743 : * bit-strings, single-quoted strings and dollar-quoted strings. This is
2744 : * accomplished by using the public API for the core scanner.
2745 : *
2746 : * It is the caller's job to ensure that the string is a valid SQL statement
2747 : * with constants at the indicated locations. Since in practice the string
2748 : * has already been parsed, and the locations that the caller provides will
2749 : * have originated from within the authoritative parser, this should not be
2750 : * a problem.
2751 : *
2752 : * Duplicate constant pointers are possible, and will have their lengths
2753 : * marked as '-1', so that they are later ignored. (Actually, we assume the
2754 : * lengths were initialized as -1 to start with, and don't change them here.)
2755 : *
2756 : * If query_loc > 0, then "query" has been advanced by that much compared to
2757 : * the original string start, so we need to translate the provided locations
2758 : * to compensate. (This lets us avoid re-scanning statements before the one
2759 : * of interest, so it's worth doing.)
2276 tgl 2760 ECB : *
2761 : * N.B. There is an assumption that a '-' character at a Const location begins
2762 : * a negative numeric constant. This precludes there ever being another
2763 : * reason for a constant to start with a '-'.
2764 : */
2765 : static void
732 bruce 2766 GIC 8841 : fill_in_constant_lengths(JumbleState *jstate, const char *query,
2767 : int query_loc)
4029 tgl 2768 ECB : {
2769 : LocationLen *locs;
2770 : core_yyscan_t yyscanner;
2771 : core_yy_extra_type yyextra;
2772 : core_YYSTYPE yylval;
2773 : YYLTYPE yylloc;
4029 tgl 2774 GIC 8841 : int last_loc = -1;
4029 tgl 2775 ECB : int i;
2776 :
2777 : /*
2778 : * Sort the records by location so that we can process them in order while
2779 : * scanning the query text.
2780 : */
4029 tgl 2781 CBC 8841 : if (jstate->clocations_count > 1)
4029 tgl 2782 GIC 5774 : qsort(jstate->clocations, jstate->clocations_count,
2783 : sizeof(LocationLen), comp_location);
2784 8841 : locs = jstate->clocations;
2785 :
2786 : /* initialize the flex scanner --- should match raw_parser() */
4029 tgl 2787 CBC 8841 : yyscanner = scanner_init(query,
2788 : &yyextra,
2789 : &ScanKeywords,
1554 tgl 2790 ECB : ScanKeywordTokens);
2791 :
2999 2792 : /* we don't want to re-emit any escape string warnings */
2999 tgl 2793 GIC 8841 : yyextra.escape_string_warning = false;
2794 :
2795 : /* Search for each constant, in sequence */
4029 tgl 2796 CBC 36766 : for (i = 0; i < jstate->clocations_count; i++)
2797 : {
2798 27925 : int loc = locs[i].location;
2799 : int tok;
4029 tgl 2800 ECB :
2276 2801 : /* Adjust recorded location if we're dealing with partial string */
2276 tgl 2802 GIC 27925 : loc -= query_loc;
2803 :
4029 2804 27925 : Assert(loc >= 0);
2805 :
4029 tgl 2806 CBC 27925 : if (loc <= last_loc)
4029 tgl 2807 GIC 160 : continue; /* Duplicate constant, ignore */
2808 :
4029 tgl 2809 ECB : /* Lex tokens until we find the desired constant */
4029 tgl 2810 EUB : for (;;)
2811 : {
4029 tgl 2812 GIC 207745 : tok = core_yylex(&yylval, &yylloc, yyscanner);
2813 :
2814 : /* We should not hit end-of-string, but if we do, behave sanely */
2815 207745 : if (tok == 0)
4029 tgl 2816 LBC 0 : break; /* out of inner for-loop */
2817 :
4029 tgl 2818 ECB : /*
2819 : * We should find the token position exactly, but if we somehow
2820 : * run past it, work with that.
2821 : */
4029 tgl 2822 GIC 207745 : if (yylloc >= loc)
2823 : {
2824 27765 : if (query[loc] == '-')
2825 : {
2826 : /*
2827 : * It's a negative value - this is the one and only case
2828 : * where we replace more than a single token.
2829 : *
2830 : * Do not compensate for the core system's special-case
2831 : * adjustment of location to that of the leading '-'
4029 tgl 2832 ECB : * operator in the event of a negative constant. It is
2833 : * also useful for our purposes to start from the minus
3260 bruce 2834 EUB : * symbol. In this way, queries like "select * from foo
2835 : * where bar = 1" and "select * from foo where bar = -2"
2836 : * will have identical normalized query strings.
2837 : */
4029 tgl 2838 GIC 359 : tok = core_yylex(&yylval, &yylloc, yyscanner);
2839 359 : if (tok == 0)
4029 tgl 2840 UIC 0 : break; /* out of inner for-loop */
4029 tgl 2841 ECB : }
2842 :
2843 : /*
2844 : * We now rely on the assumption that flex has placed a zero
2845 : * byte after the text of the current token in scanbuf.
2846 : */
4029 tgl 2847 CBC 27765 : locs[i].length = strlen(yyextra.scanbuf + loc);
4029 tgl 2848 GBC 27765 : break; /* out of inner for-loop */
2849 : }
4029 tgl 2850 ECB : }
2851 :
2852 : /* If we hit end-of-string, give up, leaving remaining lengths -1 */
4029 tgl 2853 CBC 27765 : if (tok == 0)
4029 tgl 2854 LBC 0 : break;
2855 :
4029 tgl 2856 GIC 27765 : last_loc = loc;
2857 : }
2858 :
2859 8841 : scanner_finish(yyscanner);
4029 tgl 2860 CBC 8841 : }
2861 :
4029 tgl 2862 ECB : /*
732 bruce 2863 : * comp_location: comparator for qsorting LocationLen structs by location
2864 : */
4029 tgl 2865 : static int
4029 tgl 2866 CBC 33062 : comp_location(const void *a, const void *b)
4029 tgl 2867 ECB : {
732 bruce 2868 CBC 33062 : int l = ((const LocationLen *) a)->location;
732 bruce 2869 GIC 33062 : int r = ((const LocationLen *) b)->location;
4029 tgl 2870 ECB :
4029 tgl 2871 GIC 33062 : if (l < r)
2872 22541 : return -1;
2873 10521 : else if (l > r)
2874 10354 : return +1;
2875 : else
2876 167 : return 0;
2877 : }
|