Merge pull request #336 from codeforall/main

PG-488: pg_stat_monitor: Overflow management.
pull/337/head
Ibrar Ahmed 2022-12-21 00:42:47 +05:00 committed by GitHub
commit 8dffa8cc97
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 246 additions and 513 deletions

View File

@ -16,59 +16,93 @@
*/ */
#include "postgres.h" #include "postgres.h"
#include "nodes/pg_list.h" #include "nodes/pg_list.h"
#include "pg_stat_monitor.h" #include "pg_stat_monitor.h"
static pgsmLocalState pgsmStateLocal;
static pgssSharedState *pgss; /* parameter for the shared hash */
static HTAB *pgss_hash; static dshash_parameters dsh_params = {
static HTAB *pgss_query_hash; sizeof(pgssHashKey),
sizeof(pgssEntry),
dshash_memcmp,
dshash_memhash
};
static void pgsm_proc_exit(int code, Datum arg);
static Size
static HTAB * pgsm_query_area_size(void)
hash_init(const char *hash_name, int key_size, int entry_size, int hash_size)
{ {
HASHCTL info; Size sz = MAXALIGN(MAX_QUERY_BUF);
return MAXALIGN(sz);
memset(&info, 0, sizeof(info));
info.keysize = key_size;
info.entrysize = entry_size;
return ShmemInitHash(hash_name, hash_size, hash_size, &info, HASH_ELEM | HASH_BLOBS);
} }
Size
pgsm_ShmemSize(void)
{
Size sz = MAXALIGN(sizeof(pgssSharedState));
sz = add_size(sz, pgsm_query_area_size());
sz = add_size(sz, hash_estimate_size(MAX_BUCKET_ENTRIES, sizeof(pgssEntry)));
return sz;
}
void void
pgss_startup(void) pgss_startup(void)
{ {
bool found = false; bool found = false;
pgssSharedState *pgss;
/* reset in case this is a restart within the postmaster */ /* reset in case this is a restart within the postmaster */
pgsmStateLocal.dsa = NULL;
pgss = NULL; pgsmStateLocal.shared_hash = NULL;
pgss_hash = NULL; pgsmStateLocal.shared_pgssState = NULL;
/* /*
* Create or attach to the shared memory state, including hash table * Create or attach to the shared memory state, including hash table
*/ */
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
pgss = ShmemInitStruct("pg_stat_monitor", sizeof(pgssSharedState), &found); pgss = ShmemInitStruct("pg_stat_monitor", pgsm_ShmemSize(), &found);
if (!found) if (!found)
{ {
/* First time through ... */ /* First time through ... */
dsa_area *dsa;
dshash_table *dsh;
char *p = (char *) pgss;
pgss->lock = &(GetNamedLWLockTranche("pg_stat_monitor"))->lock; pgss->lock = &(GetNamedLWLockTranche("pg_stat_monitor"))->lock;
SpinLockInit(&pgss->mutex); SpinLockInit(&pgss->mutex);
ResetSharedState(pgss); ResetSharedState(pgss);
/* the allocation of pgssSharedState itself */
p += MAXALIGN(sizeof(pgssSharedState));
pgss->raw_dsa_area = p;
dsa = dsa_create_in_place(pgss->raw_dsa_area,
pgsm_query_area_size(),
LWLockNewTrancheId(), 0);
dsa_pin(dsa);
dsa_set_size_limit(dsa, pgsm_query_area_size());
pgss->hash_tranche_id = LWLockNewTrancheId();
dsh_params.tranche_id = pgss->hash_tranche_id;
dsh = dshash_create(dsa, &dsh_params, 0);
pgss->hash_handle = dshash_get_hash_table_handle(dsh);
if (PGSM_OVERFLOW_TARGET == OVERFLOW_TARGET_DISK)
dsa_set_size_limit(dsa, -1);
pgsmStateLocal.shared_pgssState = pgss;
/*
* Postmaster will never access these again, thus free the local
* dsa/dshash references.
*/
dshash_detach(dsh);
dsa_detach(dsa);
} }
#ifdef BENCHMARK #ifdef BENCHMARK
init_hook_stats(); init_hook_stats();
#endif #endif
set_qbuf((unsigned char *) ShmemAlloc(MAX_QUERY_BUF));
pgss_hash = hash_init("pg_stat_monitor: bucket hashtable", sizeof(pgssHashKey), sizeof(pgssEntry), MAX_BUCKET_ENTRIES);
pgss_query_hash = hash_init("pg_stat_monitor: queryID hashtable", sizeof(uint64), sizeof(pgssQueryEntry), MAX_BUCKET_ENTRIES);
LWLockRelease(AddinShmemInitLock); LWLockRelease(AddinShmemInitLock);
/* /*
@ -78,23 +112,49 @@ pgss_startup(void)
on_shmem_exit(pgss_shmem_shutdown, (Datum) 0); on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
} }
void
pgsm_attach_shmem(void)
{
MemoryContext oldcontext;
if (pgsmStateLocal.dsa)
return;
oldcontext = MemoryContextSwitchTo(TopMemoryContext);
pgsmStateLocal.dsa = dsa_attach_in_place(pgsmStateLocal.shared_pgssState->raw_dsa_area,
NULL);
dsa_pin_mapping(pgsmStateLocal.dsa);
dsh_params.tranche_id = pgsmStateLocal.shared_pgssState->hash_tranche_id;
pgsmStateLocal.shared_hash = dshash_attach(pgsmStateLocal.dsa, &dsh_params,
pgsmStateLocal.shared_pgssState->hash_handle, 0);
on_proc_exit(pgsm_proc_exit, 0);
MemoryContextSwitchTo(oldcontext);
}
dsa_area*
get_dsa_area_for_query_text(void)
{
pgsm_attach_shmem();
return pgsmStateLocal.dsa;
}
dshash_table*
get_pgssHash(void)
{
pgsm_attach_shmem();
return pgsmStateLocal.shared_hash;
}
pgssSharedState * pgssSharedState *
pgsm_get_ss(void) pgsm_get_ss(void)
{ {
return pgss; pgsm_attach_shmem();
return pgsmStateLocal.shared_pgssState;
} }
HTAB *
pgsm_get_hash(void)
{
return pgss_hash;
}
HTAB *
pgsm_get_query_hash(void)
{
return pgss_query_hash;
}
/* /*
* shmem_shutdown hook: Dump statistics into file. * shmem_shutdown hook: Dump statistics into file.
@ -106,26 +166,24 @@ void
pgss_shmem_shutdown(int code, Datum arg) pgss_shmem_shutdown(int code, Datum arg)
{ {
/* Don't try to dump during a crash. */ /* Don't try to dump during a crash. */
elog(LOG,"pgss_shmem_shutdown");
if (code) if (code)
return; return;
pgss = NULL; pgsmStateLocal.shared_pgssState = NULL;
/* Safety check ... shouldn't get here unless shmem is set up. */ /* Safety check ... shouldn't get here unless shmem is set up. */
if (!IsHashInitialize()) if (!IsHashInitialize())
return; return;
} }
Size static void
hash_memsize(void) pgsm_proc_exit(int code, Datum arg)
{ {
Size size; Assert(pgsmStateLocal.dsa);
dshash_detach(pgsmStateLocal.shared_hash);
size = MAXALIGN(sizeof(pgssSharedState)); pgsmStateLocal.shared_hash = NULL;
size += MAXALIGN(MAX_QUERY_BUF); dsa_detach(pgsmStateLocal.dsa);
size = add_size(size, hash_estimate_size(MAX_BUCKET_ENTRIES, sizeof(pgssEntry))); pgsmStateLocal.dsa = NULL;
size = add_size(size, hash_estimate_size(MAX_BUCKET_ENTRIES, sizeof(pgssQueryEntry)));
return size;
} }
pgssEntry * pgssEntry *
@ -134,13 +192,9 @@ hash_entry_alloc(pgssSharedState *pgss, pgssHashKey *key, int encoding)
pgssEntry *entry = NULL; pgssEntry *entry = NULL;
bool found = false; bool found = false;
if (hash_get_num_entries(pgss_hash) >= MAX_BUCKET_ENTRIES)
{
elog(DEBUG1, "pg_stat_monitor: out of memory");
return NULL;
}
/* Find or create an entry with desired hash code */ /* Find or create an entry with desired hash code */
entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER_NULL, &found); entry = (pgssEntry *) dshash_find_or_insert(pgsmStateLocal.shared_hash, key, &found);
// entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER_NULL, &found);
if (entry == NULL) if (entry == NULL)
elog(DEBUG1, "hash_entry_alloc: OUT OF MEMORY"); elog(DEBUG1, "hash_entry_alloc: OUT OF MEMORY");
else if (!found) else if (!found)
@ -155,6 +209,7 @@ hash_entry_alloc(pgssSharedState *pgss, pgssHashKey *key, int encoding)
/* ... and don't forget the query text metadata */ /* ... and don't forget the query text metadata */
entry->encoding = encoding; entry->encoding = encoding;
} }
dshash_release_lock(pgsmStateLocal.shared_hash, entry);
return entry; return entry;
} }
@ -174,17 +229,22 @@ hash_entry_alloc(pgssSharedState *pgss, pgssHashKey *key, int encoding)
void void
hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_buffer) hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_buffer)
{ {
HASH_SEQ_STATUS hash_seq; dshash_seq_status hstat;
pgssEntry *entry = NULL; pgssEntry *entry = NULL;
/* Store pending query ids from the previous bucket. */ /* Store pending query ids from the previous bucket. */
List *pending_entries = NIL; List *pending_entries = NIL;
ListCell *pending_entry; ListCell *pending_entry;
if (!pgsmStateLocal.shared_hash)
return;
/* Iterate over the hash table. */ /* Iterate over the hash table. */
hash_seq_init(&hash_seq, pgss_hash); dshash_seq_init(&hstat, pgsmStateLocal.shared_hash, true);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
while ((entry = dshash_seq_next(&hstat)) != NULL)
{ {
dsa_pointer pdsa;
/* /*
* Remove all entries if new_bucket_id == -1. Otherwise remove entry * Remove all entries if new_bucket_id == -1. Otherwise remove entry
* in new_bucket_id if it has finished already. * in new_bucket_id if it has finished already.
@ -193,16 +253,14 @@ hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_bu
(entry->key.bucket_id == new_bucket_id && (entry->key.bucket_id == new_bucket_id &&
(entry->counters.state == PGSS_FINISHED || entry->counters.state == PGSS_ERROR))) (entry->counters.state == PGSS_FINISHED || entry->counters.state == PGSS_ERROR)))
{ {
if (new_bucket_id == -1) pdsa = entry->query_pos;
{ dsa_pointer parent_qdsa = entry->counters.info.parent_query;
/* dshash_delete_current(&hstat);
* pg_stat_monitor_reset(), remove entry from query hash table dsa_free(pgsmStateLocal.dsa, pdsa);
* too.
*/ if (DsaPointerIsValid(parent_qdsa))
hash_search(pgss_query_hash, &(entry->key.queryid), HASH_REMOVE, NULL); dsa_free(pgsmStateLocal.dsa, parent_qdsa);
}
entry = hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
} }
/* /*
@ -238,7 +296,11 @@ hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_bu
if (entry->counters.calls.calls > 1) if (entry->counters.calls.calls > 1)
entry->counters.state = PGSS_FINISHED; entry->counters.state = PGSS_FINISHED;
else else
entry = hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL); {
pdsa = entry->query_pos;
dshash_delete_current(&hstat);
dsa_free(pgsmStateLocal.dsa, pdsa);
}
continue; continue;
} }
@ -266,11 +328,15 @@ hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_bu
if (entry->counters.calls.calls > 1) if (entry->counters.calls.calls > 1)
entry->counters.state = PGSS_FINISHED; entry->counters.state = PGSS_FINISHED;
else else
entry = hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL); {
pdsa = entry->query_pos;
dshash_delete_current(&hstat);
dsa_free(pgsmStateLocal.dsa, pdsa);
}
} }
} }
} }
dshash_seq_term(&hstat);
/* /*
* Iterate over the list of pending queries in order to add them back to * Iterate over the list of pending queries in order to add them back to
* the hash table with the updated bucket id. * the hash table with the updated bucket id.
@ -281,7 +347,8 @@ hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_bu
pgssEntry *new_entry; pgssEntry *new_entry;
pgssEntry *old_entry = (pgssEntry *) lfirst(pending_entry); pgssEntry *old_entry = (pgssEntry *) lfirst(pending_entry);
new_entry = (pgssEntry *) hash_search(pgss_hash, &old_entry->key, HASH_ENTER_NULL, &found);
new_entry = (pgssEntry *) dshash_find_or_insert(pgsmStateLocal.shared_hash, &old_entry->key, &found);
if (new_entry == NULL) if (new_entry == NULL)
elog(DEBUG1, "%s", "pg_stat_monitor: out of memory"); elog(DEBUG1, "%s", "pg_stat_monitor: out of memory");
else if (!found) else if (!found)
@ -292,8 +359,9 @@ hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_bu
new_entry->encoding = old_entry->encoding; new_entry->encoding = old_entry->encoding;
new_entry->query_pos = old_entry->query_pos; new_entry->query_pos = old_entry->query_pos;
} }
free(old_entry); free(old_entry);
dshash_release_lock(pgsmStateLocal.shared_hash, entry);
} }
list_free(pending_entries); list_free(pending_entries);
@ -306,16 +374,22 @@ void
hash_entry_reset() hash_entry_reset()
{ {
pgssSharedState *pgss = pgsm_get_ss(); pgssSharedState *pgss = pgsm_get_ss();
HASH_SEQ_STATUS hash_seq; dshash_seq_status hstat;
pgssEntry *entry; pgssEntry *entry;
LWLockAcquire(pgss->lock, LW_EXCLUSIVE); LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
hash_seq_init(&hash_seq, pgss_hash); dshash_seq_init(&hstat, pgsmStateLocal.shared_hash, true);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
while ((entry = dshash_seq_next(&hstat)) != NULL)
{ {
hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL); dsa_pointer pdsa = entry->query_pos;
dshash_delete_current(&hstat);
dsa_free(pgsmStateLocal.dsa, pdsa);
} }
dshash_seq_term(&hstat);
pg_atomic_write_u64(&pgss->current_wbucket, 0); pg_atomic_write_u64(&pgss->current_wbucket, 0);
LWLockRelease(pgss->lock); LWLockRelease(pgss->lock);
} }
@ -323,6 +397,5 @@ hash_entry_reset()
bool bool
IsHashInitialize(void) IsHashInitialize(void)
{ {
return (pgss != NULL && return (pgsmStateLocal.shared_pgssState != NULL);
pgss_hash != NULL);
} }

View File

@ -32,7 +32,6 @@
PGSM_V2_0 PGSM_V2_0
} pgsmVersion; } pgsmVersion;
PG_MODULE_MAGIC; PG_MODULE_MAGIC;
#define BUILD_VERSION "2.0.0-dev" #define BUILD_VERSION "2.0.0-dev"
@ -78,6 +77,7 @@ static int plan_nested_level = 0;
/* The array to store outer layer query id*/ /* The array to store outer layer query id*/
uint64 *nested_queryids; uint64 *nested_queryids;
char **nested_query_txts;
/* Regex object used to extract query comments. */ /* Regex object used to extract query comments. */
static regex_t preg_query_comments; static regex_t preg_query_comments;
@ -88,13 +88,11 @@ static struct rusage rusage_start;
static struct rusage rusage_end; static struct rusage rusage_end;
/* Query buffer, store queries' text. */ /* Query buffer, store queries' text. */
static unsigned char *pgss_qbuf = NULL;
static char *pgss_explain(QueryDesc *queryDesc); static char *pgss_explain(QueryDesc *queryDesc);
static void extract_query_comments(const char *query, char *comments, size_t max_len); static void extract_query_comments(const char *query, char *comments, size_t max_len);
static int get_histogram_bucket(double q_time); static int get_histogram_bucket(double q_time);
static bool IsSystemInitialized(void); static bool IsSystemInitialized(void);
static bool dump_queries_buffer(int bucket_id, unsigned char *buf, int buf_len);
static double time_diff(struct timeval end, struct timeval start); static double time_diff(struct timeval end, struct timeval start);
static void request_additional_shared_resources(void); static void request_additional_shared_resources(void);
@ -230,7 +228,6 @@ static uint64 djb2_hash(unsigned char *str, size_t len);
/* Same as above, but stores the calculated string length into *out_len (small optimization) */ /* Same as above, but stores the calculated string length into *out_len (small optimization) */
static uint64 djb2_hash_str(unsigned char *str, int *out_len); static uint64 djb2_hash_str(unsigned char *str, int *out_len);
/* /*
* Module load callback * Module load callback
*/ */
@ -239,7 +236,6 @@ void
_PG_init(void) _PG_init(void)
{ {
int rc; int rc;
char file_name[1024];
elog(DEBUG2, "pg_stat_monitor: %s()", __FUNCTION__); elog(DEBUG2, "pg_stat_monitor: %s()", __FUNCTION__);
@ -266,8 +262,6 @@ _PG_init(void)
EnableQueryId(); EnableQueryId();
#endif #endif
snprintf(file_name, 1024, "%s", PGSM_TEXT_FILE);
unlink(file_name);
EmitWarningsOnPlaceholders("pg_stat_monitor"); EmitWarningsOnPlaceholders("pg_stat_monitor");
@ -313,6 +307,7 @@ _PG_init(void)
ExecutorCheckPerms_hook = HOOK(pgss_ExecutorCheckPerms); ExecutorCheckPerms_hook = HOOK(pgss_ExecutorCheckPerms);
nested_queryids = (uint64 *) malloc(sizeof(uint64) * max_stack_depth); nested_queryids = (uint64 *) malloc(sizeof(uint64) * max_stack_depth);
nested_query_txts = (char **) malloc(sizeof(char*) * max_stack_depth);
system_init = true; system_init = true;
} }
@ -335,6 +330,7 @@ _PG_fini(void)
emit_log_hook = prev_emit_log_hook; emit_log_hook = prev_emit_log_hook;
free(nested_queryids); free(nested_queryids);
free(nested_query_txts);
regfree(&preg_query_comments); regfree(&preg_query_comments);
hash_entry_reset(); hash_entry_reset();
@ -363,7 +359,7 @@ request_additional_shared_resources(void)
* the postmaster process.) We'll allocate or attach to the shared * the postmaster process.) We'll allocate or attach to the shared
* resources in pgss_shmem_startup(). * resources in pgss_shmem_startup().
*/ */
RequestAddinShmemSpace(hash_memsize() + HOOK_STATS_SIZE); RequestAddinShmemSpace(pgsm_ShmemSize() + HOOK_STATS_SIZE);
RequestNamedLWLockTranche("pg_stat_monitor", 1); RequestNamedLWLockTranche("pg_stat_monitor", 1);
} }
/* /*
@ -554,7 +550,11 @@ pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
bool execute_once) bool execute_once)
{ {
if (exec_nested_level >= 0 && exec_nested_level < max_stack_depth) if (exec_nested_level >= 0 && exec_nested_level < max_stack_depth)
{
nested_queryids[exec_nested_level] = queryDesc->plannedstmt->queryId; nested_queryids[exec_nested_level] = queryDesc->plannedstmt->queryId;
nested_query_txts[exec_nested_level] = strdup(queryDesc->sourceText);
}
exec_nested_level++; exec_nested_level++;
PG_TRY(); PG_TRY();
{ {
@ -564,13 +564,23 @@ pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,
standard_ExecutorRun(queryDesc, direction, count, execute_once); standard_ExecutorRun(queryDesc, direction, count, execute_once);
exec_nested_level--; exec_nested_level--;
if (exec_nested_level >= 0 && exec_nested_level < max_stack_depth) if (exec_nested_level >= 0 && exec_nested_level < max_stack_depth)
{
nested_queryids[exec_nested_level] = UINT64CONST(0); nested_queryids[exec_nested_level] = UINT64CONST(0);
if(nested_query_txts[exec_nested_level])
free(nested_query_txts[exec_nested_level]);
nested_query_txts[exec_nested_level] = NULL;
}
} }
PG_CATCH(); PG_CATCH();
{ {
exec_nested_level--; exec_nested_level--;
if (exec_nested_level >= 0 && exec_nested_level < max_stack_depth) if (exec_nested_level >= 0 && exec_nested_level < max_stack_depth)
{
nested_queryids[exec_nested_level] = UINT64CONST(0); nested_queryids[exec_nested_level] = UINT64CONST(0);
if(nested_query_txts[exec_nested_level])
free(nested_query_txts[exec_nested_level]);
nested_query_txts[exec_nested_level] = NULL;
}
PG_RE_THROW(); PG_RE_THROW();
} }
PG_END_TRY(); PG_END_TRY();
@ -1260,11 +1270,29 @@ pgss_update_entry(pgssEntry *entry,
if (exec_nested_level > 0) if (exec_nested_level > 0)
{ {
if (exec_nested_level >= 0 && exec_nested_level < max_stack_depth) if (exec_nested_level >= 0 && exec_nested_level < max_stack_depth)
{
int parent_query_len = nested_query_txts[exec_nested_level - 1]?
strlen(nested_query_txts[exec_nested_level - 1]): 0;
e->counters.info.parentid = nested_queryids[exec_nested_level - 1]; e->counters.info.parentid = nested_queryids[exec_nested_level - 1];
if (parent_query_len > 0)
{
char *qry_buff;
dsa_area *query_dsa_area = get_dsa_area_for_query_text();
dsa_pointer qry = dsa_allocate(query_dsa_area, parent_query_len+1);
qry_buff = dsa_get_address(query_dsa_area, qry);
memcpy(qry_buff, nested_query_txts[exec_nested_level - 1], parent_query_len);
qry_buff[parent_query_len] = 0;
e->counters.info.parent_query = qry;
}
else
e->counters.info.parent_query = InvalidDsaPointer;
}
} }
else else
{ {
e->counters.info.parentid = UINT64CONST(0); e->counters.info.parentid = UINT64CONST(0);
e->counters.info.parent_query = InvalidDsaPointer;
} }
if (error_info) if (error_info)
@ -1381,7 +1409,6 @@ pgss_store(uint64 queryid,
JumbleState *jstate, JumbleState *jstate,
pgssStoreKind kind) pgssStoreKind kind)
{ {
HTAB *pgss_hash;
pgssHashKey key; pgssHashKey key;
pgssEntry *entry; pgssEntry *entry;
pgssSharedState *pgss = pgsm_get_ss(); pgssSharedState *pgss = pgsm_get_ss();
@ -1486,19 +1513,15 @@ pgss_store(uint64 queryid,
#else #else
key.toplevel = ((exec_nested_level + plan_nested_level) == 0); key.toplevel = ((exec_nested_level + plan_nested_level) == 0);
#endif #endif
pgss_hash = pgsm_get_hash();
LWLockAcquire(pgss->lock, LW_SHARED); LWLockAcquire(pgss->lock, LW_SHARED);
entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL); entry = (pgssEntry *) dshash_find(get_pgssHash(), &key, false);
if (!entry) if (!entry)
{ {
pgssQueryEntry *query_entry; dsa_pointer dsa_query_pointer;
bool query_found = false; char* query_buff;
uint64 prev_qbuf_len = 0;
HTAB *pgss_query_hash;
pgss_query_hash = pgsm_get_query_hash();
/* /*
* Create a new, normalized query string if caller asked. We don't * Create a new, normalized query string if caller asked. We don't
@ -1509,74 +1532,41 @@ pgss_store(uint64 queryid,
*/ */
if (jstate && PGSM_NORMALIZED_QUERY) if (jstate && PGSM_NORMALIZED_QUERY)
{ {
LWLockRelease(pgss->lock);
norm_query = generate_normalized_query(jstate, query, norm_query = generate_normalized_query(jstate, query,
query_location, query_location,
&query_len, &query_len,
GetDatabaseEncoding()); GetDatabaseEncoding());
LWLockAcquire(pgss->lock, LW_SHARED);
} }
query_entry = hash_search(pgss_query_hash, &queryid, HASH_ENTER_NULL, &query_found); /* New query, truncate length if necessary. */
if (query_entry == NULL) if (query_len > PGSM_QUERY_MAX_LEN)
{ query_len = PGSM_QUERY_MAX_LEN;
LWLockRelease(pgss->lock);
if (norm_query)
pfree(norm_query);
elog(DEBUG1, "pgss_store: out of memory (pgss_query_hash).");
return;
}
else if (!query_found)
{
/* New query, truncate length if necessary. */
if (query_len > PGSM_QUERY_MAX_LEN)
query_len = PGSM_QUERY_MAX_LEN;
}
/* Need exclusive lock to make a new hashtable entry - promote */ /* Need exclusive lock to make a new hashtable entry - promote */
LWLockRelease(pgss->lock); LWLockRelease(pgss->lock);
LWLockAcquire(pgss->lock, LW_EXCLUSIVE); LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
if (!query_found) /* Save the query text in raw dsa area */
{ dsa_area* query_dsa_area = get_dsa_area_for_query_text();
if (!SaveQueryText(bucketid, dsa_query_pointer = dsa_allocate(query_dsa_area, query_len+1);
queryid, query_buff = dsa_get_address(query_dsa_area, dsa_query_pointer);
pgss_qbuf, memcpy(query_buff, norm_query ? norm_query : query, query_len);
norm_query ? norm_query : query, query_buff[query_len] = 0;
query_len,
&query_entry->query_pos))
{
LWLockRelease(pgss->lock);
if (norm_query)
pfree(norm_query);
elog(DEBUG1, "pgss_store: insufficient shared space for query.");
return;
}
/*
* Save current query buffer length, if we fail to add a new new
* entry to the hash table then we must restore the original
* length.
*/
memcpy(&prev_qbuf_len, pgss_qbuf, sizeof(prev_qbuf_len));
}
/* OK to create a new hashtable entry */ /* OK to create a new hashtable entry */
entry = hash_entry_alloc(pgss, &key, GetDatabaseEncoding()); entry = hash_entry_alloc(pgss, &key, GetDatabaseEncoding());
if (entry == NULL) if (entry == NULL)
{ {
if (!query_found)
{
/* Restore previous query buffer length. */
memcpy(pgss_qbuf, &prev_qbuf_len, sizeof(prev_qbuf_len));
}
LWLockRelease(pgss->lock); LWLockRelease(pgss->lock);
if (norm_query) if (norm_query)
pfree(norm_query); pfree(norm_query);
return; return;
} }
entry->query_pos = query_entry->query_pos; entry->query_pos = dsa_query_pointer;
} }
else
dshash_release_lock(get_pgssHash(), entry);
if (jstate == NULL) if (jstate == NULL)
pgss_update_entry(entry, /* entry */ pgss_update_entry(entry, /* entry */
@ -1619,9 +1609,6 @@ pg_stat_monitor_reset(PG_FUNCTION_ARGS)
LWLockAcquire(pgss->lock, LW_EXCLUSIVE); LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
hash_entry_dealloc(-1, -1, NULL); hash_entry_dealloc(-1, -1, NULL);
/* Reset query buffer. */
*(uint64 *) pgss_qbuf = 0;
LWLockRelease(pgss->lock); LWLockRelease(pgss->lock);
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
@ -1678,13 +1665,12 @@ pg_stat_monitor_internal(FunctionCallInfo fcinfo,
Tuplestorestate *tupstore; Tuplestorestate *tupstore;
MemoryContext per_query_ctx; MemoryContext per_query_ctx;
MemoryContext oldcontext; MemoryContext oldcontext;
HASH_SEQ_STATUS hash_seq; dshash_seq_status hstat;
pgssEntry *entry; pgssEntry *entry;
char parentid_txt[32]; char parentid_txt[32];
pgssSharedState *pgss = pgsm_get_ss(); pgssSharedState *pgss = pgsm_get_ss();
HTAB *pgss_hash = pgsm_get_hash(); char *query_txt = NULL;
char *query_txt = (char *) palloc0(PGSM_QUERY_MAX_LEN + 1); char *parent_query_txt = NULL;
char *parent_query_txt = (char *) palloc0(PGSM_QUERY_MAX_LEN + 1);
int expected_columns = (api_version >= PGSM_V2_0)?PG_STAT_MONITOR_COLS_V2_0:PG_STAT_MONITOR_COLS_V1_0; int expected_columns = (api_version >= PGSM_V2_0)?PG_STAT_MONITOR_COLS_V2_0:PG_STAT_MONITOR_COLS_V1_0;
/* Safety check... */ /* Safety check... */
@ -1722,10 +1708,11 @@ pg_stat_monitor_internal(FunctionCallInfo fcinfo,
MemoryContextSwitchTo(oldcontext); MemoryContextSwitchTo(oldcontext);
LWLockAcquire(pgss->lock, LW_SHARED); // LWLockAcquire(pgss->lock, LW_SHARED);
hash_seq_init(&hash_seq, pgss_hash); dshash_seq_init(&hstat, get_pgssHash(), false);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
while ((entry = dshash_seq_next(&hstat)) != NULL)
{ {
Datum values[PG_STAT_MONITOR_COLS] = {0}; Datum values[PG_STAT_MONITOR_COLS] = {0};
bool nulls[PG_STAT_MONITOR_COLS] = {0}; bool nulls[PG_STAT_MONITOR_COLS] = {0};
@ -1740,6 +1727,8 @@ pg_stat_monitor_internal(FunctionCallInfo fcinfo,
uint64 userid = entry->key.userid; uint64 userid = entry->key.userid;
int64 ip = entry->key.ip; int64 ip = entry->key.ip;
uint64 planid = entry->key.planid; uint64 planid = entry->key.planid;
dsa_area *query_dsa_area;
char *query_ptr;
#if PG_VERSION_NUM < 140000 #if PG_VERSION_NUM < 140000
bool toplevel = 1; bool toplevel = 1;
bool is_allowed_role = is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS); bool is_allowed_role = is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS);
@ -1747,15 +1736,10 @@ pg_stat_monitor_internal(FunctionCallInfo fcinfo,
bool is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS); bool is_allowed_role = is_member_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS);
bool toplevel = entry->key.toplevel; bool toplevel = entry->key.toplevel;
#endif #endif
/* Load the query text from dsa area */
if (read_query(pgss_qbuf, queryid, query_txt, entry->query_pos) == 0) query_dsa_area = get_dsa_area_for_query_text();
{ query_ptr = dsa_get_address(query_dsa_area, entry->query_pos);
int rc; query_txt = pstrdup(query_ptr);
rc = read_query_buffer(bucketid, queryid, query_txt, entry->query_pos);
if (rc != 1)
snprintf(query_txt, 32, "%s", "<insufficient disk/shared space>");
}
/* copy counters to a local variable to keep locking time short */ /* copy counters to a local variable to keep locking time short */
{ {
@ -1783,15 +1767,17 @@ pg_stat_monitor_internal(FunctionCallInfo fcinfo,
if (tmp.state == PGSS_PARSE || tmp.state == PGSS_PLAN) if (tmp.state == PGSS_PARSE || tmp.state == PGSS_PLAN)
continue; continue;
/* read the parent query text if any */
if (tmp.info.parentid != UINT64CONST(0)) if (tmp.info.parentid != UINT64CONST(0))
{ {
if (read_query(pgss_qbuf, tmp.info.parentid, parent_query_txt, 0) == 0) if (DsaPointerIsValid(tmp.info.parent_query))
{ {
int rc = read_query_buffer(bucketid, tmp.info.parentid, parent_query_txt, 0); query_dsa_area = get_dsa_area_for_query_text();
query_ptr = dsa_get_address(query_dsa_area, tmp.info.parent_query);
if (rc != 1) parent_query_txt = pstrdup(query_ptr);
snprintf(parent_query_txt, 32, "%s", "<insufficient disk/shared space>");
} }
else
parent_query_txt = pstrdup("parent query text not available");
} }
/* bucketid at column number 0 */ /* bucketid at column number 0 */
values[i++] = Int64GetDatumFast(bucketid); values[i++] = Int64GetDatumFast(bucketid);
@ -2071,10 +2057,12 @@ pg_stat_monitor_internal(FunctionCallInfo fcinfo,
tuplestore_putvalues(tupstore, tupdesc, values, nulls); tuplestore_putvalues(tupstore, tupdesc, values, nulls);
} }
/* clean up and return the tuplestore */ /* clean up and return the tuplestore */
LWLockRelease(pgss->lock); dshash_seq_term(&hstat);
pfree(query_txt); if(query_txt)
pfree(parent_query_txt); pfree(query_txt);
if(parent_query_txt)
pfree(parent_query_txt);
tuplestore_donestoring(tupstore); tuplestore_donestoring(tupstore);
} }
@ -2120,7 +2108,6 @@ get_next_wbucket(pgssSharedState *pgss)
if (update_bucket) if (update_bucket)
{ {
char file_name[1024];
new_bucket_id = (tv.tv_sec / PGSM_BUCKET_TIME) % PGSM_MAX_BUCKETS; new_bucket_id = (tv.tv_sec / PGSM_BUCKET_TIME) % PGSM_MAX_BUCKETS;
@ -2128,24 +2115,7 @@ get_next_wbucket(pgssSharedState *pgss)
prev_bucket_id = pg_atomic_exchange_u64(&pgss->current_wbucket, new_bucket_id); prev_bucket_id = pg_atomic_exchange_u64(&pgss->current_wbucket, new_bucket_id);
LWLockAcquire(pgss->lock, LW_EXCLUSIVE); LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
hash_entry_dealloc(new_bucket_id, prev_bucket_id, pgss_qbuf); hash_entry_dealloc(new_bucket_id, prev_bucket_id, NULL);
if (pgss->overflow)
{
pgss->n_bucket_cycles += 1;
if (pgss->n_bucket_cycles >= PGSM_MAX_BUCKETS)
{
/*
* A full rotation of PGSM_MAX_BUCKETS buckets happened since
* we detected a query buffer overflow.
* Reset overflow state and remove the dump file.
*/
pgss->overflow = false;
pgss->n_bucket_cycles = 0;
snprintf(file_name, 1024, "%s", PGSM_TEXT_FILE);
unlink(file_name);
}
}
LWLockRelease(pgss->lock); LWLockRelease(pgss->lock);
@ -3144,165 +3114,6 @@ intarray_get_datum(int32 arr[], int len)
} }
uint64
read_query(unsigned char *buf, uint64 queryid, char *query, size_t pos)
{
bool found = false;
uint64 query_id = 0;
uint64 query_len = 0;
uint64 rlen = 0;
uint64 buf_len = 0;
memcpy(&buf_len, buf, sizeof(uint64));
if (buf_len <= 0)
goto exit;
/* If a position hint is given, try to locate the query directly. */
if (pos != 0 && (pos + sizeof(uint64) + sizeof(uint64)) < buf_len)
{
memcpy(&query_id, &buf[pos], sizeof(uint64));
if (query_id != queryid)
return 0;
pos += sizeof(uint64);
memcpy(&query_len, &buf[pos], sizeof(uint64)); /* query len */
pos += sizeof(uint64);
if (pos + query_len > buf_len) /* avoid reading past buffer's length. */
return 0;
memcpy(query, &buf[pos], query_len); /* Actual query */
query[query_len] = '\0';
return queryid;
}
rlen = sizeof(uint64); /* Move forwad to skip length bytes */
for (;;)
{
if (rlen >= buf_len)
goto exit;
memcpy(&query_id, &buf[rlen], sizeof(uint64)); /* query id */
if (query_id == queryid)
found = true;
rlen += sizeof(uint64);
if (buf_len <= rlen)
continue;
memcpy(&query_len, &buf[rlen], sizeof(uint64)); /* query len */
rlen += sizeof(uint64);
if (buf_len < rlen + query_len)
goto exit;
if (found)
{
if (query != NULL)
{
memcpy(query, &buf[rlen], query_len); /* Actual query */
query[query_len] = 0;
}
return query_id;
}
rlen += query_len;
}
exit:
if (PGSM_OVERFLOW_TARGET == OVERFLOW_TARGET_NONE)
{
sprintf(query, "%s", "<insufficient shared space>");
return -1;
}
return 0;
}
bool
SaveQueryText(uint64 bucketid,
uint64 queryid,
unsigned char *buf,
const char *query,
uint64 query_len,
size_t *query_pos)
{
uint64 buf_len = 0;
memcpy(&buf_len, buf, sizeof(uint64));
if (buf_len == 0)
buf_len += sizeof(uint64);
if (QUERY_BUFFER_OVERFLOW(buf_len, query_len))
{
switch (PGSM_OVERFLOW_TARGET)
{
case OVERFLOW_TARGET_NONE:
return false;
case OVERFLOW_TARGET_DISK:
{
bool dump_ok;
pgssSharedState *pgss = pgsm_get_ss();
if (pgss->overflow)
{
elog(DEBUG1, "query buffer overflowed twice");
return false;
}
/*
* If the query buffer is empty, there is nothing to dump,
* this also means that the current query length exceeds
* MAX_QUERY_BUF.
*/
if (buf_len <= sizeof(uint64))
return false;
dump_ok = dump_queries_buffer(bucketid, buf, MAX_QUERY_BUF);
buf_len = sizeof(uint64);
if (dump_ok)
{
pgss->overflow = true;
pgss->n_bucket_cycles = 0;
}
/*
* We must check for overflow again, as the query length
* may exceed the total size allocated to the buffer
* (MAX_QUERY_BUF).
*/
if (QUERY_BUFFER_OVERFLOW(buf_len, query_len))
{
/*
* If we successfully dumped the query buffer to disk,
* then reset the buffer, otherwise we could end up
* dumping the same buffer again.
*/
if (dump_ok)
*(uint64 *) buf = 0;
return false;
}
}
break;
default:
Assert(false);
break;
}
}
*query_pos = buf_len;
memcpy(&buf[buf_len], &queryid, sizeof(uint64)); /* query id */
buf_len += sizeof(uint64);
memcpy(&buf[buf_len], &query_len, sizeof(uint64)); /* query length */
buf_len += sizeof(uint64);
memcpy(&buf[buf_len], query, query_len); /* query */
buf_len += query_len;
memcpy(buf, &buf_len, sizeof(uint64));
return true;
}
Datum Datum
pg_stat_monitor_settings(PG_FUNCTION_ARGS) pg_stat_monitor_settings(PG_FUNCTION_ARGS)
@ -3440,12 +3251,6 @@ pg_stat_monitor_hook_stats(PG_FUNCTION_ARGS)
return (Datum) 0; return (Datum) 0;
} }
void
set_qbuf(unsigned char *buf)
{
pgss_qbuf = buf;
*(uint64 *) pgss_qbuf = 0;
}
void void
pgsm_emit_log_hook(ErrorData *edata) pgsm_emit_log_hook(ErrorData *edata)
@ -3482,145 +3287,6 @@ IsSystemInitialized(void)
return (system_init && IsHashInitialize()); return (system_init && IsHashInitialize());
} }
static bool
dump_queries_buffer(int bucket_id, unsigned char *buf, int buf_len)
{
int fd = 0;
char file_name[1024];
bool success = true;
int off = 0;
int tries = 0;
snprintf(file_name, 1024, "%s", PGSM_TEXT_FILE);
fd = OpenTransientFile(file_name, O_RDWR | O_CREAT | O_APPEND | PG_BINARY);
if (fd < 0)
{
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m",
file_name)));
return false;
}
/* Loop until write buf_len bytes to the file. */
do
{
ssize_t nwrite = write(fd, buf + off, buf_len - off);
if (nwrite == -1)
{
if (errno == EINTR && tries++ < 3)
continue;
success = false;
break;
}
off += nwrite;
} while (off < buf_len);
if (!success)
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not write file \"%s\": %m", file_name)));
if (fd > 0)
CloseTransientFile(fd);
return success;
}
/*
* Try to locate query text in a dumped file for bucket_id.
*
* Returns:
* 1 Query sucessfully read, query_text will contain the query text.
* 0 Query not found.
* -1 I/O Error.
*/
int
read_query_buffer(int bucket_id, uint64 queryid, char *query_txt, size_t pos)
{
int fd = 0;
char file_name[1024];
unsigned char *buf = NULL;
ssize_t nread = 0;
int off = 0;
int tries = 0;
bool done = false;
bool found = false;
snprintf(file_name, 1024, "%s", PGSM_TEXT_FILE);
fd = OpenTransientFile(file_name, O_RDONLY | PG_BINARY);
if (fd < 0)
goto exit;
buf = (unsigned char *) palloc(MAX_QUERY_BUF);
while (!done)
{
off = 0;
/* read a chunck of MAX_QUERY_BUF size. */
do
{
nread = read(fd, buf + off, MAX_QUERY_BUF - off);
if (nread == -1)
{
if (errno == EINTR && tries++ < 3) /* read() was interrupted,
* attempt to read again
* (max attempts=3) */
continue;
goto exit;
}
else if (nread == 0) /* EOF */
{
done = true;
break;
}
off += nread;
} while (off < MAX_QUERY_BUF);
if (off == MAX_QUERY_BUF)
{
/* we have a chunck, scan it looking for queryid. */
if (read_query(buf, queryid, query_txt, pos) != 0)
{
found = true;
/* query was found, don't need to read another chunck. */
break;
}
}
else
/*
* Either done=true or file has a size not multiple of
* MAX_QUERY_BUF. It is safe to assume that the file was truncated
* or corrupted.
*/
break;
}
exit:
if (fd < 0 || nread == -1)
ereport(LOG,
(errcode_for_file_access(),
errmsg("could not read file \"%s\": %m",
file_name)));
if (fd >= 0)
CloseTransientFile(fd);
if (buf)
pfree(buf);
if (found)
return 1;
else if (fd == -1 || nread == -1)
return -1; /* I/O error. */
else
return 0; /* Not found. */
}
static double static double
time_diff(struct timeval end, struct timeval start) time_diff(struct timeval end, struct timeval start)

View File

@ -27,6 +27,9 @@
#include <sys/time.h> #include <sys/time.h>
#include <sys/resource.h> #include <sys/resource.h>
#include "lib/dshash.h"
#include "utils/dsa.h"
#include "access/hash.h" #include "access/hash.h"
#include "catalog/pg_authid.h" #include "catalog/pg_authid.h"
#include "executor/instrument.h" #include "executor/instrument.h"
@ -179,20 +182,6 @@ typedef struct CallTime
double sum_var_time; /* sum of variances in execution time in msec */ double sum_var_time; /* sum of variances in execution time in msec */
} CallTime; } CallTime;
/*
* Entry type for queries hash table (query ID).
*
* We use a hash table to keep track of query IDs that have their
* corresponding query text added to the query buffer (pgsm_query_shared_buffer).
*
* This allow us to avoid adding duplicated queries to the buffer, therefore
* leaving more space for other queries and saving some CPU.
*/
typedef struct pgssQueryEntry
{
uint64 queryid; /* query identifier, also the key. */
size_t query_pos; /* query location within query buffer */
} pgssQueryEntry;
typedef struct PlanInfo typedef struct PlanInfo
{ {
@ -216,6 +205,7 @@ typedef struct pgssHashKey
typedef struct QueryInfo typedef struct QueryInfo
{ {
uint64 parentid; /* parent queryid of current query */ uint64 parentid; /* parent queryid of current query */
dsa_pointer parent_query;
int64 type; /* type of query, options are query, info, int64 type; /* type of query, options are query, info,
* warning, error, fatal */ * warning, error, fatal */
char application_name[APPLICATIONNAME_LEN]; char application_name[APPLICATIONNAME_LEN];
@ -322,7 +312,7 @@ typedef struct pgssEntry
Counters counters; /* the statistics for this query */ Counters counters; /* the statistics for this query */
int encoding; /* query text encoding */ int encoding; /* query text encoding */
slock_t mutex; /* protects the counters only */ slock_t mutex; /* protects the counters only */
size_t query_pos; /* query location within query buffer */ dsa_pointer query_pos; /* query location within query buffer */
} pgssEntry; } pgssEntry;
/* /*
@ -353,10 +343,19 @@ typedef struct pgssSharedState
* This allows us to avoid having a large file on disk that would also * This allows us to avoid having a large file on disk that would also
* slowdown queries to the pg_stat_monitor view. * slowdown queries to the pg_stat_monitor view.
*/ */
bool overflow;
size_t n_bucket_cycles; size_t n_bucket_cycles;
int hash_tranche_id;
void *raw_dsa_area;
dshash_table_handle hash_handle;
} pgssSharedState; } pgssSharedState;
typedef struct pgsmLocalState
{
pgssSharedState *shared_pgssState;
dsa_area *dsa;
dshash_table *shared_hash;
}pgsmLocalState;
#define ResetSharedState(x) \ #define ResetSharedState(x) \
do { \ do { \
x->cur_median_usage = ASSUMED_MEDIAN_INIT; \ x->cur_median_usage = ASSUMED_MEDIAN_INIT; \
@ -418,27 +417,22 @@ void init_guc(void);
GucVariable *get_conf(int i); GucVariable *get_conf(int i);
/* hash_create.c */ /* hash_create.c */
dsa_area *get_dsa_area_for_query_text(void);
dshash_table *get_pgssHash(void);
void pgsm_attach_shmem(void);
bool IsHashInitialize(void); bool IsHashInitialize(void);
void pgss_shmem_startup(void); void pgss_shmem_startup(void);
void pgss_shmem_shutdown(int code, Datum arg); void pgss_shmem_shutdown(int code, Datum arg);
int pgsm_get_bucket_size(void); int pgsm_get_bucket_size(void);
pgssSharedState *pgsm_get_ss(void); pgssSharedState *pgsm_get_ss(void);
HTAB *pgsm_get_plan_hash(void);
HTAB *pgsm_get_hash(void);
HTAB *pgsm_get_query_hash(void);
HTAB *pgsm_get_plan_hash(void);
void hash_entry_reset(void); void hash_entry_reset(void);
void hash_query_entryies_reset(void); void hash_query_entryies_reset(void);
void hash_query_entries(); void hash_query_entries();
void hash_query_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_buffer[]); void hash_query_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_buffer[]);
void hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_buffer); void hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_buffer);
pgssEntry *hash_entry_alloc(pgssSharedState *pgss, pgssHashKey *key, int encoding); pgssEntry *hash_entry_alloc(pgssSharedState *pgss, pgssHashKey *key, int encoding);
Size hash_memsize(void); Size pgsm_ShmemSize(void);
int read_query_buffer(int bucket_id, uint64 queryid, char *query_txt, size_t pos);
uint64 read_query(unsigned char *buf, uint64 queryid, char *query, size_t pos);
void pgss_startup(void); void pgss_startup(void);
void set_qbuf(unsigned char *);
/* hash_query.c */ /* hash_query.c */
void pgss_startup(void); void pgss_startup(void);