PG-488: pg_stat_monitor: Overflow management.

Reimplement the storage mechanism of buckets and query texts
using Dynamic shared memory. Since the dynamic shared memory
can grow into a swap area, so we get the overflow out of the box.

oreover the new design saves the query pointer inside the bucket
and eventually, the query text gets evicted with the bucket recycle.

Finally, the dynamic shared memory hash has a built-in locking
mechanism so we can revisit the whole locking in pg_stat_monitor
has potential for lots of performance improvements
This commit is contained in:
Muhammad Usama
2022-12-20 17:29:15 +05:00
parent 5a6b824737
commit df0580b741
3 changed files with 246 additions and 513 deletions

View File

@@ -16,59 +16,93 @@
*/
#include "postgres.h"
#include "nodes/pg_list.h"
#include "pg_stat_monitor.h"
static pgsmLocalState pgsmStateLocal;
static pgssSharedState *pgss;
static HTAB *pgss_hash;
static HTAB *pgss_query_hash;
/* parameter for the shared hash */
static dshash_parameters dsh_params = {
sizeof(pgssHashKey),
sizeof(pgssEntry),
dshash_memcmp,
dshash_memhash
};
static void pgsm_proc_exit(int code, Datum arg);
static HTAB *
hash_init(const char *hash_name, int key_size, int entry_size, int hash_size)
static Size
pgsm_query_area_size(void)
{
HASHCTL info;
memset(&info, 0, sizeof(info));
info.keysize = key_size;
info.entrysize = entry_size;
return ShmemInitHash(hash_name, hash_size, hash_size, &info, HASH_ELEM | HASH_BLOBS);
Size sz = MAXALIGN(MAX_QUERY_BUF);
return MAXALIGN(sz);
}
Size
pgsm_ShmemSize(void)
{
Size sz = MAXALIGN(sizeof(pgssSharedState));
sz = add_size(sz, pgsm_query_area_size());
sz = add_size(sz, hash_estimate_size(MAX_BUCKET_ENTRIES, sizeof(pgssEntry)));
return sz;
}
void
pgss_startup(void)
{
bool found = false;
pgssSharedState *pgss;
/* reset in case this is a restart within the postmaster */
pgss = NULL;
pgss_hash = NULL;
pgsmStateLocal.dsa = NULL;
pgsmStateLocal.shared_hash = NULL;
pgsmStateLocal.shared_pgssState = NULL;
/*
* Create or attach to the shared memory state, including hash table
*/
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
pgss = ShmemInitStruct("pg_stat_monitor", sizeof(pgssSharedState), &found);
pgss = ShmemInitStruct("pg_stat_monitor", pgsm_ShmemSize(), &found);
if (!found)
{
/* First time through ... */
dsa_area *dsa;
dshash_table *dsh;
char *p = (char *) pgss;
pgss->lock = &(GetNamedLWLockTranche("pg_stat_monitor"))->lock;
SpinLockInit(&pgss->mutex);
ResetSharedState(pgss);
/* the allocation of pgssSharedState itself */
p += MAXALIGN(sizeof(pgssSharedState));
pgss->raw_dsa_area = p;
dsa = dsa_create_in_place(pgss->raw_dsa_area,
pgsm_query_area_size(),
LWLockNewTrancheId(), 0);
dsa_pin(dsa);
dsa_set_size_limit(dsa, pgsm_query_area_size());
pgss->hash_tranche_id = LWLockNewTrancheId();
dsh_params.tranche_id = pgss->hash_tranche_id;
dsh = dshash_create(dsa, &dsh_params, 0);
pgss->hash_handle = dshash_get_hash_table_handle(dsh);
if (PGSM_OVERFLOW_TARGET == OVERFLOW_TARGET_DISK)
dsa_set_size_limit(dsa, -1);
pgsmStateLocal.shared_pgssState = pgss;
/*
* Postmaster will never access these again, thus free the local
* dsa/dshash references.
*/
dshash_detach(dsh);
dsa_detach(dsa);
}
#ifdef BENCHMARK
init_hook_stats();
#endif
set_qbuf((unsigned char *) ShmemAlloc(MAX_QUERY_BUF));
pgss_hash = hash_init("pg_stat_monitor: bucket hashtable", sizeof(pgssHashKey), sizeof(pgssEntry), MAX_BUCKET_ENTRIES);
pgss_query_hash = hash_init("pg_stat_monitor: queryID hashtable", sizeof(uint64), sizeof(pgssQueryEntry), MAX_BUCKET_ENTRIES);
LWLockRelease(AddinShmemInitLock);
/*
@@ -78,23 +112,49 @@ pgss_startup(void)
on_shmem_exit(pgss_shmem_shutdown, (Datum) 0);
}
void
pgsm_attach_shmem(void)
{
MemoryContext oldcontext;
if (pgsmStateLocal.dsa)
return;
oldcontext = MemoryContextSwitchTo(TopMemoryContext);
pgsmStateLocal.dsa = dsa_attach_in_place(pgsmStateLocal.shared_pgssState->raw_dsa_area,
NULL);
dsa_pin_mapping(pgsmStateLocal.dsa);
dsh_params.tranche_id = pgsmStateLocal.shared_pgssState->hash_tranche_id;
pgsmStateLocal.shared_hash = dshash_attach(pgsmStateLocal.dsa, &dsh_params,
pgsmStateLocal.shared_pgssState->hash_handle, 0);
on_proc_exit(pgsm_proc_exit, 0);
MemoryContextSwitchTo(oldcontext);
}
dsa_area*
get_dsa_area_for_query_text(void)
{
pgsm_attach_shmem();
return pgsmStateLocal.dsa;
}
dshash_table*
get_pgssHash(void)
{
pgsm_attach_shmem();
return pgsmStateLocal.shared_hash;
}
pgssSharedState *
pgsm_get_ss(void)
{
return pgss;
pgsm_attach_shmem();
return pgsmStateLocal.shared_pgssState;
}
HTAB *
pgsm_get_hash(void)
{
return pgss_hash;
}
HTAB *
pgsm_get_query_hash(void)
{
return pgss_query_hash;
}
/*
* shmem_shutdown hook: Dump statistics into file.
@@ -106,26 +166,24 @@ void
pgss_shmem_shutdown(int code, Datum arg)
{
/* Don't try to dump during a crash. */
elog(LOG,"pgss_shmem_shutdown");
if (code)
return;
pgss = NULL;
pgsmStateLocal.shared_pgssState = NULL;
/* Safety check ... shouldn't get here unless shmem is set up. */
if (!IsHashInitialize())
return;
}
Size
hash_memsize(void)
static void
pgsm_proc_exit(int code, Datum arg)
{
Size size;
size = MAXALIGN(sizeof(pgssSharedState));
size += MAXALIGN(MAX_QUERY_BUF);
size = add_size(size, hash_estimate_size(MAX_BUCKET_ENTRIES, sizeof(pgssEntry)));
size = add_size(size, hash_estimate_size(MAX_BUCKET_ENTRIES, sizeof(pgssQueryEntry)));
return size;
Assert(pgsmStateLocal.dsa);
dshash_detach(pgsmStateLocal.shared_hash);
pgsmStateLocal.shared_hash = NULL;
dsa_detach(pgsmStateLocal.dsa);
pgsmStateLocal.dsa = NULL;
}
pgssEntry *
@@ -134,13 +192,9 @@ hash_entry_alloc(pgssSharedState *pgss, pgssHashKey *key, int encoding)
pgssEntry *entry = NULL;
bool found = false;
if (hash_get_num_entries(pgss_hash) >= MAX_BUCKET_ENTRIES)
{
elog(DEBUG1, "pg_stat_monitor: out of memory");
return NULL;
}
/* Find or create an entry with desired hash code */
entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER_NULL, &found);
entry = (pgssEntry *) dshash_find_or_insert(pgsmStateLocal.shared_hash, key, &found);
// entry = (pgssEntry *) hash_search(pgss_hash, key, HASH_ENTER_NULL, &found);
if (entry == NULL)
elog(DEBUG1, "hash_entry_alloc: OUT OF MEMORY");
else if (!found)
@@ -155,6 +209,7 @@ hash_entry_alloc(pgssSharedState *pgss, pgssHashKey *key, int encoding)
/* ... and don't forget the query text metadata */
entry->encoding = encoding;
}
dshash_release_lock(pgsmStateLocal.shared_hash, entry);
return entry;
}
@@ -174,17 +229,22 @@ hash_entry_alloc(pgssSharedState *pgss, pgssHashKey *key, int encoding)
void
hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_buffer)
{
HASH_SEQ_STATUS hash_seq;
dshash_seq_status hstat;
pgssEntry *entry = NULL;
/* Store pending query ids from the previous bucket. */
List *pending_entries = NIL;
ListCell *pending_entry;
if (!pgsmStateLocal.shared_hash)
return;
/* Iterate over the hash table. */
hash_seq_init(&hash_seq, pgss_hash);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
dshash_seq_init(&hstat, pgsmStateLocal.shared_hash, true);
while ((entry = dshash_seq_next(&hstat)) != NULL)
{
dsa_pointer pdsa;
/*
* Remove all entries if new_bucket_id == -1. Otherwise remove entry
* in new_bucket_id if it has finished already.
@@ -193,16 +253,14 @@ hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_bu
(entry->key.bucket_id == new_bucket_id &&
(entry->counters.state == PGSS_FINISHED || entry->counters.state == PGSS_ERROR)))
{
if (new_bucket_id == -1)
{
/*
* pg_stat_monitor_reset(), remove entry from query hash table
* too.
*/
hash_search(pgss_query_hash, &(entry->key.queryid), HASH_REMOVE, NULL);
}
pdsa = entry->query_pos;
dsa_pointer parent_qdsa = entry->counters.info.parent_query;
dshash_delete_current(&hstat);
dsa_free(pgsmStateLocal.dsa, pdsa);
if (DsaPointerIsValid(parent_qdsa))
dsa_free(pgsmStateLocal.dsa, parent_qdsa);
entry = hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
}
/*
@@ -238,7 +296,11 @@ hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_bu
if (entry->counters.calls.calls > 1)
entry->counters.state = PGSS_FINISHED;
else
entry = hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
{
pdsa = entry->query_pos;
dshash_delete_current(&hstat);
dsa_free(pgsmStateLocal.dsa, pdsa);
}
continue;
}
@@ -266,11 +328,15 @@ hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_bu
if (entry->counters.calls.calls > 1)
entry->counters.state = PGSS_FINISHED;
else
entry = hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
{
pdsa = entry->query_pos;
dshash_delete_current(&hstat);
dsa_free(pgsmStateLocal.dsa, pdsa);
}
}
}
}
dshash_seq_term(&hstat);
/*
* Iterate over the list of pending queries in order to add them back to
* the hash table with the updated bucket id.
@@ -281,7 +347,8 @@ hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_bu
pgssEntry *new_entry;
pgssEntry *old_entry = (pgssEntry *) lfirst(pending_entry);
new_entry = (pgssEntry *) hash_search(pgss_hash, &old_entry->key, HASH_ENTER_NULL, &found);
new_entry = (pgssEntry *) dshash_find_or_insert(pgsmStateLocal.shared_hash, &old_entry->key, &found);
if (new_entry == NULL)
elog(DEBUG1, "%s", "pg_stat_monitor: out of memory");
else if (!found)
@@ -292,8 +359,9 @@ hash_entry_dealloc(int new_bucket_id, int old_bucket_id, unsigned char *query_bu
new_entry->encoding = old_entry->encoding;
new_entry->query_pos = old_entry->query_pos;
}
free(old_entry);
dshash_release_lock(pgsmStateLocal.shared_hash, entry);
}
list_free(pending_entries);
@@ -306,16 +374,22 @@ void
hash_entry_reset()
{
pgssSharedState *pgss = pgsm_get_ss();
HASH_SEQ_STATUS hash_seq;
dshash_seq_status hstat;
pgssEntry *entry;
LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
hash_seq_init(&hash_seq, pgss_hash);
while ((entry = hash_seq_search(&hash_seq)) != NULL)
dshash_seq_init(&hstat, pgsmStateLocal.shared_hash, true);
while ((entry = dshash_seq_next(&hstat)) != NULL)
{
hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL);
dsa_pointer pdsa = entry->query_pos;
dshash_delete_current(&hstat);
dsa_free(pgsmStateLocal.dsa, pdsa);
}
dshash_seq_term(&hstat);
pg_atomic_write_u64(&pgss->current_wbucket, 0);
LWLockRelease(pgss->lock);
}
@@ -323,6 +397,5 @@ hash_entry_reset()
bool
IsHashInitialize(void)
{
return (pgss != NULL &&
pgss_hash != NULL);
return (pgsmStateLocal.shared_pgssState != NULL);
}