PG-244: Fix race condition in get_next_wbucket().

The if condition below in get_next_wbucket() was subject to a race
condition:
if ((current_usec - pgss->prev_bucket_usec) > (PGSM_BUCKET_TIME * 1000 *
1000))

Two or more threads/processes could easily evaluate this condition to
true, and thus execute more than once the block that calculates a
new bucket id and clears/moves old entries in the pgss_query_hash and
pgss_hash hash tables.

To avoid this problem, we define prev_bucket_usec and current_wbucket
variables as atomic and execute a loop to check if another thread has
updated prev_bucket_usec before the current one.
This commit is contained in:
Diego Fronza
2021-09-30 17:13:27 -03:00
parent f269af3da2
commit 89743e9243
3 changed files with 70 additions and 50 deletions

View File

@@ -1490,6 +1490,7 @@ pgss_store(uint64 queryid,
int application_name_len = pg_get_application_name(application_name);
bool reset = false;
uint64 bucketid;
uint64 prev_bucket_id;
uint64 userid;
int con;
uint64 dbid = MyDatabaseId;
@@ -1511,15 +1512,14 @@ pgss_store(uint64 queryid,
extract_query_comments(query, comments, sizeof(comments));
/* Safety check... */
if (!IsSystemInitialized() || !pgss_qbuf[pgss->current_wbucket])
if (!IsSystemInitialized() || !pgss_qbuf[pg_atomic_read_u64(&pgss->current_wbucket)])
return;
prev_bucket_id = pg_atomic_read_u64(&pgss->current_wbucket);
bucketid = get_next_wbucket(pgss);
if (bucketid != pgss->current_wbucket)
{
if (bucketid != prev_bucket_id)
reset = true;
pgss->current_wbucket = bucketid;
}
LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
@@ -1990,40 +1990,76 @@ static uint64
get_next_wbucket(pgssSharedState *pgss)
{
struct timeval tv;
uint64 current_usec;
uint64 bucket_id;
struct tm *lt;
uint64 current_usec;
uint64 current_bucket_usec;
uint64 new_bucket_id;
uint64 prev_bucket_id;
struct tm *lt;
bool update_bucket = false;
gettimeofday(&tv,NULL);
current_usec = (TimestampTz) tv.tv_sec - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY);
current_usec = (current_usec * USECS_PER_SEC) + tv.tv_usec;
current_bucket_usec = pg_atomic_read_u64(&pgss->prev_bucket_usec);
if ((current_usec - pgss->prev_bucket_usec) > (PGSM_BUCKET_TIME * 1000 * 1000))
/*
* If current bucket expired we loop attempting to update prev_bucket_usec.
*
* pg_atomic_compare_exchange_u64 may fail in two possible ways:
* 1. Another thread/process updated the variable before us.
* 2. A spurious failure / hardware event.
*
* In both failure cases we read prev_bucket_usec from memory again, if it was
* a spurious failure then the value of prev_bucket_usec must be the same as
* before, which will cause the while loop to execute again.
*
* If another thread updated prev_bucket_usec, then its current value will
* definitely make the while condition to fail, we can stop the loop as another
* thread has already updated prev_bucket_usec.
*/
while ((current_usec - current_bucket_usec) > (PGSM_BUCKET_TIME * 1000 * 1000))
{
if (pg_atomic_compare_exchange_u64(&pgss->prev_bucket_usec, &current_bucket_usec, current_usec))
{
update_bucket = true;
break;
}
current_bucket_usec = pg_atomic_read_u64(&pgss->prev_bucket_usec);
}
if (update_bucket)
{
unsigned char *buf;
char file_name[1024];
int sec = 0;
bucket_id = (tv.tv_sec / PGSM_BUCKET_TIME) % PGSM_MAX_BUCKETS;
LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
buf = pgss_qbuf[bucket_id];
hash_entry_dealloc(bucket_id, pgss->current_wbucket);
hash_query_entry_dealloc(bucket_id, buf);
new_bucket_id = (tv.tv_sec / PGSM_BUCKET_TIME) % PGSM_MAX_BUCKETS;
snprintf(file_name, 1024, "%s.%d", PGSM_TEXT_FILE, (int)bucket_id);
/* Update bucket id and retrieve the previous one. */
prev_bucket_id = pg_atomic_exchange_u64(&pgss->current_wbucket, new_bucket_id);
LWLockAcquire(pgss->lock, LW_EXCLUSIVE);
buf = pgss_qbuf[new_bucket_id];
hash_entry_dealloc(new_bucket_id, prev_bucket_id);
hash_query_entry_dealloc(new_bucket_id, buf);
snprintf(file_name, 1024, "%s.%d", PGSM_TEXT_FILE, (int)new_bucket_id);
unlink(file_name);
LWLockRelease(pgss->lock);
pgss->prev_bucket_usec = current_usec;
lt = localtime(&tv.tv_sec);
sec = lt->tm_sec - (lt->tm_sec % PGSM_BUCKET_TIME);
if (sec < 0)
sec = 0;
snprintf(pgss->bucket_start_time[bucket_id], sizeof(pgss->bucket_start_time[bucket_id]),
snprintf(pgss->bucket_start_time[new_bucket_id], sizeof(pgss->bucket_start_time[new_bucket_id]),
"%04d-%02d-%02d %02d:%02d:%02d", lt->tm_year + 1900, lt->tm_mon + 1, lt->tm_mday, lt->tm_hour, lt->tm_min, sec);
return bucket_id;
return new_bucket_id;
}
return pgss->current_wbucket;
return pg_atomic_read_u64(&pgss->current_wbucket);
}
#if PG_VERSION_NUM < 140000
@@ -3024,7 +3060,7 @@ pgss_store_query_info(uint64 bucketid,
pgssStoreKind kind)
{
pgssSharedState *pgss = pgsm_get_ss();
unsigned char *buf = pgss_qbuf[pgss->current_wbucket];
unsigned char *buf = pgss_qbuf[pg_atomic_read_u64(&pgss->current_wbucket)];
pgssQueryEntry *entry;
if (query_len > PGSM_QUERY_MAX_LEN)