PG-1674: Fix comment parsing logic at two places (#542)

* PG-1674: Fix query hash calculation comment removal logic

The previous conditions only removed the first few starting characters
of the comment, and leaved everything else there.

This modification fixes this and correctly removes everything.

* PG-1674: Fix performance issues with comment extraction

The previous logic used complex regex parsers, which caused performance
issues with large (megabyte sized) queries. This change removes the
regex dependency and uses the same (fixed) logic from the query hashing
code, which is much faster.

It also checks the related GUC variable, which the previous code
ignored: if we do not want to display extracted comments, we won't
extract them in the first place.

This commit doesn't try to address other issues with comment parsing logic:

* we shouldn't treat comment like things within strings as comments
* we should handle nested C style comments
* we don't extract `--` style comments

All of these issues are still there, as before.
main
Zsolt Parragi 2025-06-23 14:46:15 +02:00 committed by GitHub
parent 3653dd6041
commit 9e0a252873
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 62 additions and 46 deletions

View File

@ -19,7 +19,6 @@
#include "access/parallel.h"
#include "nodes/pg_list.h"
#include "utils/guc.h"
#include <regex.h>
#include "pgstat.h"
#include "commands/dbcommands.h"
#include "commands/explain.h"
@ -96,8 +95,6 @@ uint64 *nested_queryids;
char **nested_query_txts;
List *lentries = NIL;
/* Regex object used to extract query comments. */
static regex_t preg_query_comments;
static char relations[REL_LST][REL_LEN];
static int num_relations; /* Number of relation in the query */
@ -280,8 +277,6 @@ static void pgsm_lock_release(pgsmSharedState *pgsm);
void
_PG_init(void)
{
int rc;
elog(DEBUG2, "[pg_stat_monitor] pg_stat_monitor: %s().", __FUNCTION__);
/*
@ -309,18 +304,8 @@ _PG_init(void)
EnableQueryId();
#endif
EmitWarningsOnPlaceholders("pg_stat_monitor");
/*
* Compile regular expression for extracting out query comments only once.
*/
rc = regcomp(&preg_query_comments, "/\\*([^*]|[\r\n]|(\\*+([^*/]|[\r\n])))*\\*+/", REG_EXTENDED);
if (rc != 0)
{
elog(ERROR, "[pg_stat_monitor] _PG_init: query comments regcomp() failed, return code=(%d).", rc);
}
/*
* Install hooks.
*/
@ -2715,12 +2700,19 @@ get_pgsm_query_id_hash(const char *norm_query, int norm_len)
*/
if (*norm_q_iter == '/' && *(norm_q_iter + 1) == '*')
{
while (*norm_q_iter && *norm_q_iter != '*' && *(norm_q_iter + 1) != '/')
while (*norm_q_iter && *(norm_q_iter + 1) && (*norm_q_iter != '*' || *(norm_q_iter + 1) != '/'))
norm_q_iter++;
/* Skip the '/' if the current character is valid. */
/*
* Skip the end if the current character is valid. norm_q_iter
* points to the *, we have to skip 2 characters
*/
if (*norm_q_iter)
norm_q_iter++;
if (*norm_q_iter)
norm_q_iter++;
continue;
}
/*
@ -3970,46 +3962,70 @@ get_histogram_timings(PG_FUNCTION_ARGS)
return CStringGetTextDatum(text_str);
}
static bool
append_comment_char(char *comments, size_t max_len, size_t *idx, char c)
{
if (*idx >= max_len)
return false;
comments[*idx] = c;
(*idx)++;
return true;
}
static void
extract_query_comments(const char *query, char *comments, size_t max_len)
{
int rc;
size_t nmatch = 1;
regmatch_t pmatch;
regoff_t comment_len,
total_len = 0;
const char *s = query;
size_t curr_len = 0;
const char *q_iter = query;
while (total_len < max_len)
if (!pgsm_extract_comments)
{
rc = regexec(&preg_query_comments, s, nmatch, &pmatch, 0);
if (rc != 0)
break;
comments[0] = 0;
return;
}
comment_len = pmatch.rm_eo - pmatch.rm_so;
if (total_len + comment_len > max_len)
break; /* TODO: log error in error view, insufficient
* space for comment. */
total_len += comment_len;
/* Not 1st iteration, append ", " before next comment. */
if (s != query)
while (q_iter && *q_iter)
{
/*
* multiline comments, + 1 is safe even if we've reach end of string
*/
if (*q_iter == '/' && *(q_iter + 1) == '*')
{
if (total_len + 2 > max_len)
break; /* TODO: log error in error view, insufficient
* space for ", " + comment. */
if (curr_len != 0)
{
if (!append_comment_char(comments, max_len, &curr_len, ','))
return;
if (!append_comment_char(comments, max_len, &curr_len, ' '))
return;
}
while (*q_iter && *(q_iter + 1) && (*q_iter != '*' || *(q_iter + 1) != '/'))
{
if (!append_comment_char(comments, max_len, &curr_len, *q_iter))
return;
q_iter++;
}
memcpy(comments, ", ", 2);
comments += 2;
total_len += 2;
if (*q_iter)
{
if (!append_comment_char(comments, max_len, &curr_len, *q_iter))
return;
q_iter++;
}
if (*q_iter)
{
if (!append_comment_char(comments, max_len, &curr_len, *q_iter))
return;
q_iter++;
}
}
memcpy(comments, s + pmatch.rm_so, comment_len);
comments += comment_len;
s += pmatch.rm_eo;
q_iter++;
}
comments[curr_len] = 0;
}
#if PG_VERSION_NUM < 140000