Use a hash table to de-duplicate column names in ruleutils.c.

52c707483ce4d0161127e4958d981d1b5655865e
m3hm3t/pg18_dev_relation_oid_0
Mehmet Yilmaz 2025-06-17 11:25:43 +00:00
parent 9057c8778b
commit 8383666109
2 changed files with 168 additions and 25 deletions

1
citus-tools Submodule

@ -0,0 +1 @@
Subproject commit 3376bd6845f0614908ed304f5033bd644c82d3bf

View File

@ -235,6 +235,10 @@ typedef void (*rsv_callback) (Node *node, deparse_context *context,
* of aliases to columns of the right input. Thus, positions in the printable
* column alias list are not necessarily one-for-one with varattnos of the
* JOIN, so we need a separate new_colnames[] array for printing purposes.
*
* Finally, when dealing with wide tables we risk O(N^2) costs in assigning
* non-duplicate column names. We ameliorate that by using a hash table that
* holds all the strings appearing in colnames, new_colnames, and parentUsing.
*/
typedef struct
{
@ -301,6 +305,15 @@ typedef struct
int *leftattnos; /* left-child varattnos of join cols, or 0 */
int *rightattnos; /* right-child varattnos of join cols, or 0 */
List *usingNames; /* names assigned to merged columns */
/*
* Hash table holding copies of all the strings appearing in this struct's
* colnames, new_colnames, and parentUsing. We use a hash table only for
* sufficiently wide relations, and only during the colname-assignment
* functions set_relation_column_names and set_join_column_names;
* otherwise, names_hash is NULL.
*/
HTAB *names_hash; /* entries are just strings */
} deparse_columns;
/* This macro is analogous to rt_fetch(), but for deparse_columns structs */
@ -342,6 +355,9 @@ static bool colname_is_unique(const char *colname, deparse_namespace *dpns,
static char *make_colname_unique(char *colname, deparse_namespace *dpns,
deparse_columns *colinfo);
static void expand_colnames_array_to(deparse_columns *colinfo, int n);
static void build_colinfo_names_hash(deparse_columns *colinfo);
static void add_to_names_hash(deparse_columns *colinfo, const char *name);
static void destroy_colinfo_names_hash(deparse_columns *colinfo);
static void identify_join_columns(JoinExpr *j, RangeTblEntry *jrte,
deparse_columns *colinfo);
static char *get_rtable_name(int rtindex, deparse_context *context);
@ -988,6 +1004,10 @@ has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode)
*
* parentUsing is a list of all USING aliases assigned in parent joins of
* the current jointree node. (The passed-in list must not be modified.)
*
* Note that we do not use per-deparse_columns hash tables in this function.
* The number of names that need to be assigned should be small enough that
* we don't need to trouble with that.
*/
static void
set_using_names(deparse_namespace *dpns, Node *jtnode, List *parentUsing)
@ -1265,6 +1285,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
colinfo->new_colnames = (char **) palloc(ncolumns * sizeof(char *));
colinfo->is_new_col = (bool *) palloc(ncolumns * sizeof(bool));
/* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
build_colinfo_names_hash(colinfo);
/*
* Scan the columns, select a unique alias for each one, and store it in
* colinfo->colnames and colinfo->new_colnames. The former array has NULL
@ -1301,6 +1324,7 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
colname = make_colname_unique(colname, dpns, colinfo);
colinfo->colnames[i] = colname;
add_to_names_hash(colinfo, colname);
}
/* Put names of non-dropped columns in new_colnames[] too */
@ -1321,6 +1345,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
has_anonymous = true;
}
/* We're now done needing the colinfo's names_hash */
destroy_colinfo_names_hash(colinfo);
/*
* Set correct length for new_colnames[] array. (Note: if columns have
* been added, colinfo->num_cols includes them, which is not really quite
@ -1391,6 +1418,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
expand_colnames_array_to(colinfo, noldcolumns);
Assert(colinfo->num_cols == noldcolumns);
/* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
build_colinfo_names_hash(colinfo);
/*
* Scan the join output columns, select an alias for each one, and store
* it in colinfo->colnames. If there are USING columns, set_using_names()
@ -1427,6 +1457,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
if (rte->alias == NULL)
{
colinfo->colnames[i] = real_colname;
add_to_names_hash(colinfo, real_colname);
continue;
}
@ -1443,6 +1474,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
colname = make_colname_unique(colname, dpns, colinfo);
colinfo->colnames[i] = colname;
add_to_names_hash(colinfo, colname);
}
/* Remember if any assigned aliases differ from "real" name */
@ -1541,6 +1573,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
}
else
colinfo->new_colnames[j] = child_colname;
add_to_names_hash(colinfo, colinfo->new_colnames[j]);
}
colinfo->is_new_col[j] = leftcolinfo->is_new_col[jc];
@ -1590,6 +1623,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
}
else
colinfo->new_colnames[j] = child_colname;
add_to_names_hash(colinfo, colinfo->new_colnames[j]);
}
colinfo->is_new_col[j] = rightcolinfo->is_new_col[jc];
@ -1611,6 +1645,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
Assert(j == nnewcolumns);
#endif
/* We're now done needing the colinfo's names_hash */
destroy_colinfo_names_hash(colinfo);
/*
* For a named join, print column aliases if we changed any from the child
* names. Unnamed joins cannot print aliases.
@ -1633,6 +1670,20 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
int i;
ListCell *lc;
/*
* If we have a hash table, consult that instead of linearly scanning the
* colinfo's strings.
*/
if (colinfo->names_hash)
{
if (hash_search(colinfo->names_hash,
colname,
HASH_FIND,
NULL) != NULL)
return false;
}
else
{
/* Check against already-assigned column aliases within RTE */
for (i = 0; i < colinfo->num_cols; i++)
{
@ -1643,8 +1694,8 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
}
/*
* If we're building a new_colnames array, check that too (this will be
* partially but not completely redundant with the previous checks)
* If we're building a new_colnames array, check that too (this will
* be partially but not completely redundant with the previous checks)
*/
for (i = 0; i < colinfo->num_new_cols; i++)
{
@ -1654,17 +1705,24 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
return false;
}
/* Also check against USING-column names that must be globally unique */
foreach(lc, dpns->using_names)
/*
* Also check against names already assigned for parent-join USING
* cols
*/
foreach(lc, colinfo->parentUsing)
{
char *oldname = (char *) lfirst(lc);
if (strcmp(oldname, colname) == 0)
return false;
}
}
/* Also check against names already assigned for parent-join USING cols */
foreach(lc, colinfo->parentUsing)
/*
* Also check against USING-column names that must be globally unique.
* These are not hashed, but there should be few of them.
*/
foreach(lc, dpns->using_names)
{
char *oldname = (char *) lfirst(lc);
@ -1734,6 +1792,90 @@ expand_colnames_array_to(deparse_columns *colinfo, int n)
}
}
/*
 * build_colinfo_names_hash: optionally construct a hash table for colinfo
 *
 * For a colinfo with fewer than 32 columns this does nothing and leaves
 * colinfo->names_hash untouched.  Otherwise it creates a string-keyed hash
 * table in CurrentMemoryContext, stores it in colinfo->names_hash, and
 * enters every non-NULL string found in colnames[] and new_colnames[] along
 * with every member of parentUsing.
 */
static void
build_colinfo_names_hash(deparse_columns *colinfo)
{
	HASHCTL		ctl;
	int			idx;
	ListCell   *cell;

	/*
	 * Narrow RTEs don't get a hash table; only those with at least 32
	 * columns do.  (The cutoff is somewhat arbitrary, but it is chosen so
	 * that this code does get exercised in the regression tests.)
	 */
	if (colinfo->num_cols < 32)
		return;

	/*
	 * Create the table.  Key and entry are the same size because an entry
	 * is nothing but the name string itself; there is no extra payload.
	 */
	ctl.keysize = NAMEDATALEN;
	ctl.entrysize = NAMEDATALEN;
	ctl.hcxt = CurrentMemoryContext;
	colinfo->names_hash = hash_create("deparse_columns names",
									  colinfo->num_cols + colinfo->num_new_cols,
									  &ctl,
									  HASH_ELEM | HASH_STRINGS | HASH_CONTEXT);

	/*
	 * Seed the table with whatever names have been assigned already (these
	 * would have come from set_using_names).  Unassigned slots are NULL and
	 * are skipped.
	 */
	for (idx = 0; idx < colinfo->num_cols; idx++)
	{
		char	   *assigned = colinfo->colnames[idx];

		if (assigned == NULL)
			continue;
		add_to_names_hash(colinfo, assigned);
	}

	for (idx = 0; idx < colinfo->num_new_cols; idx++)
	{
		char	   *assigned = colinfo->new_colnames[idx];

		if (assigned == NULL)
			continue;
		add_to_names_hash(colinfo, assigned);
	}

	/* Names taken by parent-join USING columns count as in use too */
	foreach(cell, colinfo->parentUsing)
	{
		add_to_names_hash(colinfo, (char *) lfirst(cell));
	}
}
/*
 * add_to_names_hash: add a string to the names_hash, if we're using one
 *
 * When colinfo->names_hash is NULL this is a no-op, so callers may invoke
 * it unconditionally.  The result of hash_search is deliberately ignored.
 */
static void
add_to_names_hash(deparse_columns *colinfo, const char *name)
{
	HTAB	   *names = colinfo->names_hash;

	/* No hash table in use for this colinfo: nothing to record */
	if (names == NULL)
		return;

	(void) hash_search(names, name, HASH_ENTER, NULL);
}
/*
 * destroy_colinfo_names_hash: destroy hash table when done with it
 *
 * Does nothing if colinfo has no hash table.  Otherwise the table is
 * destroyed and colinfo->names_hash is reset to NULL.
 */
static void
destroy_colinfo_names_hash(deparse_columns *colinfo)
{
	if (colinfo->names_hash == NULL)
		return;

	hash_destroy(colinfo->names_hash);
	/* Clear the pointer so the destroyed table is never consulted again */
	colinfo->names_hash = NULL;
}
/*
* identify_join_columns: figure out where columns of a join come from
*