Use a hash table to de-duplicate column names in ruleutils.c.

52c707483ce4d0161127e4958d981d1b5655865e
2025-06-17 11:25:43 +00:00 · 2025-06-17 11:25:43 +00:00 · 8383666109
parent 9057c8778b
commit 8383666109
2 changed files with 168 additions and 25 deletions
--- a/1
+++ b/1
@ -0,0 +1 @@
 Subproject commit 3376bd6845f0614908ed304f5033bd644c82d3bf
--- a/src/backend/distributed/deparser/ruleutils_18.c
+++ b/src/backend/distributed/deparser/ruleutils_18.c
@ -235,6 +235,10 @@ typedef void (*rsv_callback) (Node *node, deparse_context *context,
 * of aliases to columns of the right input.  Thus, positions in the printable
 * column alias list are not necessarily one-for-one with varattnos of the
 * JOIN, so we need a separate new_colnames[] array for printing purposes.
 * 
 * Finally, when dealing with wide tables we risk O(N^2) costs in assigning
 * non-duplicate column names.  We ameliorate that by using a hash table that
 * holds all the strings appearing in colnames, new_colnames, and parentUsing.
 */
 typedef struct
 {
@ -301,6 +305,15 @@ typedef struct
 	int		   *leftattnos;		/* left-child varattnos of join cols, or 0 */
 	int		   *rightattnos;	/* right-child varattnos of join cols, or 0 */
 	List	   *usingNames;		/* names assigned to merged columns */
 	/*
 	 * Hash table holding copies of all the strings appearing in this struct's
 	 * colnames, new_colnames, and parentUsing.  We use a hash table only for
 	 * sufficiently wide relations, and only during the colname-assignment
 	 * functions set_relation_column_names and set_join_column_names;
 	 * otherwise, names_hash is NULL.
 	 */
 	HTAB	   *names_hash;		/* entries are just strings */	
 } deparse_columns;
 /* This macro is analogous to rt_fetch(), but for deparse_columns structs */
@ -342,6 +355,9 @@ static bool colname_is_unique(const char *colname, deparse_namespace *dpns,
 static char *make_colname_unique(char *colname, deparse_namespace *dpns,
 					deparse_columns *colinfo);
 static void expand_colnames_array_to(deparse_columns *colinfo, int n);
 static void build_colinfo_names_hash(deparse_columns *colinfo);
 static void add_to_names_hash(deparse_columns *colinfo, const char *name);
 static void destroy_colinfo_names_hash(deparse_columns *colinfo);
 static void identify_join_columns(JoinExpr *j, RangeTblEntry *jrte,
 					  deparse_columns *colinfo);
 static char *get_rtable_name(int rtindex, deparse_context *context);
@ -988,6 +1004,10 @@ has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode)
 *
 * parentUsing is a list of all USING aliases assigned in parent joins of
 * the current jointree node.  (The passed-in list must not be modified.)
 * 
 * Note that we do not use per-deparse_columns hash tables in this function.
 * The number of names that need to be assigned should be small enough that
 * we don't need to trouble with that.
 */
 static void
 set_using_names(deparse_namespace *dpns, Node *jtnode, List *parentUsing)
@ -1265,6 +1285,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 	colinfo->new_colnames = (char **) palloc(ncolumns * sizeof(char *));
 	colinfo->is_new_col = (bool *) palloc(ncolumns * sizeof(bool));
 	/* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
 	build_colinfo_names_hash(colinfo);	
 	/*
 	 * Scan the columns, select a unique alias for each one, and store it in
 	 * colinfo->colnames and colinfo->new_colnames.  The former array has NULL
@ -1301,6 +1324,7 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 			colname = make_colname_unique(colname, dpns, colinfo);
 			colinfo->colnames[i] = colname;
 			add_to_names_hash(colinfo, colname);
 		}
 		/* Put names of non-dropped columns in new_colnames[] too */
@ -1321,6 +1345,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 			has_anonymous = true;
 	}
 	/* We're now done needing the colinfo's names_hash */
 	destroy_colinfo_names_hash(colinfo);
 	/*
 	 * Set correct length for new_colnames[] array.  (Note: if columns have
 	 * been added, colinfo->num_cols includes them, which is not really quite
@ -1391,6 +1418,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 	expand_colnames_array_to(colinfo, noldcolumns);
 	Assert(colinfo->num_cols == noldcolumns);
 	/* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
 	build_colinfo_names_hash(colinfo);	
 	/*
 	 * Scan the join output columns, select an alias for each one, and store
 	 * it in colinfo->colnames.  If there are USING columns, set_using_names()
@ -1427,6 +1457,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 		if (rte->alias == NULL)
 		{
 			colinfo->colnames[i] = real_colname;
 			add_to_names_hash(colinfo, real_colname);
 			continue;
 		}
@ -1443,6 +1474,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 			colname = make_colname_unique(colname, dpns, colinfo);
 			colinfo->colnames[i] = colname;
 			add_to_names_hash(colinfo, colname);
 		}
 		/* Remember if any assigned aliases differ from "real" name */
@ -1541,6 +1573,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 			}
 			else
 				colinfo->new_colnames[j] = child_colname;
 			add_to_names_hash(colinfo, colinfo->new_colnames[j]);
 		}
 		colinfo->is_new_col[j] = leftcolinfo->is_new_col[jc];
@ -1590,6 +1623,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 			}
 			else
 				colinfo->new_colnames[j] = child_colname;
 			add_to_names_hash(colinfo, colinfo->new_colnames[j]);
 		}
 		colinfo->is_new_col[j] = rightcolinfo->is_new_col[jc];
@ -1611,6 +1645,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 	Assert(j == nnewcolumns);
 #endif
 	/* We're now done needing the colinfo's names_hash */
 	destroy_colinfo_names_hash(colinfo);
 	/*
 	 * For a named join, print column aliases if we changed any from the child
 	 * names.  Unnamed joins cannot print aliases.
@ -1633,6 +1670,20 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
 	int			i;
 	ListCell   *lc;
 	/*
 	 * If we have a hash table, consult that instead of linearly scanning the
 	 * colinfo's strings.
 	 */
 	if (colinfo->names_hash)
 	{
 		if (hash_search(colinfo->names_hash,
 						colname,
 						HASH_FIND,
 						NULL) != NULL)
 			return false;
 	}
 	else
 	{
 		/* Check against already-assigned column aliases within RTE */
 		for (i = 0; i < colinfo->num_cols; i++)
 		{
@ -1643,8 +1694,8 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
 		}
 		/*
-	 * If we're building a new_colnames array, check that too (this will be
+		 * If we're building a new_colnames array, check that too (this will
-	 * partially but not completely redundant with the previous checks)
+		 * be partially but not completely redundant with the previous checks)
 		 */
 		for (i = 0; i < colinfo->num_new_cols; i++)
 		{
@ -1654,17 +1705,24 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
 				return false;
 		}
-	/* Also check against USING-column names that must be globally unique */
+		/*
-	foreach(lc, dpns->using_names)
+		 * Also check against names already assigned for parent-join USING
 		 * cols
 		 */
 		foreach(lc, colinfo->parentUsing)
 		{
 			char	   *oldname = (char *) lfirst(lc);
 			if (strcmp(oldname, colname) == 0)
 				return false;
 		}
 	}
-	/* Also check against names already assigned for parent-join USING cols */
+	/*
-	foreach(lc, colinfo->parentUsing)
+	 * Also check against USING-column names that must be globally unique.
 	 * These are not hashed, but there should be few of them.
 	 */
 	foreach(lc, dpns->using_names)
 	{
 		char	   *oldname = (char *) lfirst(lc);
@ -1734,6 +1792,90 @@ expand_colnames_array_to(deparse_columns *colinfo, int n)
 	}
 }
 /*
 * build_colinfo_names_hash: set up colinfo->names_hash for a wide RTE
 *
 * When colinfo has fewer than 32 columns this returns without touching
 * colinfo->names_hash.  Otherwise it creates a string-keyed hash table and
 * seeds it with every non-NULL entry of colnames[] and new_colnames[], plus
 * every member of parentUsing.
 */
 static void
 build_colinfo_names_hash(deparse_columns *colinfo)
 {
 	HASHCTL		ctl;
 	int			size_hint;
 	int			idx;
 	ListCell   *cell;
 	/*
 	 * Narrow RTEs are not worth hashing.  The 32-column threshold is
 	 * somewhat arbitrary; it is kept low enough that the regression tests
 	 * exercise the hashed code path.
 	 */
 	if (colinfo->num_cols < 32)
 		return;
 	/*
 	 * Key and entry have the same size: each entry is nothing but the name
 	 * string itself.  The table is allocated in the current memory context.
 	 */
 	ctl.keysize = NAMEDATALEN;
 	ctl.entrysize = NAMEDATALEN;
 	ctl.hcxt = CurrentMemoryContext;
 	/* Initial size estimate: one slot per old column plus one per new column */
 	size_hint = colinfo->num_cols + colinfo->num_new_cols;
 	colinfo->names_hash = hash_create("deparse_columns names",
 									  size_hint,
 									  &ctl,
 									  HASH_ELEM | HASH_STRINGS | HASH_CONTEXT);
 	/*
 	 * Seed the table with names that were assigned before we got here (these
 	 * would have come from set_using_names).  Unassigned slots are NULL and
 	 * are skipped.
 	 */
 	for (idx = 0; idx < colinfo->num_cols; idx++)
 	{
 		if (colinfo->colnames[idx] != NULL)
 			add_to_names_hash(colinfo, colinfo->colnames[idx]);
 	}
 	for (idx = 0; idx < colinfo->num_new_cols; idx++)
 	{
 		if (colinfo->new_colnames[idx] != NULL)
 			add_to_names_hash(colinfo, colinfo->new_colnames[idx]);
 	}
 	/* Every parentUsing member is entered without a NULL check */
 	foreach(cell, colinfo->parentUsing)
 	{
 		add_to_names_hash(colinfo, (char *) lfirst(cell));
 	}
 }
 /*
 * add_to_names_hash: record a name in colinfo's names_hash
 *
 * This is a no-op when colinfo has no hash table (names_hash is NULL), so
 * callers may invoke it unconditionally.
 */
 static void
 add_to_names_hash(deparse_columns *colinfo, const char *name)
 {
 	/* No hash table in use for this colinfo: nothing to record */
 	if (colinfo->names_hash == NULL)
 		return;
 	/* The entry is just the key string, so the returned pointer is unused */
 	(void) hash_search(colinfo->names_hash, name, HASH_ENTER, NULL);
 }
 /*
 * destroy_colinfo_names_hash: release colinfo's names_hash, if it has one
 *
 * Afterwards colinfo->names_hash is NULL.  Calling this on a colinfo that
 * has no hash table does nothing.
 */
 static void
 destroy_colinfo_names_hash(deparse_columns *colinfo)
 {
 	/* Nothing was built, so there is nothing to tear down */
 	if (colinfo->names_hash == NULL)
 		return;
 	hash_destroy(colinfo->names_hash);
 	/* Reset the pointer so later checks see that no hash table is in use */
 	colinfo->names_hash = NULL;
 }
 /*
 * identify_join_columns: figure out where columns of a join come from
 *
		`@ -0,0 +1 @@`
							`Subproject commit 3376bd6845f0614908ed304f5033bd644c82d3bf`