Use a hash table to de-duplicate column names in ruleutils.c.

52c707483ce4d0161127e4958d981d1b5655865e
2025-06-17 11:25:43 +00:00 · 2025-06-17 11:25:43 +00:00 · 8383666109
parent 9057c8778b
commit 8383666109
2 changed files with 168 additions and 25 deletions
--- a/1
+++ b/1
@ -0,0 +1 @@
+Subproject commit 3376bd6845f0614908ed304f5033bd644c82d3bf
--- a/src/backend/distributed/deparser/ruleutils_18.c
+++ b/src/backend/distributed/deparser/ruleutils_18.c
@ -235,6 +235,10 @@ typedef void (*rsv_callback) (Node *node, deparse_context *context,
 * of aliases to columns of the right input.  Thus, positions in the printable
 * column alias list are not necessarily one-for-one with varattnos of the
 * JOIN, so we need a separate new_colnames[] array for printing purposes.
+ * 
+* Finally, when dealing with wide tables we risk O(N^2) costs in assigning
+ * non-duplicate column names.  We ameliorate that by using a hash table that
+ * holds all the strings appearing in colnames, new_colnames, and parentUsing.
 */
 typedef struct
 {
@ -301,6 +305,15 @@ typedef struct
 	int		   *leftattnos;		/* left-child varattnos of join cols, or 0 */
 	int		   *rightattnos;	/* right-child varattnos of join cols, or 0 */
 	List	   *usingNames;		/* names assigned to merged columns */
+
+	/*
+	 * Hash table holding copies of all the strings appearing in this struct's
+	 * colnames, new_colnames, and parentUsing.  We use a hash table only for
+	 * sufficiently wide relations, and only during the colname-assignment
+	 * functions set_relation_column_names and set_join_column_names;
+	 * otherwise, names_hash is NULL.
+	 */
+	HTAB	   *names_hash;		/* entries are just strings */	
 } deparse_columns;

 /* This macro is analogous to rt_fetch(), but for deparse_columns structs */
@ -342,6 +355,9 @@ static bool colname_is_unique(const char *colname, deparse_namespace *dpns,
 static char *make_colname_unique(char *colname, deparse_namespace *dpns,
 					deparse_columns *colinfo);
 static void expand_colnames_array_to(deparse_columns *colinfo, int n);
+static void build_colinfo_names_hash(deparse_columns *colinfo);
+static void add_to_names_hash(deparse_columns *colinfo, const char *name);
+static void destroy_colinfo_names_hash(deparse_columns *colinfo);
 static void identify_join_columns(JoinExpr *j, RangeTblEntry *jrte,
 					  deparse_columns *colinfo);
 static char *get_rtable_name(int rtindex, deparse_context *context);
@ -988,6 +1004,10 @@ has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode)
 *
 * parentUsing is a list of all USING aliases assigned in parent joins of
 * the current jointree node.  (The passed-in list must not be modified.)
+ * 
+ * Note that we do not use per-deparse_columns hash tables in this function.
+ * The number of names that need to be assigned should be small enough that
+ * we don't need to trouble with that.
 */
 static void
 set_using_names(deparse_namespace *dpns, Node *jtnode, List *parentUsing)
@ -1265,6 +1285,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 	colinfo->new_colnames = (char **) palloc(ncolumns * sizeof(char *));
 	colinfo->is_new_col = (bool *) palloc(ncolumns * sizeof(bool));

+	/* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
+	build_colinfo_names_hash(colinfo);	
+
 	/*
 	 * Scan the columns, select a unique alias for each one, and store it in
 	 * colinfo->colnames and colinfo->new_colnames.  The former array has NULL
@ -1301,6 +1324,7 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 			colname = make_colname_unique(colname, dpns, colinfo);

 			colinfo->colnames[i] = colname;
+			add_to_names_hash(colinfo, colname);
 		}

 		/* Put names of non-dropped columns in new_colnames[] too */
@ -1321,6 +1345,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 			has_anonymous = true;
 	}

+	/* We're now done needing the colinfo's names_hash */
+	destroy_colinfo_names_hash(colinfo);
+
 	/*
 	 * Set correct length for new_colnames[] array.  (Note: if columns have
 	 * been added, colinfo->num_cols includes them, which is not really quite
@ -1391,6 +1418,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 	expand_colnames_array_to(colinfo, noldcolumns);
 	Assert(colinfo->num_cols == noldcolumns);

+	/* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
+	build_colinfo_names_hash(colinfo);	
+
 	/*
 	 * Scan the join output columns, select an alias for each one, and store
 	 * it in colinfo->colnames.  If there are USING columns, set_using_names()
@ -1427,6 +1457,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 		if (rte->alias == NULL)
 		{
 			colinfo->colnames[i] = real_colname;
+			add_to_names_hash(colinfo, real_colname);
 			continue;
 		}

@ -1443,6 +1474,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 			colname = make_colname_unique(colname, dpns, colinfo);

 			colinfo->colnames[i] = colname;
+			add_to_names_hash(colinfo, colname);
 		}

 		/* Remember if any assigned aliases differ from "real" name */
@ -1541,6 +1573,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 			}
 			else
 				colinfo->new_colnames[j] = child_colname;
+			add_to_names_hash(colinfo, colinfo->new_colnames[j]);
 		}

 		colinfo->is_new_col[j] = leftcolinfo->is_new_col[jc];
@ -1590,6 +1623,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 			}
 			else
 				colinfo->new_colnames[j] = child_colname;
+			add_to_names_hash(colinfo, colinfo->new_colnames[j]);
 		}

 		colinfo->is_new_col[j] = rightcolinfo->is_new_col[jc];
@ -1611,6 +1645,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
 	Assert(j == nnewcolumns);
 #endif

+	/* We're now done needing the colinfo's names_hash */
+	destroy_colinfo_names_hash(colinfo);
+
 	/*
 	 * For a named join, print column aliases if we changed any from the child
 	 * names.  Unnamed joins cannot print aliases.
@ -1633,6 +1670,20 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
 	int			i;
 	ListCell   *lc;

+	/*
+	 * If we have a hash table, consult that instead of linearly scanning the
+	 * colinfo's strings.
+	 */
+	if (colinfo->names_hash)
+	{
+		if (hash_search(colinfo->names_hash,
+						colname,
+						HASH_FIND,
+						NULL) != NULL)
+			return false;
+	}
+	else
+	{
 		/* Check against already-assigned column aliases within RTE */
 		for (i = 0; i < colinfo->num_cols; i++)
 		{
@ -1643,8 +1694,8 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
 		}

 		/*
-	 * If we're building a new_colnames array, check that too (this will be
-	 * partially but not completely redundant with the previous checks)
+		 * If we're building a new_colnames array, check that too (this will
+		 * be partially but not completely redundant with the previous checks)
 		 */
 		for (i = 0; i < colinfo->num_new_cols; i++)
 		{
@ -1654,17 +1705,24 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
 				return false;
 		}

-	/* Also check against USING-column names that must be globally unique */
-	foreach(lc, dpns->using_names)
+		/*
+		 * Also check against names already assigned for parent-join USING
+		 * cols
+		 */
+		foreach(lc, colinfo->parentUsing)
 		{
 			char	   *oldname = (char *) lfirst(lc);

 			if (strcmp(oldname, colname) == 0)
 				return false;
 		}
+	}

-	/* Also check against names already assigned for parent-join USING cols */
-	foreach(lc, colinfo->parentUsing)
+	/*
+	 * Also check against USING-column names that must be globally unique.
+	 * These are not hashed, but there should be few of them.
+	 */
+	foreach(lc, dpns->using_names)
 	{
 		char	   *oldname = (char *) lfirst(lc);

@ -1734,6 +1792,90 @@ expand_colnames_array_to(deparse_columns *colinfo, int n)
 	}
 }

+/*
+ * build_colinfo_names_hash: optionally construct a hash table for colinfo
+ */
+static void
+build_colinfo_names_hash(deparse_columns *colinfo)
+{
+	HASHCTL		hash_ctl;
+	int			i;
+	ListCell   *lc;
+
+	/*
+	 * Use a hash table only for RTEs with at least 32 columns.  (The cutoff
+	 * is somewhat arbitrary, but let's choose it so that this code does get
+	 * exercised in the regression tests.)
+	 */
+	if (colinfo->num_cols < 32)
+		return;
+
+	/*
+	 * Set up the hash table.  The entries are just strings with no other
+	 * payload.
+	 */
+	hash_ctl.keysize = NAMEDATALEN;
+	hash_ctl.entrysize = NAMEDATALEN;
+	hash_ctl.hcxt = CurrentMemoryContext;
+	colinfo->names_hash = hash_create("deparse_columns names",
+									  colinfo->num_cols + colinfo->num_new_cols,
+									  &hash_ctl,
+									  HASH_ELEM | HASH_STRINGS | HASH_CONTEXT);
+
+	/*
+	 * Preload the hash table with any names already present (these would have
+	 * come from set_using_names).
+	 */
+	for (i = 0; i < colinfo->num_cols; i++)
+	{
+		char	   *oldname = colinfo->colnames[i];
+
+		if (oldname)
+			add_to_names_hash(colinfo, oldname);
+	}
+
+	for (i = 0; i < colinfo->num_new_cols; i++)
+	{
+		char	   *oldname = colinfo->new_colnames[i];
+
+		if (oldname)
+			add_to_names_hash(colinfo, oldname);
+	}
+
+	foreach(lc, colinfo->parentUsing)
+	{
+		char	   *oldname = (char *) lfirst(lc);
+
+		add_to_names_hash(colinfo, oldname);
+	}
+}
+
+/*
+ * add_to_names_hash: add a string to the names_hash, if we're using one
+ */
+static void
+add_to_names_hash(deparse_columns *colinfo, const char *name)
+{
+	if (colinfo->names_hash)
+		(void) hash_search(colinfo->names_hash,
+						   name,
+						   HASH_ENTER,
+						   NULL);
+}
+
+/*
+ * destroy_colinfo_names_hash: destroy hash table when done with it
+ */
+static void
+destroy_colinfo_names_hash(deparse_columns *colinfo)
+{
+	if (colinfo->names_hash)
+	{
+		hash_destroy(colinfo->names_hash);
+		colinfo->names_hash = NULL;
+	}
+}
+
 /*
 * identify_join_columns: figure out where columns of a join come from
 *
				`@ -0,0 +1 @@`
				`Subproject commit 3376bd6845f0614908ed304f5033bd644c82d3bf`