Propagate statistics with optional name

2023-08-17 16:14:57 +03:00 · 2023-08-17 16:14:57 +03:00 · e619127290
parent 682dca1f12
commit e619127290
3 changed files with 254 additions and 0 deletions
--- a/src/backend/distributed/commands/statistics.c
+++ b/src/backend/distributed/commands/statistics.c
@ -26,6 +26,7 @@
 #include "catalog/pg_namespace.h"
 #include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_type.h"
+#include "commands/defrem.h"
 #include "distributed/commands/utility_hook.h"
 #include "distributed/commands.h"
 #include "distributed/deparse_shard_query.h"
@ -42,6 +43,7 @@
 #include "utils/fmgroids.h"
 #include "utils/fmgrprotos.h"
 #include "utils/lsyscache.h"
+#include "utils/regproc.h"
 #include "utils/relcache.h"
 #include "utils/ruleutils.h"
 #include "utils/syscache.h"
@ -56,6 +58,11 @@ static char * GenerateAlterIndexColumnSetStatsCommand(char *indexNameWithSchema,
 static Oid GetRelIdByStatsOid(Oid statsOid);
 static char * CreateAlterCommandIfOwnerNotDefault(Oid statsOid);
 static char * CreateAlterCommandIfTargetNotDefault(Oid statsOid);
+#if PG_VERSION_NUM >= PG_VERSION_16
+static char * ChooseExtendedStatisticName(const char *name1, const char *name2,
+										  const char *label, Oid namespaceid);
+static char * ChooseExtendedStatisticNameAddition(List *exprs);
+#endif

 /*
 * PreprocessCreateStatisticsStmt is called during the planning phase for
@ -77,6 +84,22 @@ PreprocessCreateStatisticsStmt(Node *node, const char *queryString,

 	EnsureCoordinator();

+#if PG_VERSION_NUM >= PG_VERSION_16
+	if (!(stmt->defnames))
+	{
+		Relation rel = relation_openrv((RangeVar *) relation, ShareUpdateExclusiveLock);
+
+		Oid namespaceId = RelationGetNamespace(rel);
+		char *namestr = ChooseExtendedStatisticName(RelationGetRelationName(rel),
+													ChooseExtendedStatisticNameAddition(
+														stmt->exprs),
+													"stat",
+													namespaceId);
+		stmt->defnames = stringToQualifiedNameList(namestr, NULL);
+		relation_close(rel, ShareUpdateExclusiveLock);
+	}
+#endif
+
 	QualifyTreeNode((Node *) stmt);

 	Oid statsOid = get_statistics_object_oid(stmt->defnames, true);
@ -780,3 +803,125 @@ CreateAlterCommandIfTargetNotDefault(Oid statsOid)

 	return DeparseAlterStatisticsStmt((Node *) alterStatsStmt);
 }
+
+
+#if PG_VERSION_NUM >= PG_VERSION_16
+
+/*
+ * ChooseExtendedStatisticName - copied from PG source
+ *
+ * Select a nonconflicting name for a new statistics object.
+ *
+ * name1, name2, and label are used the same way as for makeObjectName(),
+ * except that the label can't be NULL; digits will be appended to the label
+ * if needed to create a name that is unique within the specified namespace.
+ *
+ * Returns a palloc'd string.
+ *
+ * Note: it is theoretically possible to get a collision anyway, if someone
+ * else chooses the same name concurrently.  This is fairly unlikely to be
+ * a problem in practice, especially if one is holding a share update
+ * exclusive lock on the relation identified by name1.  However, if choosing
+ * multiple names within a single command, you'd better create the new object
+ * and do CommandCounterIncrement before choosing the next one!
+ */
+static char *
+ChooseExtendedStatisticName(const char *name1, const char *name2,
+							const char *label, Oid namespaceid)
+{
+	int pass = 0;
+	char *stxname = NULL;
+	char modlabel[NAMEDATALEN];
+
+	/* try the unmodified label first */
+	strlcpy(modlabel, label, sizeof(modlabel));
+
+	for (;;)
+	{
+		Oid existingstats;
+
+		stxname = makeObjectName(name1, name2, modlabel);
+
+		existingstats = GetSysCacheOid2(STATEXTNAMENSP, Anum_pg_statistic_ext_oid,
+										PointerGetDatum(stxname),
+										ObjectIdGetDatum(namespaceid));
+		if (!OidIsValid(existingstats))
+		{
+			break;
+		}
+
+		/* found a conflict, so try a new name component */
+		pfree(stxname);
+		snprintf(modlabel, sizeof(modlabel), "%s%d", label, ++pass);
+	}
+
+	return stxname;
+}
+
+
+/*
+ * ChooseExtendedStatisticNameAddition - copied from PG source
+ *
+ * Generate "name2" for a new statistics object given the list of column
+ * names for it.  This will be passed to ChooseExtendedStatisticName along
+ * with the parent table name and a suitable label.
+ *
+ * We know that less than NAMEDATALEN characters will actually be used,
+ * so we can truncate the result once we've generated that many.
+ *
+ * XXX see also ChooseForeignKeyConstraintNameAddition and
+ * ChooseIndexNameAddition.
+ */
+static char *
+ChooseExtendedStatisticNameAddition(List *exprs)
+{
+	char buf[NAMEDATALEN * 2];
+	int buflen = 0;
+	ListCell *lc;
+
+	buf[0] = '\0';
+	foreach(lc, exprs)
+	{
+		StatsElem *selem = (StatsElem *) lfirst(lc);
+		const char *name;
+
+		/* It should be one of these, but just skip if it happens not to be */
+		if (!IsA(selem, StatsElem))
+		{
+			continue;
+		}
+
+		name = selem->name;
+
+		if (buflen > 0)
+		{
+			buf[buflen++] = '_';    /* insert _ between names */
+		}
+
+		/*
+		 * We use fixed 'expr' for expressions, which have empty column names.
+		 * For indexes this is handled in ChooseIndexColumnNames, but we have
+		 * no such function for stats and it does not seem worth adding. If a
+		 * better name is needed, the user can specify it explicitly.
+		 */
+		if (!name)
+		{
+			name = "expr";
+		}
+
+		/*
+		 * At this point we have buflen <= NAMEDATALEN.  name should be less
+		 * than NAMEDATALEN already, but use strlcpy for paranoia.
+		 */
+		strlcpy(buf + buflen, name, NAMEDATALEN);
+		buflen += strlen(buf + buflen);
+		if (buflen >= NAMEDATALEN)
+		{
+			break;
+		}
+	}
+	return pstrdup(buf);
+}
+
+
+#endif
--- a/src/test/regress/expected/pg16.out
+++ b/src/test/regress/expected/pg16.out
@ -65,6 +65,69 @@ SET citus.log_remote_commands TO OFF;
 -- only verifying it works and not printing log
 -- remote commands because it can be flaky
 VACUUM (ONLY_DATABASE_STATS);
+-- New feature supported on Citus - test statistics without a name
+-- Relevant PG commit:
+-- https://github.com/postgres/postgres/commit/624aa2a13bd02dd584bb0995c883b5b93b2152df
+CREATE TABLE test_stats (
+    a   int,
+    b   int
+);
+SELECT create_distributed_table('test_stats', 'a');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+CREATE STATISTICS (dependencies) ON a, b FROM test_stats;
+CREATE STATISTICS (ndistinct, dependencies) on a, b from test_stats;
+CREATE STATISTICS (ndistinct, dependencies, mcv) on a, b from test_stats;
+-- test for distributing an already existing statistics
+CREATE TABLE test_stats2 (
+    a   int,
+    b   int
+);
+CREATE STATISTICS test_stats_a_b_stat (dependencies) ON a, b FROM test_stats2;
+ERROR:  statistics object "test_stats_a_b_stat" already exists
+SELECT create_distributed_table('test_stats2', 'a');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+-- test when stats is on a different schema
+CREATE SCHEMA sc1;
+CREATE TABLE tbl (a int, b int);
+SELECT create_distributed_table ('tbl', 'a');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+SET search_path TO sc1;
+CREATE STATISTICS ON a, b FROM pg16.tbl;
+\c - - - :worker_1_port
+-- check all statistics have been correctly created at shards
+SELECT stxnamespace, stxname
+FROM pg_statistic_ext
+WHERE stxnamespace IN (
+	SELECT oid
+	FROM pg_namespace
+	WHERE nspname IN ('pg16', 'sc1')
+) ORDER BY stxname ASC;
+ stxnamespace |           stxname
+---------------------------------------------------------------------
+        19232 | tbl_a_b_stat
+        19232 | tbl_a_b_stat_950003
+        19204 | test_stats_a_b_stat
+        19204 | test_stats_a_b_stat1
+        19204 | test_stats_a_b_stat1_950001
+        19204 | test_stats_a_b_stat2
+        19204 | test_stats_a_b_stat2_950001
+        19204 | test_stats_a_b_stat_950001
+(8 rows)
+
+\c - - - :master_port
+SET search_path TO pg16;
 \set VERBOSITY terse
 SET client_min_messages TO ERROR;
 DROP SCHEMA pg16 CASCADE;
--- a/src/test/regress/sql/pg16.sql
+++ b/src/test/regress/sql/pg16.sql
@ -45,6 +45,52 @@ SET citus.log_remote_commands TO OFF;
 -- remote commands because it can be flaky
 VACUUM (ONLY_DATABASE_STATS);

+-- New feature supported on Citus - test statistics without a name
+-- Relevant PG commit:
+-- https://github.com/postgres/postgres/commit/624aa2a13bd02dd584bb0995c883b5b93b2152df
+
+CREATE TABLE test_stats (
+    a   int,
+    b   int
+);
+
+SELECT create_distributed_table('test_stats', 'a');
+
+CREATE STATISTICS (dependencies) ON a, b FROM test_stats;
+CREATE STATISTICS (ndistinct, dependencies) on a, b from test_stats;
+CREATE STATISTICS (ndistinct, dependencies, mcv) on a, b from test_stats;
+
+-- test for distributing an already existing statistics
+CREATE TABLE test_stats2 (
+    a   int,
+    b   int
+);
+
+CREATE STATISTICS test_stats_a_b_stat (dependencies) ON a, b FROM test_stats2;
+
+SELECT create_distributed_table('test_stats2', 'a');
+
+-- test when stats is on a different schema
+CREATE SCHEMA sc1;
+CREATE TABLE tbl (a int, b int);
+SELECT create_distributed_table ('tbl', 'a');
+
+SET search_path TO sc1;
+CREATE STATISTICS ON a, b FROM pg16.tbl;
+
+\c - - - :worker_1_port
+-- check all statistics have been correctly created at shards
+SELECT stxnamespace, stxname
+FROM pg_statistic_ext
+WHERE stxnamespace IN (
+	SELECT oid
+	FROM pg_namespace
+	WHERE nspname IN ('pg16', 'sc1')
+) ORDER BY stxname ASC;
+
+\c - - - :master_port
+SET search_path TO pg16;
+
 \set VERBOSITY terse
 SET client_min_messages TO ERROR;
 DROP SCHEMA pg16 CASCADE;