From f9218d9780a88356b7f6861bb1cc07781c1df808 Mon Sep 17 00:00:00 2001
From: Benjamin O
Date: Fri, 27 Oct 2023 10:42:55 -0400
Subject: [PATCH 01/12] Support replacing IPv6 Loopback in `normalize.sed` (#7269)

I had a test failure because my machine uses the IPv6 loopback address.
This change to `normalize.sed` solves that issue.
---
 src/test/regress/bin/normalize.sed | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/test/regress/bin/normalize.sed b/src/test/regress/bin/normalize.sed
index efa9e310f..1d293e964 100644
--- a/src/test/regress/bin/normalize.sed
+++ b/src/test/regress/bin/normalize.sed
@@ -222,7 +222,7 @@ s/(CONTEXT: PL\/pgSQL function .* line )([0-9]+)/\1XX/g
 s/^(PL\/pgSQL function .* line) [0-9]+ (.*)/\1 XX \2/g
 
 # normalize a test difference in multi_move_mx
-s/ connection to server at "\w+" \(127\.0\.0\.1\), port [0-9]+ failed://g
+s/ connection to server at "\w+" (\(127\.0\.0\.1\)|\(::1\)), port [0-9]+ failed://g
 
 # normalize differences in tablespace of new index
 s/pg14\.idx.*/pg14\.xxxxx/g

From d0b093c975c8b3056f9e35c81903e7cfa05644d2 Mon Sep 17 00:00:00 2001
From: Nils Dijk
Date: Fri, 27 Oct 2023 16:57:51 +0200
Subject: [PATCH 02/12] automatically add a breakpoint that breaks on postgres errors (#7279)

When debugging postgres it is quite hard to get to the source for
`errfinish` in `elog.c`. Instead of relying on the developer to set a
breakpoint in the `elog.c` file for `errfinish` for `elevel == ERROR`,
this change adds the breakpoint to `.gdbinit`. This makes sure that
whenever a debugger is attached to a postgres backend it will break on
postgres errors.

When attaching the debugger a small banner is printed that explains how
to disable the breakpoint.
---
 .devcontainer/.gdbinit | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/.devcontainer/.gdbinit b/.devcontainer/.gdbinit
index 9c710923f..9d544512b 100644
--- a/.devcontainer/.gdbinit
+++ b/.devcontainer/.gdbinit
@@ -3,3 +3,31 @@
 # actually also works when debugging with vscode. Providing nice tools
 # to understand the internal datastructures we are working with.
 source /root/gdbpg.py
+
+# When debugging postgres it is convenient to _always_ have a breakpoint
+# trigger when an error is logged. However, .gdbinit is sourced before gdb
+# is fully attached and has the sources loaded. To make sure the breakpoint
+# is added when the library is loaded, we temporarily set breakpoint pending
+# to on. After we have added our breakpoint we revert to the default
+# configuration for breakpoint pending.
+# The breakpoint is hard to read, but at entry of the function we don't have
+# the level loaded in elevel. Instead we hardcode the location where the
+# level of the current error is stored. Also gdb doesn't understand the
+# ERROR symbol so we hardcode this to the value of ERROR. It is very unlikely
+# this value will ever change in postgres, but if it does we might need to
+# find a way to conditionally load the correct breakpoint.
+set breakpoint pending on +break elog.c:errfinish if errordata[errordata_stack_depth].elevel == 21 +set breakpoint pending auto + +echo \n +echo ----------------------------------------------------------------------------------\n +echo when attaching to a postgres backend a breakpoint will be set on elog.c:errfinish \n +echo it will only break on errors being raised in postgres \n +echo \n +echo to disable this breakpoint from vscode run `-exec disable 1` in the debug console \n +echo this assumes it's the first breakpoint loaded as it is loaded from .gdbinit \n +echo this can be verified with `-exec info break`, enabling can be done with \n +echo `-exec enable 1` \n +echo ----------------------------------------------------------------------------------\n +echo \n From ee8f4bb7e851b210b72ceb0d2d952890de14a3e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emel=20=C5=9Eim=C5=9Fek?= Date: Mon, 30 Oct 2023 09:44:13 +0300 Subject: [PATCH 03/12] Start Maintenance Daemon for Main DB at the server start. (#7254) DESCRIPTION: This change starts a maintenance deamon at the time of server start if there is a designated main database. This is the code flow: 1. User designates a main database: `ALTER SYSTEM SET citus.main_db = "myadmindb";` 2. When postmaster starts, in _PG_Init, citus calls `InitializeMaintenanceDaemonForMainDb` This function registers a background worker to run `CitusMaintenanceDaemonMain `with `databaseOid = 0 ` 3. `CitusMaintenanceDaemonMain ` takes some special actions when databaseOid is 0: - Gets the citus.main_db value. - Connects to the citus.main_db - Now the `MyDatabaseId `is available, creates a hash entry for it. - Then follows the same control flow as for a regular db, --- src/backend/distributed/shared_library_init.c | 11 + src/backend/distributed/utils/maintenanced.c | 283 +++++++++++++----- src/include/distributed/maintenanced.h | 2 + src/test/regress/citus_tests/common.py | 14 + .../test/test_maintenancedeamon.py | 74 +++++ 5 files changed, 307 insertions(+), 77 deletions(-) create mode 100644 src/test/regress/citus_tests/test/test_maintenancedeamon.py diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index 1ac20c8bc..9b5768ee7 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -481,6 +481,7 @@ _PG_init(void) #endif InitializeMaintenanceDaemon(); + InitializeMaintenanceDaemonForMainDb(); /* initialize coordinated transaction management */ InitializeTransactionManagement(); @@ -1820,6 +1821,16 @@ RegisterCitusConfigVariables(void) GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE | GUC_UNIT_MS, NULL, NULL, NULL); + DefineCustomStringVariable( + "citus.main_db", + gettext_noop("Which database is designated as the main_db"), + NULL, + &MainDb, + "", + PGC_POSTMASTER, + GUC_STANDARD, + NULL, NULL, NULL); + DefineCustomIntVariable( "citus.max_adaptive_executor_pool_size", gettext_noop("Sets the maximum number of connections per worker node used by " diff --git a/src/backend/distributed/utils/maintenanced.c b/src/backend/distributed/utils/maintenanced.c index 5f49de20a..22a0843bd 100644 --- a/src/backend/distributed/utils/maintenanced.c +++ b/src/backend/distributed/utils/maintenanced.c @@ -99,6 +99,7 @@ int Recover2PCInterval = 60000; int DeferShardDeleteInterval = 15000; int BackgroundTaskQueueCheckInterval = 5000; int MaxBackgroundTaskExecutors = 4; +char *MainDb = ""; /* config variables for metadata sync timeout */ int MetadataSyncInterval = 60000; @@ -112,7 +113,7 @@ 
static MaintenanceDaemonControlData *MaintenanceDaemonControl = NULL; * activated. */ static HTAB *MaintenanceDaemonDBHash; - +static ErrorContextCallback errorCallback = { 0 }; static volatile sig_atomic_t got_SIGHUP = false; static volatile sig_atomic_t got_SIGTERM = false; @@ -125,6 +126,8 @@ static void MaintenanceDaemonShmemExit(int code, Datum arg); static void MaintenanceDaemonErrorContext(void *arg); static bool MetadataSyncTriggeredCheckAndReset(MaintenanceDaemonDBData *dbData); static void WarnMaintenanceDaemonNotStarted(void); +static MaintenanceDaemonDBData * GetMaintenanceDaemonDBHashEntry(Oid databaseId, + bool *found); /* * InitializeMaintenanceDaemon, called at server start, is responsible for @@ -139,6 +142,82 @@ InitializeMaintenanceDaemon(void) } +/* + * GetMaintenanceDaemonDBHashEntry searches the MaintenanceDaemonDBHash for the + * databaseId. It returns the entry if found or creates a new entry and initializes + * the value with zeroes. + */ +MaintenanceDaemonDBData * +GetMaintenanceDaemonDBHashEntry(Oid databaseId, bool *found) +{ + MaintenanceDaemonDBData *dbData = (MaintenanceDaemonDBData *) hash_search( + MaintenanceDaemonDBHash, + &MyDatabaseId, + HASH_ENTER_NULL, + found); + + if (!dbData) + { + elog(LOG, + "cannot create or find the maintenance deamon hash entry for database %u", + databaseId); + return NULL; + } + + if (!*found) + { + /* ensure the values in MaintenanceDaemonDBData are zero */ + memset(((char *) dbData) + sizeof(Oid), 0, + sizeof(MaintenanceDaemonDBData) - sizeof(Oid)); + } + + return dbData; +} + + +/* + * InitializeMaintenanceDaemonForMainDb is called in _PG_Init + * at which stage we are not in a transaction or have databaseOid + */ +void +InitializeMaintenanceDaemonForMainDb(void) +{ + if (strcmp(MainDb, "") == 0) + { + elog(LOG, "There is no designated Main database."); + return; + } + + BackgroundWorker worker; + + memset(&worker, 0, sizeof(worker)); + + + strcpy_s(worker.bgw_name, sizeof(worker.bgw_name), + "Citus Maintenance Daemon for Main DB"); + + /* request ability to connect to target database */ + worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + + /* + * No point in getting started before able to run query, but we do + * want to get started on Hot-Standby. + */ + worker.bgw_start_time = BgWorkerStart_ConsistentState; + + /* Restart after a bit after errors, but don't bog the system. 
*/ + worker.bgw_restart_time = 5; + strcpy_s(worker.bgw_library_name, + sizeof(worker.bgw_library_name), "citus"); + strcpy_s(worker.bgw_function_name, sizeof(worker.bgw_library_name), + "CitusMaintenanceDaemonMain"); + + worker.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&worker); +} + + /* * InitializeMaintenanceDaemonBackend, called at backend start and * configuration changes, is responsible for starting a per-database @@ -148,31 +227,20 @@ void InitializeMaintenanceDaemonBackend(void) { Oid extensionOwner = CitusExtensionOwner(); - bool found; + bool found = false; LWLockAcquire(&MaintenanceDaemonControl->lock, LW_EXCLUSIVE); - MaintenanceDaemonDBData *dbData = (MaintenanceDaemonDBData *) hash_search( - MaintenanceDaemonDBHash, - &MyDatabaseId, - HASH_ENTER_NULL, - &found); + MaintenanceDaemonDBData *dbData = GetMaintenanceDaemonDBHashEntry(MyDatabaseId, + &found); if (dbData == NULL) { WarnMaintenanceDaemonNotStarted(); LWLockRelease(&MaintenanceDaemonControl->lock); - return; } - if (!found) - { - /* ensure the values in MaintenanceDaemonDBData are zero */ - memset(((char *) dbData) + sizeof(Oid), 0, - sizeof(MaintenanceDaemonDBData) - sizeof(Oid)); - } - if (IsMaintenanceDaemon) { /* @@ -271,66 +339,97 @@ WarnMaintenanceDaemonNotStarted(void) /* - * CitusMaintenanceDaemonMain is the maintenance daemon's main routine, it'll - * be started by the background worker infrastructure. If it errors out, - * it'll be restarted after a few seconds. + * ConnectToDatabase connects to the database for the given databaseOid. + * if databaseOid is 0, connects to MainDb and then creates a hash entry. + * If a hash entry cannot be created for MainDb it exits the process requesting a restart. + * However for regular databases, it exits without requesting a restart since another + * subsequent backend is expected to start the Maintenance Daemon. + * If the found hash entry has a valid workerPid, it exits + * without requesting a restart since there is already a daemon running. */ -void -CitusMaintenanceDaemonMain(Datum main_arg) +static MaintenanceDaemonDBData * +ConnectToDatabase(Oid databaseOid) { - Oid databaseOid = DatumGetObjectId(main_arg); - TimestampTz nextStatsCollectionTime USED_WITH_LIBCURL_ONLY = - TimestampTzPlusMilliseconds(GetCurrentTimestamp(), 60 * 1000); - bool retryStatsCollection USED_WITH_LIBCURL_ONLY = false; - TimestampTz lastRecoveryTime = 0; - TimestampTz lastShardCleanTime = 0; - TimestampTz lastStatStatementsPurgeTime = 0; - TimestampTz nextMetadataSyncTime = 0; + MaintenanceDaemonDBData *myDbData = NULL; - /* state kept for the background tasks queue monitor */ - TimestampTz lastBackgroundTaskQueueCheck = GetCurrentTimestamp(); - BackgroundWorkerHandle *backgroundTasksQueueBgwHandle = NULL; - bool backgroundTasksQueueWarnedForLock = false; - /* - * We do metadata sync in a separate background worker. We need its - * handle to be able to check its status. - */ - BackgroundWorkerHandle *metadataSyncBgwHandle = NULL; + bool isMainDb = false; - /* - * Look up this worker's configuration. - */ LWLockAcquire(&MaintenanceDaemonControl->lock, LW_EXCLUSIVE); - MaintenanceDaemonDBData *myDbData = (MaintenanceDaemonDBData *) - hash_search(MaintenanceDaemonDBHash, &databaseOid, - HASH_FIND, NULL); - if (!myDbData) - { - /* - * When the database crashes, background workers are restarted, but - * the state in shared memory is lost. In that case, we exit and - * wait for a session to call InitializeMaintenanceDaemonBackend - * to properly add it to the hash. 
- */ - proc_exit(0); + if (databaseOid == 0) + { + char *databaseName = MainDb; + + /* + * Since we cannot query databaseOid without initializing Postgres + * first, connect to the database by name. + */ + BackgroundWorkerInitializeConnection(databaseName, NULL, 0); + + /* + * Now we have a valid MyDatabaseId. + * Insert the hash entry for the database to the Maintenance Deamon Hash. + */ + bool found = false; + + myDbData = GetMaintenanceDaemonDBHashEntry(MyDatabaseId, &found); + + if (!myDbData) + { + /* + * If an entry cannot be created, + * return code of 1 requests worker restart + * Since BackgroundWorker for the MainDb is only registered + * once during server startup, we need to retry. + */ + proc_exit(1); + } + + if (found && myDbData->workerPid != 0) + { + /* Another maintenance daemon is running.*/ + + proc_exit(0); + } + + databaseOid = MyDatabaseId; + myDbData->userOid = GetSessionUserId(); + isMainDb = true; + } + else + { + myDbData = (MaintenanceDaemonDBData *) + hash_search(MaintenanceDaemonDBHash, &databaseOid, + HASH_FIND, NULL); + + if (!myDbData) + { + /* + * When the database crashes, background workers are restarted, but + * the state in shared memory is lost. In that case, we exit and + * wait for a session to call InitializeMaintenanceDaemonBackend + * to properly add it to the hash. + */ + + proc_exit(0); + } + + if (myDbData->workerPid != 0) + { + /* + * Another maintenance daemon is running. This usually happens because + * postgres restarts the daemon after an non-zero exit, and + * InitializeMaintenanceDaemonBackend started one before postgres did. + * In that case, the first one stays and the last one exits. + */ + + proc_exit(0); + } } - if (myDbData->workerPid != 0) - { - /* - * Another maintenance daemon is running. This usually happens because - * postgres restarts the daemon after an non-zero exit, and - * InitializeMaintenanceDaemonBackend started one before postgres did. - * In that case, the first one stays and the last one exits. - */ - - proc_exit(0); - } - - before_shmem_exit(MaintenanceDaemonShmemExit, main_arg); + before_shmem_exit(MaintenanceDaemonShmemExit, ObjectIdGetDatum(databaseOid)); /* * Signal that I am the maintenance daemon now. @@ -356,25 +455,55 @@ CitusMaintenanceDaemonMain(Datum main_arg) LWLockRelease(&MaintenanceDaemonControl->lock); - /* - * Setup error context so log messages can be properly attributed. Some of - * them otherwise sound like they might be from a normal user connection. - * Do so before setting up signals etc, so we never exit without the - * context setup. - */ - ErrorContextCallback errorCallback = { 0 }; memset(&errorCallback, 0, sizeof(errorCallback)); errorCallback.callback = MaintenanceDaemonErrorContext; errorCallback.arg = (void *) myDbData; errorCallback.previous = error_context_stack; error_context_stack = &errorCallback; - elog(LOG, "starting maintenance daemon on database %u user %u", databaseOid, myDbData->userOid); - /* connect to database, after that we can actually access catalogs */ - BackgroundWorkerInitializeConnectionByOid(databaseOid, myDbData->userOid, 0); + if (!isMainDb) + { + /* connect to database, after that we can actually access catalogs */ + BackgroundWorkerInitializeConnectionByOid(databaseOid, myDbData->userOid, 0); + } + + return myDbData; +} + + +/* + * CitusMaintenanceDaemonMain is the maintenance daemon's main routine, it'll + * be started by the background worker infrastructure. If it errors out, + * it'll be restarted after a few seconds. 
+ */ +void +CitusMaintenanceDaemonMain(Datum main_arg) +{ + Oid databaseOid = DatumGetObjectId(main_arg); + TimestampTz nextStatsCollectionTime USED_WITH_LIBCURL_ONLY = + TimestampTzPlusMilliseconds(GetCurrentTimestamp(), 60 * 1000); + bool retryStatsCollection USED_WITH_LIBCURL_ONLY = false; + TimestampTz lastRecoveryTime = 0; + TimestampTz lastShardCleanTime = 0; + TimestampTz lastStatStatementsPurgeTime = 0; + TimestampTz nextMetadataSyncTime = 0; + + /* state kept for the background tasks queue monitor */ + TimestampTz lastBackgroundTaskQueueCheck = GetCurrentTimestamp(); + BackgroundWorkerHandle *backgroundTasksQueueBgwHandle = NULL; + bool backgroundTasksQueueWarnedForLock = false; + + + /* + * We do metadata sync in a separate background worker. We need its + * handle to be able to check its status. + */ + BackgroundWorkerHandle *metadataSyncBgwHandle = NULL; + + MaintenanceDaemonDBData *myDbData = ConnectToDatabase(databaseOid); /* make worker recognizable in pg_stat_activity */ pgstat_report_appname("Citus Maintenance Daemon"); @@ -383,7 +512,7 @@ CitusMaintenanceDaemonMain(Datum main_arg) * Terminate orphaned metadata sync daemons spawned from previously terminated * or crashed maintenanced instances. */ - SignalMetadataSyncDaemon(databaseOid, SIGTERM); + SignalMetadataSyncDaemon(MyDatabaseId, SIGTERM); /* enter main loop */ while (!got_SIGTERM) @@ -945,7 +1074,7 @@ MaintenanceDaemonShmemExit(int code, Datum arg) } -/* MaintenanceDaemonSigTermHandler calls proc_exit(0) */ +/* MaintenanceDaemonSigTermHandler sets the got_SIGTERM flag.*/ static void MaintenanceDaemonSigTermHandler(SIGNAL_ARGS) { diff --git a/src/include/distributed/maintenanced.h b/src/include/distributed/maintenanced.h index de1e68883..07387a7fd 100644 --- a/src/include/distributed/maintenanced.h +++ b/src/include/distributed/maintenanced.h @@ -20,6 +20,7 @@ /* config variable for */ extern double DistributedDeadlockDetectionTimeoutFactor; +extern char *MainDb; extern void StopMaintenanceDaemon(Oid databaseId); extern void TriggerNodeMetadataSync(Oid databaseId); @@ -27,6 +28,7 @@ extern void InitializeMaintenanceDaemon(void); extern size_t MaintenanceDaemonShmemSize(void); extern void MaintenanceDaemonShmemInit(void); extern void InitializeMaintenanceDaemonBackend(void); +extern void InitializeMaintenanceDaemonForMainDb(void); extern bool LockCitusExtension(void); extern PGDLLEXPORT void CitusMaintenanceDaemonMain(Datum main_arg); diff --git a/src/test/regress/citus_tests/common.py b/src/test/regress/citus_tests/common.py index 907102482..53c9c7944 100644 --- a/src/test/regress/citus_tests/common.py +++ b/src/test/regress/citus_tests/common.py @@ -453,6 +453,9 @@ def cleanup_test_leftovers(nodes): for node in nodes: node.cleanup_schemas() + for node in nodes: + node.cleanup_databases() + for node in nodes: node.cleanup_users() @@ -753,6 +756,7 @@ class Postgres(QueryRunner): self.subscriptions = set() self.publications = set() self.replication_slots = set() + self.databases = set() self.schemas = set() self.users = set() @@ -993,6 +997,10 @@ class Postgres(QueryRunner): args = sql.SQL("") self.sql(sql.SQL("CREATE USER {} {}").format(sql.Identifier(name), args)) + def create_database(self, name): + self.databases.add(name) + self.sql(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(name))) + def create_schema(self, name): self.schemas.add(name) self.sql(sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(name))) @@ -1020,6 +1028,12 @@ class Postgres(QueryRunner): for user in self.users: self.sql(sql.SQL("DROP 
USER IF EXISTS {}").format(sql.Identifier(user))) + def cleanup_databases(self): + for database in self.databases: + self.sql( + sql.SQL("DROP DATABASE IF EXISTS {}").format(sql.Identifier(database)) + ) + def cleanup_schemas(self): for schema in self.schemas: self.sql( diff --git a/src/test/regress/citus_tests/test/test_maintenancedeamon.py b/src/test/regress/citus_tests/test/test_maintenancedeamon.py new file mode 100644 index 000000000..3f6cb501e --- /dev/null +++ b/src/test/regress/citus_tests/test/test_maintenancedeamon.py @@ -0,0 +1,74 @@ +# This test checks that once citus.main_db is set and the +# server is restarted. A Citus Maintenance Daemon for the main_db +# is launched. This should happen even if there is no query run +# in main_db yet. +import time + + +def wait_until_maintenance_deamons_start(deamoncount, cluster): + i = 0 + n = 0 + + while i < 10: + i += 1 + n = cluster.coordinator.sql_value( + "SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon';" + ) + + if n == deamoncount: + break + + time.sleep(0.1) + + assert n == deamoncount + + +def test_set_maindb(cluster_factory): + cluster = cluster_factory(0) + + # Test that once citus.main_db is set to a database name + # there are two maintenance deamons running upon restart. + # One maintenance deamon for the database of the current connection + # and one for the citus.main_db. + cluster.coordinator.create_database("mymaindb") + cluster.coordinator.configure("citus.main_db='mymaindb'") + cluster.coordinator.restart() + + assert cluster.coordinator.sql_value("SHOW citus.main_db;") == "mymaindb" + + wait_until_maintenance_deamons_start(2, cluster) + + assert ( + cluster.coordinator.sql_value( + "SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon' AND datname='mymaindb';" + ) + == 1 + ) + + # Test that once citus.main_db is set to empty string + # there is only one maintenance deamon for the database + # of the current connection. + cluster.coordinator.configure("citus.main_db=''") + cluster.coordinator.restart() + assert cluster.coordinator.sql_value("SHOW citus.main_db;") == "" + + wait_until_maintenance_deamons_start(1, cluster) + + # Test that after citus.main_db is dropped. The maintenance + # deamon for this database is terminated. + cluster.coordinator.configure("citus.main_db='mymaindb'") + cluster.coordinator.restart() + assert cluster.coordinator.sql_value("SHOW citus.main_db;") == "mymaindb" + + wait_until_maintenance_deamons_start(2, cluster) + + cluster.coordinator.sql("DROP DATABASE mymaindb;") + + wait_until_maintenance_deamons_start(1, cluster) + + assert ( + cluster.coordinator.sql_value( + "SELECT count(*) FROM pg_stat_activity WHERE application_name = 'Citus Maintenance Daemon' AND datname='mymaindb';" + ) + == 0 + ) From 83e3fb817de8fc93c4819808b0dc8925c2bb6d38 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 31 Oct 2023 14:05:09 +0100 Subject: [PATCH 04/12] Only put major Postgres version in CI task name (#7289) Making tasks in CI required before merging to master is important and useful. The way this works is by saving the exact names of the required tasks in the admin interface of the repo. It has a search box to add them so it's not completely horrible, but doing so is quite a hassle since we have so many jobs. So limiting the amount of churn in this list of required jobs is quite useful. This changes the names of tasks to only include the major versions of Postgres, not the minor ones. 
Otherwise the next time we bump the minor versions we would have to remove and re-add each of the jobs. --- .github/workflows/build_and_test.yml | 26 +++++++++---------- .../workflows/packaging-test-pipelines.yml | 8 +++--- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1f22ff034..5f087fc08 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -27,9 +27,9 @@ jobs: style_checker_image_name: "citus/stylechecker" style_checker_tools_version: "0.8.18" image_suffix: "-v9d71045" - pg14_version: "14.9" - pg15_version: "15.4" - pg16_version: "16.0" + pg14_version: '{ "major": "14", "full": "14.9" }' + pg15_version: '{ "major": "15", "full": "15.4" }' + pg16_version: '{ "major": "16", "full": "16.0" }' upgrade_pg_versions: "14.9-15.4-16.0" steps: # Since GHA jobs needs at least one step we use a noop step here. @@ -93,7 +93,7 @@ jobs: run: ci/check_migration_files.sh build: needs: params - name: Build for PG ${{ matrix.pg_version}} + name: Build for PG${{ fromJson(matrix.pg_version).major }} strategy: fail-fast: false matrix: @@ -107,7 +107,7 @@ jobs: - ${{ needs.params.outputs.pg16_version }} runs-on: ubuntu-20.04 container: - image: "${{ matrix.image_name }}:${{ matrix.pg_version }}${{ matrix.image_suffix }}" + image: "${{ matrix.image_name }}:${{ fromJson(matrix.pg_version).full }}${{ matrix.image_suffix }}" options: --user root steps: - uses: actions/checkout@v3.5.0 @@ -124,7 +124,7 @@ jobs: ./build-${{ env.PG_MAJOR }}/* ./install-${{ env.PG_MAJOR }}.tar test-citus: - name: PG${{ matrix.pg_version }} - ${{ matrix.make }} + name: PG${{ fromJson(matrix.pg_version).major }} - ${{ matrix.make }} strategy: fail-fast: false matrix: @@ -211,7 +211,7 @@ jobs: image_name: ${{ needs.params.outputs.fail_test_image_name }} runs-on: ubuntu-20.04 container: - image: "${{ matrix.image_name }}:${{ matrix.pg_version }}${{ needs.params.outputs.image_suffix }}" + image: "${{ matrix.image_name }}:${{ fromJson(matrix.pg_version).full }}${{ needs.params.outputs.image_suffix }}" options: --user root --dns=8.8.8.8 # Due to Github creates a default network for each job, we need to use # --dns= to have similar DNS settings as our other CI systems or local @@ -228,17 +228,17 @@ jobs: - uses: "./.github/actions/save_logs_and_results" if: always() with: - folder: ${{ matrix.pg_version }}_${{ matrix.make }} + folder: ${{ fromJson(matrix.pg_version).major }}_${{ matrix.make }} - uses: "./.github/actions/upload_coverage" if: always() with: flags: ${{ env.PG_MAJOR }}_${{ matrix.suite }}_${{ matrix.make }} codecov_token: ${{ secrets.CODECOV_TOKEN }} test-arbitrary-configs: - name: PG${{ matrix.pg_version }} - check-arbitrary-configs-${{ matrix.parallel }} + name: PG${{ fromJson(matrix.pg_version).major }} - check-arbitrary-configs-${{ matrix.parallel }} runs-on: ["self-hosted", "1ES.Pool=1es-gha-citusdata-pool"] container: - image: "${{ matrix.image_name }}:${{ matrix.pg_version }}${{ needs.params.outputs.image_suffix }}" + image: "${{ matrix.image_name }}:${{ fromJson(matrix.pg_version).full }}${{ needs.params.outputs.image_suffix }}" options: --user root needs: - params @@ -333,10 +333,10 @@ jobs: flags: ${{ env.old_pg_major }}_${{ env.new_pg_major }}_upgrade codecov_token: ${{ secrets.CODECOV_TOKEN }} test-citus-upgrade: - name: PG${{ needs.params.outputs.pg14_version }} - check-citus-upgrade + name: PG${{ fromJson(needs.params.outputs.pg14_version).major }} - check-citus-upgrade 
runs-on: ubuntu-20.04 container: - image: "${{ needs.params.outputs.citusupgrade_image_name }}:${{ needs.params.outputs.pg14_version }}${{ needs.params.outputs.image_suffix }}" + image: "${{ needs.params.outputs.citusupgrade_image_name }}:${{ fromJson(needs.params.outputs.pg14_version).full }}${{ needs.params.outputs.image_suffix }}" options: --user root needs: - params @@ -383,7 +383,7 @@ jobs: CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }} runs-on: ubuntu-20.04 container: - image: ${{ needs.params.outputs.test_image_name }}:${{ needs.params.outputs.pg16_version }}${{ needs.params.outputs.image_suffix }} + image: ${{ needs.params.outputs.test_image_name }}:${{ fromJson(needs.params.outputs.pg16_version).full }}${{ needs.params.outputs.image_suffix }} needs: - params - test-citus diff --git a/.github/workflows/packaging-test-pipelines.yml b/.github/workflows/packaging-test-pipelines.yml index 0fb4b7092..8690fce1f 100644 --- a/.github/workflows/packaging-test-pipelines.yml +++ b/.github/workflows/packaging-test-pipelines.yml @@ -24,9 +24,11 @@ jobs: - name: Get Postgres Versions id: get-postgres-versions run: | - # Postgres versions are stored in .github/workflows/build_and_test.yml file in "pg[pg-version]_version" - # format. Below command extracts the versions and get the unique values. - pg_versions=$(cat .github/workflows/build_and_test.yml | grep -oE 'pg[0-9]+_version: "[0-9.]+"' | sed -E 's/pg([0-9]+)_version: "([0-9.]+)"/\1/g' | sort | uniq | tr '\n', ',') + set -euxo pipefail + # Postgres versions are stored in .github/workflows/build_and_test.yml + # file in json strings with major and full keys. + # Below command extracts the versions and get the unique values. + pg_versions=$(cat .github/workflows/build_and_test.yml | grep -oE '"major": "[0-9]+", "full": "[0-9.]+"' | sed -E 's/"major": "([0-9]+)", "full": "([0-9.]+)"/\1/g' | sort | uniq | tr '\n', ',') pg_versions_array="[ ${pg_versions} ]" echo "Supported PG Versions: ${pg_versions_array}" # Below line is needed to set the output variable to be used in the next job From ce58c043049ad4fdbc2971cfda8e168d0b3d4a55 Mon Sep 17 00:00:00 2001 From: Gokhan Gulbiz Date: Tue, 31 Oct 2023 18:00:10 +0300 Subject: [PATCH 05/12] Disable CircleCI (#7276) We are switching to Github Actions. In the test period it has worked well enough, so now we can stop using CircleCI. 
--- .circleci/config.yml | 1128 ---------------------------- ci/check_all_ci_scripts_are_run.sh | 4 +- ci/check_enterprise_merge.sh | 96 --- 3 files changed, 2 insertions(+), 1226 deletions(-) delete mode 100644 .circleci/config.yml delete mode 100755 ci/check_enterprise_merge.sh diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 376c44331..000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,1128 +0,0 @@ -version: 2.1 -orbs: - codecov: codecov/codecov@1.1.1 - azure-cli: circleci/azure-cli@1.0.0 - -parameters: - image_suffix: - type: string - default: '-v9d71045' - pg14_version: - type: string - default: '14.9' - pg15_version: - type: string - default: '15.4' - pg16_version: - type: string - default: '16.0' - upgrade_pg_versions: - type: string - default: '14.9-15.4-16.0' - style_checker_tools_version: - type: string - default: '0.8.18' - flaky_test: - type: string - default: '' - flaky_test_runs_per_job: - type: integer - default: 50 - skip_flaky_tests: - type: boolean - default: false - -commands: - install_extension: - parameters: - pg_major: - description: 'postgres major version to use' - type: integer - steps: - - run: - name: 'Install Extension' - command: | - tar xfv "${CIRCLE_WORKING_DIRECTORY}/install-<< parameters.pg_major >>.tar" --directory / - - configure: - steps: - - run: - name: 'Configure' - command: | - chown -R circleci . - gosu circleci ./configure --without-pg-version-check - - enable_core: - steps: - - run: - name: 'Enable core dumps' - command: | - ulimit -c unlimited - - save_regressions: - steps: - - run: - name: 'Regressions' - command: | - if [ -f "src/test/regress/regression.diffs" ]; then - cat src/test/regress/regression.diffs - exit 1 - fi - when: on_fail - - store_artifacts: - name: 'Save regressions' - path: src/test/regress/regression.diffs - - save_logs_and_results: - steps: - - store_artifacts: - name: 'Save mitmproxy output (failure test specific)' - path: src/test/regress/proxy.output - - store_artifacts: - name: 'Save results' - path: src/test/regress/results/ - - store_artifacts: - name: 'Save coordinator log' - path: src/test/regress/tmp_check/master/log - - store_artifacts: - name: 'Save worker1 log' - path: src/test/regress/tmp_check/worker.57637/log - - store_artifacts: - name: 'Save worker2 log' - path: src/test/regress/tmp_check/worker.57638/log - - stack_trace: - steps: - - run: - name: 'Print stack traces' - command: | - ./ci/print_stack_trace.sh - when: on_fail - - coverage: - parameters: - flags: - description: 'codecov flags' - type: string - steps: - - codecov/upload: - flags: '<< parameters.flags >>' - - run: - name: 'Create codeclimate coverage' - command: | - lcov --directory . --capture --output-file lcov.info - lcov --remove lcov.info -o lcov.info '/usr/*' - sed "s=^SF:$PWD/=SF:=g" -i lcov.info # relative pats are required by codeclimate - mkdir -p /tmp/codeclimate - # We started getting permissions error. This fixes them and since - # weqre not on a multi-user system so this is safe to do. 
- git config --global --add safe.directory /home/circleci/project - cc-test-reporter format-coverage -t lcov -o /tmp/codeclimate/$CIRCLE_JOB.json lcov.info - - persist_to_workspace: - root: /tmp - paths: - - codeclimate/*.json - -jobs: - build: - description: Build the citus extension - parameters: - pg_major: - description: postgres major version building citus for - type: integer - image: - description: docker image to use for the build - type: string - default: citus/extbuilder - image_tag: - description: tag to use for the docker image - type: string - docker: - - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' - steps: - - checkout - - run: - name: 'Configure, Build, and Install' - command: | - ./ci/build-citus.sh - - persist_to_workspace: - root: . - paths: - - build-<< parameters.pg_major >>/* - - install-<>.tar - - check-style: - docker: - - image: 'citus/stylechecker:<< pipeline.parameters.style_checker_tools_version >><< pipeline.parameters.image_suffix >>' - steps: - - checkout - - run: - name: 'Check C Style' - command: citus_indent --check - - run: - name: 'Check Python style' - command: black --check . - - run: - name: 'Check Python import order' - command: isort --check . - - run: - name: 'Check Python lints' - command: flake8 . - - run: - name: 'Fix whitespace' - command: ci/editorconfig.sh && git diff --exit-code - - run: - name: 'Remove useless declarations' - command: ci/remove_useless_declarations.sh && git diff --cached --exit-code - - run: - name: 'Normalize test output' - command: ci/normalize_expected.sh && git diff --exit-code - - run: - name: 'Check for C-style comments in migration files' - command: ci/disallow_c_comments_in_migrations.sh && git diff --exit-code - - run: - name: 'Check for comment--cached ns that start with # character in spec files' - command: ci/disallow_hash_comments_in_spec_files.sh && git diff --exit-code - - run: - name: 'Check for gitignore entries .for source files' - command: ci/fix_gitignore.sh && git diff --exit-code - - run: - name: 'Check for lengths of changelog entries' - command: ci/disallow_long_changelog_entries.sh - - run: - name: 'Check for banned C API usage' - command: ci/banned.h.sh - - run: - name: 'Check for tests missing in schedules' - command: ci/check_all_tests_are_run.sh - - run: - name: 'Check if all CI scripts are actually run' - command: ci/check_all_ci_scripts_are_run.sh - - run: - name: 'Check if all GUCs are sorted alphabetically' - command: ci/check_gucs_are_alphabetically_sorted.sh - - run: - name: 'Check for missing downgrade scripts' - command: ci/check_migration_files.sh - - check-sql-snapshots: - docker: - - image: 'citus/extbuilder:latest' - steps: - - checkout - - run: - name: 'Check Snapshots' - command: ci/check_sql_snapshots.sh - - test-pg-upgrade: - description: Runs postgres upgrade tests - parameters: - old_pg_major: - description: 'postgres major version to use before the upgrade' - type: integer - new_pg_major: - description: 'postgres major version to upgrade to' - type: integer - image: - description: 'docker image to use as for the tests' - type: string - default: citus/pgupgradetester - image_tag: - description: 'docker image tag to use' - type: string - docker: - - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' - working_directory: /home/circleci/project - steps: - - checkout - - attach_workspace: - at: . 
- - install_extension: - pg_major: << parameters.old_pg_major >> - - install_extension: - pg_major: << parameters.new_pg_major >> - - configure - - enable_core - - run: - name: 'Install and test postgres upgrade' - command: | - gosu circleci \ - make -C src/test/regress \ - check-pg-upgrade \ - old-bindir=/usr/lib/postgresql/<< parameters.old_pg_major >>/bin \ - new-bindir=/usr/lib/postgresql/<< parameters.new_pg_major >>/bin - no_output_timeout: 2m - - run: - name: 'Copy pg_upgrade logs for newData dir' - command: | - mkdir -p /tmp/pg_upgrade_newData_logs - if ls src/test/regress/tmp_upgrade/newData/*.log 1> /dev/null 2>&1; then - cp src/test/regress/tmp_upgrade/newData/*.log /tmp/pg_upgrade_newData_logs - fi - when: on_fail - - store_artifacts: - name: 'Save pg_upgrade logs for newData dir' - path: /tmp/pg_upgrade_newData_logs - - save_logs_and_results - - save_regressions - - stack_trace - - coverage: - flags: 'test_<< parameters.old_pg_major >>_<< parameters.new_pg_major >>,upgrade' - - test-pytest: - description: Runs pytest based tests - parameters: - pg_major: - description: 'postgres major version' - type: integer - image: - description: 'docker image to use as for the tests' - type: string - default: citus/failtester - image_tag: - description: 'docker image tag to use' - type: string - docker: - - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' - working_directory: /home/circleci/project - steps: - - checkout - - attach_workspace: - at: . - - install_extension: - pg_major: << parameters.pg_major >> - - configure - - enable_core - - run: - name: 'Run pytest' - command: | - gosu circleci \ - make -C src/test/regress check-pytest - no_output_timeout: 2m - - stack_trace - - coverage: - flags: 'test_<< parameters.pg_major >>,pytest' - - - test-arbitrary-configs: - description: Runs tests on arbitrary configs - parallelism: 6 - parameters: - pg_major: - description: 'postgres major version to use' - type: integer - image: - description: 'docker image to use as for the tests' - type: string - default: citus/failtester - image_tag: - description: 'docker image tag to use' - type: string - docker: - - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' - resource_class: xlarge - working_directory: /home/circleci/project - steps: - - checkout - - attach_workspace: - at: . 
- - install_extension: - pg_major: << parameters.pg_major >> - - configure - - enable_core - - run: - name: 'Test arbitrary configs' - command: | - TESTS=$(src/test/regress/citus_tests/print_test_names.py | circleci tests split) - # Our test suite expects comma separated values - TESTS=$(echo $TESTS | tr ' ' ',') - # TESTS will contain subset of configs that will be run on a container and we use multiple containers - # to run the test suite - gosu circleci \ - make -C src/test/regress \ - check-arbitrary-configs parallel=4 CONFIGS=$TESTS - no_output_timeout: 2m - - run: - name: 'Show regressions' - command: | - find src/test/regress/tmp_citus_test/ -name "regression*.diffs" -exec cat {} + - lines=$(find src/test/regress/tmp_citus_test/ -name "regression*.diffs" | wc -l) - if [ $lines -ne 0 ]; then - exit 1 - fi - - when: on_fail - - run: - name: 'Copy logfiles' - command: | - mkdir src/test/regress/tmp_citus_test/logfiles - find src/test/regress/tmp_citus_test/ -name "logfile_*" -exec cp -t src/test/regress/tmp_citus_test/logfiles/ {} + - when: on_fail - - store_artifacts: - name: 'Save logfiles' - path: src/test/regress/tmp_citus_test/logfiles - - save_logs_and_results - - stack_trace - - coverage: - flags: 'test_<< parameters.pg_major >>,upgrade' - - test-citus-upgrade: - description: Runs citus upgrade tests - parameters: - pg_major: - description: 'postgres major version' - type: integer - image: - description: 'docker image to use as for the tests' - type: string - default: citus/citusupgradetester - image_tag: - description: 'docker image tag to use' - type: string - docker: - - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' - working_directory: /home/circleci/project - steps: - - checkout - - attach_workspace: - at: . 
- - configure - - enable_core - - run: - name: 'Install and test citus upgrade' - command: | - # run make check-citus-upgrade for all citus versions - # the image has ${CITUS_VERSIONS} set with all verions it contains the binaries of - for citus_version in ${CITUS_VERSIONS}; do \ - gosu circleci \ - make -C src/test/regress \ - check-citus-upgrade \ - bindir=/usr/lib/postgresql/${PG_MAJOR}/bin \ - citus-old-version=${citus_version} \ - citus-pre-tar=/install-pg${PG_MAJOR}-citus${citus_version}.tar \ - citus-post-tar=/home/circleci/project/install-$PG_MAJOR.tar; \ - done; - - # run make check-citus-upgrade-mixed for all citus versions - # the image has ${CITUS_VERSIONS} set with all verions it contains the binaries of - for citus_version in ${CITUS_VERSIONS}; do \ - gosu circleci \ - make -C src/test/regress \ - check-citus-upgrade-mixed \ - citus-old-version=${citus_version} \ - bindir=/usr/lib/postgresql/${PG_MAJOR}/bin \ - citus-pre-tar=/install-pg${PG_MAJOR}-citus${citus_version}.tar \ - citus-post-tar=/home/circleci/project/install-$PG_MAJOR.tar; \ - done; - no_output_timeout: 2m - - save_logs_and_results - - save_regressions - - stack_trace - - coverage: - flags: 'test_<< parameters.pg_major >>,upgrade' - - test-query-generator: - description: Expects that the generated queries that are run on distributed and local tables would have the same results - parameters: - pg_major: - description: 'postgres major version' - type: integer - image: - description: 'docker image to use as for the tests' - type: string - default: citus/failtester - image_tag: - description: 'docker image tag to use' - type: string - docker: - - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' - working_directory: /home/circleci/project - steps: - - checkout - - attach_workspace: - at: . 
- - install_extension: - pg_major: << parameters.pg_major >> - - configure - - enable_core - - run: - name: 'Run Test' - command: | - gosu circleci make -C src/test/regress check-query-generator - no_output_timeout: 5m - - run: - name: 'Show regressions' - command: | - find src/test/regress/citus_tests/query_generator/out/ -name "local_dist.diffs" -exec cat {} + - lines=$(find src/test/regress/citus_tests/query_generator/out/ -name "local_dist.diffs" | wc -l) - if [ $lines -ne 0 ]; then - exit 1 - fi - when: on_fail - - run: - name: 'Copy logfiles' - command: | - mkdir src/test/regress/tmp_citus_test/logfiles - find src/test/regress/tmp_citus_test/ -name "logfile_*" -exec cp -t src/test/regress/tmp_citus_test/logfiles/ {} + - when: on_fail - - store_artifacts: - name: 'Save logfiles' - path: src/test/regress/tmp_citus_test/logfiles - - store_artifacts: - name: 'Save ddls' - path: src/test/regress/citus_tests/query_generator/out/ddls.sql - - store_artifacts: - name: 'Save dmls' - path: src/test/regress/citus_tests/query_generator/out/queries.sql - - store_artifacts: - name: 'Save diffs' - path: src/test/regress/citus_tests/query_generator/out/local_dist.diffs - - stack_trace - - coverage: - flags: 'test_<< parameters.pg_major >>,querygen' - - test-citus: - description: Runs the common tests of citus - parameters: - pg_major: - description: 'postgres major version' - type: integer - image: - description: 'docker image to use as for the tests' - type: string - default: citus/exttester - image_tag: - description: 'docker image tag to use' - type: string - make: - description: 'make target' - type: string - docker: - - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' - working_directory: /home/circleci/project - steps: - - checkout - - attach_workspace: - at: . - - install_extension: - pg_major: << parameters.pg_major >> - - configure - - enable_core - - run: - name: 'Run Test' - command: | - gosu circleci make -C src/test/regress << parameters.make >> - no_output_timeout: 2m - - save_logs_and_results - - save_regressions - - stack_trace - - coverage: - flags: 'test_<< parameters.pg_major >>,<< parameters.make >>' - - tap-test-citus: - description: Runs tap tests for citus - parameters: - pg_major: - description: 'postgres major version' - type: integer - image: - description: 'docker image to use as for the tests' - type: string - default: citus/exttester - image_tag: - description: 'docker image tag to use' - type: string - suite: - description: 'name of the tap test suite to run' - type: string - make: - description: 'make target' - type: string - default: installcheck - docker: - - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' - working_directory: /home/circleci/project - steps: - - checkout - - attach_workspace: - at: . 
- - install_extension: - pg_major: << parameters.pg_major >> - - configure - - enable_core - - run: - name: 'Run Test' - command: | - gosu circleci make -C src/test/<< parameters.suite >> << parameters.make >> - no_output_timeout: 2m - - store_artifacts: - name: 'Save tap logs' - path: /home/circleci/project/src/test/<< parameters.suite >>/tmp_check/log - - save_logs_and_results - - stack_trace - - coverage: - flags: 'test_<< parameters.pg_major >>,tap_<< parameters.suite >>_<< parameters.make >>' - - check-merge-to-enterprise: - docker: - - image: citus/extbuilder:<< pipeline.parameters.pg14_version >> - working_directory: /home/circleci/project - steps: - - checkout - - run: - command: | - ci/check_enterprise_merge.sh - - ch_benchmark: - docker: - - image: buildpack-deps:stretch - working_directory: /home/circleci/project - steps: - - checkout - - azure-cli/install - - azure-cli/login-with-service-principal - - run: - command: | - cd ./src/test/hammerdb - sh run_hammerdb.sh citusbot_ch_benchmark_rg - name: install dependencies and run ch_benchmark tests - no_output_timeout: 20m - - tpcc_benchmark: - docker: - - image: buildpack-deps:stretch - working_directory: /home/circleci/project - steps: - - checkout - - azure-cli/install - - azure-cli/login-with-service-principal - - run: - command: | - cd ./src/test/hammerdb - sh run_hammerdb.sh citusbot_tpcc_benchmark_rg - name: install dependencies and run ch_benchmark tests - no_output_timeout: 20m - - test-flakyness: - description: Runs a test multiple times to see if it's flaky - parallelism: 32 - parameters: - pg_major: - description: 'postgres major version' - type: integer - image: - description: 'docker image to use as for the tests' - type: string - default: citus/failtester - image_tag: - description: 'docker image tag to use' - type: string - test: - description: 'the test file path that should be run multiple times' - type: string - default: '' - runs: - description: 'number of times that the test should be run in total' - type: integer - default: 8 - skip: - description: 'A flag to bypass flaky test detection.' - type: boolean - default: false - docker: - - image: '<< parameters.image >>:<< parameters.image_tag >><< pipeline.parameters.image_suffix >>' - working_directory: /home/circleci/project - resource_class: small - steps: - - checkout - - attach_workspace: - at: . - - run: - name: 'Detect regression tests need to be ran' - command: | - skip=<< parameters.skip >> - if [ "$skip" = true ]; then - echo "Skipping flaky test detection." - circleci-agent step halt - fi - - testForDebugging="<< parameters.test >>" - - if [ -z "$testForDebugging" ]; then - detected_changes=$(git diff origin/main... --name-only --diff-filter=AM | (grep 'src/test/regress/sql/.*\.sql\|src/test/regress/spec/.*\.spec\|src/test/regress/citus_tests/test/test_.*\.py' || true)) - tests=${detected_changes} - else - tests=$testForDebugging; - fi - - if [ -z "$tests" ]; then - echo "No test found." 
- circleci-agent step halt - else - echo "Detected tests " $tests - fi - - echo export tests=\""$tests"\" >> "$BASH_ENV" - source "$BASH_ENV" - - install_extension: - pg_major: << parameters.pg_major >> - - configure - - enable_core - - run: - name: 'Run minimal tests' - command: | - tests_array=($tests) - for test in "${tests_array[@]}" - do - test_name=$(echo "$test" | sed -r "s/.+\/(.+)\..+/\1/") - gosu circleci src/test/regress/citus_tests/run_test.py $test_name --repeat << parameters.runs >> --use-base-schedule --use-whole-schedule-line - done - no_output_timeout: 2m - - save_logs_and_results - - save_regressions - - stack_trace - - upload-coverage: - docker: - - image: 'citus/exttester:<< pipeline.parameters.pg15_version >><< pipeline.parameters.image_suffix >>' - working_directory: /home/circleci/project - steps: - - attach_workspace: - at: . - - run: - name: Upload coverage results to Code Climate - command: | - cc-test-reporter sum-coverage codeclimate/*.json -o total.json - cc-test-reporter upload-coverage -i total.json - -workflows: - version: 2 - flaky_test_debugging: - jobs: - - build: - name: build-flaky-15 - pg_major: 15 - image_tag: '<< pipeline.parameters.pg15_version >>' - - - test-flakyness: - name: 'test-15_flaky' - pg_major: 15 - image_tag: '<< pipeline.parameters.pg15_version >>' - requires: [build-flaky-15] - test: '<< pipeline.parameters.flaky_test >>' - runs: << pipeline.parameters.flaky_test_runs_per_job >> - - build_and_test: - jobs: - - build: - name: build-14 - pg_major: 14 - image_tag: '<< pipeline.parameters.pg14_version >>' - - build: - name: build-15 - pg_major: 15 - image_tag: '<< pipeline.parameters.pg15_version >>' - - build: - name: build-16 - pg_major: 16 - image_tag: '<< pipeline.parameters.pg16_version >>' - - - check-style - - check-sql-snapshots - - - test-citus: &test-citus-14 - name: 'test-14_check-split' - make: check-split - pg_major: 14 - image_tag: '<< pipeline.parameters.pg14_version >>' - requires: [build-14] - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-enterprise' - make: check-enterprise - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-enterprise-isolation' - make: check-enterprise-isolation - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-enterprise-isolation-logicalrep-1' - make: check-enterprise-isolation-logicalrep-1 - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-enterprise-isolation-logicalrep-2' - make: check-enterprise-isolation-logicalrep-2 - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-enterprise-isolation-logicalrep-3' - make: check-enterprise-isolation-logicalrep-3 - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-enterprise-failure' - image: citus/failtester - make: check-enterprise-failure - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-multi' - make: check-multi - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-multi-1' - make: check-multi-1 - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-mx' - make: check-multi-mx - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-vanilla' - make: check-vanilla - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-isolation' - make: check-isolation - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-operations' - make: check-operations - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-follower-cluster' - make: check-follower-cluster - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-columnar' - make: check-columnar - - test-citus: - <<: 
*test-citus-14 - name: 'test-14_check-columnar-isolation' - make: check-columnar-isolation - - test-citus: - <<: *test-citus-14 - name: 'test-14_check-failure' - image: citus/failtester - make: check-failure - - - test-citus: &test-citus-15 - name: 'test-15_check-split' - make: check-split - pg_major: 15 - image_tag: '<< pipeline.parameters.pg15_version >>' - requires: [build-15] - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-enterprise' - make: check-enterprise - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-enterprise-isolation' - make: check-enterprise-isolation - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-enterprise-isolation-logicalrep-1' - make: check-enterprise-isolation-logicalrep-1 - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-enterprise-isolation-logicalrep-2' - make: check-enterprise-isolation-logicalrep-2 - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-enterprise-isolation-logicalrep-3' - make: check-enterprise-isolation-logicalrep-3 - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-enterprise-failure' - image: citus/failtester - make: check-enterprise-failure - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-multi' - make: check-multi - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-multi-1' - make: check-multi-1 - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-mx' - make: check-multi-mx - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-vanilla' - make: check-vanilla - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-isolation' - make: check-isolation - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-operations' - make: check-operations - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-follower-cluster' - make: check-follower-cluster - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-columnar' - make: check-columnar - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-columnar-isolation' - make: check-columnar-isolation - - test-citus: - <<: *test-citus-15 - name: 'test-15_check-failure' - image: citus/failtester - make: check-failure - - - test-citus: &test-citus-16 - name: 'test-16_check-split' - make: check-split - pg_major: 16 - image_tag: '<< pipeline.parameters.pg16_version >>' - requires: [build-16] - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-enterprise' - make: check-enterprise - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-enterprise-isolation' - make: check-enterprise-isolation - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-enterprise-isolation-logicalrep-1' - make: check-enterprise-isolation-logicalrep-1 - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-enterprise-isolation-logicalrep-2' - make: check-enterprise-isolation-logicalrep-2 - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-enterprise-isolation-logicalrep-3' - make: check-enterprise-isolation-logicalrep-3 - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-enterprise-failure' - image: citus/failtester - make: check-enterprise-failure - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-multi' - make: check-multi - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-multi-1' - make: check-multi-1 - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-mx' - make: check-multi-mx - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-vanilla' - make: check-vanilla - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-isolation' - make: check-isolation - 
- test-citus: - <<: *test-citus-16 - name: 'test-16_check-operations' - make: check-operations - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-follower-cluster' - make: check-follower-cluster - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-columnar' - make: check-columnar - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-columnar-isolation' - make: check-columnar-isolation - - test-citus: - <<: *test-citus-16 - name: 'test-16_check-failure' - image: citus/failtester - make: check-failure - - - test-pytest: - name: 'test-14_pytest' - pg_major: 14 - image_tag: '<< pipeline.parameters.pg14_version >>' - requires: [build-14] - - - test-pytest: - name: 'test-15_pytest' - pg_major: 15 - image_tag: '<< pipeline.parameters.pg15_version >>' - requires: [build-15] - - - test-pytest: - name: 'test-16_pytest' - pg_major: 16 - image_tag: '<< pipeline.parameters.pg16_version >>' - requires: [build-16] - - - tap-test-citus: - name: 'test-15_tap-cdc' - suite: cdc - pg_major: 15 - image_tag: '<< pipeline.parameters.pg15_version >>' - requires: [build-15] - - - tap-test-citus: - name: 'test-16_tap-cdc' - suite: cdc - pg_major: 16 - image_tag: '<< pipeline.parameters.pg16_version >>' - requires: [build-16] - - - test-arbitrary-configs: - name: 'test-14_check-arbitrary-configs' - pg_major: 14 - image_tag: '<< pipeline.parameters.pg14_version >>' - requires: [build-14] - - - test-arbitrary-configs: - name: 'test-15_check-arbitrary-configs' - pg_major: 15 - image_tag: '<< pipeline.parameters.pg15_version >>' - requires: [build-15] - - - test-arbitrary-configs: - name: 'test-16_check-arbitrary-configs' - pg_major: 16 - image_tag: '<< pipeline.parameters.pg16_version >>' - requires: [build-16] - - - test-query-generator: - name: 'test-14_check-query-generator' - pg_major: 14 - image_tag: '<< pipeline.parameters.pg14_version >>' - requires: [build-14] - - - test-query-generator: - name: 'test-15_check-query-generator' - pg_major: 15 - image_tag: '<< pipeline.parameters.pg15_version >>' - requires: [build-15] - - - test-query-generator: - name: 'test-16_check-query-generator' - pg_major: 16 - image_tag: '<< pipeline.parameters.pg16_version >>' - requires: [build-16] - - - test-pg-upgrade: - name: 'test-14-15_check-pg-upgrade' - old_pg_major: 14 - new_pg_major: 15 - image_tag: '<< pipeline.parameters.upgrade_pg_versions >>' - requires: [build-14, build-15] - - - test-pg-upgrade: - name: 'test-15-16_check-pg-upgrade' - old_pg_major: 15 - new_pg_major: 16 - image_tag: '<< pipeline.parameters.upgrade_pg_versions >>' - requires: [build-15, build-16] - - - test-pg-upgrade: - name: 'test-14-16_check-pg-upgrade' - old_pg_major: 14 - new_pg_major: 16 - image_tag: '<< pipeline.parameters.upgrade_pg_versions >>' - requires: [build-14, build-16] - - - test-citus-upgrade: - name: test-14_check-citus-upgrade - pg_major: 14 - image_tag: '<< pipeline.parameters.pg14_version >>' - requires: [build-14] - - - upload-coverage: - requires: - - test-14_check-multi - - test-14_check-multi-1 - - test-14_check-mx - - test-14_check-vanilla - - test-14_check-isolation - - test-14_check-operations - - test-14_check-follower-cluster - - test-14_check-columnar - - test-14_check-columnar-isolation - - test-14_check-failure - - test-14_check-enterprise - - test-14_check-enterprise-isolation - - test-14_check-enterprise-isolation-logicalrep-1 - - test-14_check-enterprise-isolation-logicalrep-2 - - test-14_check-enterprise-isolation-logicalrep-3 - - test-14_check-enterprise-failure - - test-14_check-split - - 
test-14_check-arbitrary-configs - - test-14_check-query-generator - - test-15_check-multi - - test-15_check-multi-1 - - test-15_check-mx - - test-15_check-vanilla - - test-15_check-isolation - - test-15_check-operations - - test-15_check-follower-cluster - - test-15_check-columnar - - test-15_check-columnar-isolation - - test-15_check-failure - - test-15_check-enterprise - - test-15_check-enterprise-isolation - - test-15_check-enterprise-isolation-logicalrep-1 - - test-15_check-enterprise-isolation-logicalrep-2 - - test-15_check-enterprise-isolation-logicalrep-3 - - test-15_check-enterprise-failure - - test-15_check-split - - test-15_check-arbitrary-configs - - test-15_check-query-generator - - test-16_check-multi - - test-16_check-multi-1 - - test-16_check-mx - - test-16_check-vanilla - - test-16_check-isolation - - test-16_check-operations - - test-16_check-follower-cluster - - test-16_check-columnar - - test-16_check-columnar-isolation - - test-16_check-failure - - test-16_check-enterprise - - test-16_check-enterprise-isolation - - test-16_check-enterprise-isolation-logicalrep-1 - - test-16_check-enterprise-isolation-logicalrep-2 - - test-16_check-enterprise-isolation-logicalrep-3 - - test-16_check-enterprise-failure - - test-16_check-split - - test-16_check-arbitrary-configs - - test-16_check-query-generator - - test-14-15_check-pg-upgrade - - test-15-16_check-pg-upgrade - - test-14-16_check-pg-upgrade - - test-14_check-citus-upgrade - - - ch_benchmark: - requires: [build-14] - filters: - branches: - only: - - /ch_benchmark\/.*/ # match with ch_benchmark/ prefix - - tpcc_benchmark: - requires: [build-14] - filters: - branches: - only: - - /tpcc_benchmark\/.*/ # match with tpcc_benchmark/ prefix - - test-flakyness: - name: 'test-15_flaky' - pg_major: 15 - image_tag: '<< pipeline.parameters.pg15_version >>' - requires: [build-15] - skip: << pipeline.parameters.skip_flaky_tests >> diff --git a/ci/check_all_ci_scripts_are_run.sh b/ci/check_all_ci_scripts_are_run.sh index 0b7abb3e3..12516f793 100755 --- a/ci/check_all_ci_scripts_are_run.sh +++ b/ci/check_all_ci_scripts_are_run.sh @@ -14,8 +14,8 @@ ci_scripts=$( grep -v -E '^(ci_helpers.sh|fix_style.sh)$' ) for script in $ci_scripts; do - if ! grep "\\bci/$script\\b" .circleci/config.yml > /dev/null; then - echo "ERROR: CI script with name \"$script\" is not actually used in .circleci/config.yml" + if ! grep "\\bci/$script\\b" -r .github > /dev/null; then + echo "ERROR: CI script with name \"$script\" is not actually used in .github folder" exit 1 fi if ! grep "^## \`$script\`\$" ci/README.md > /dev/null; then diff --git a/ci/check_enterprise_merge.sh b/ci/check_enterprise_merge.sh deleted file mode 100755 index d29ffcad8..000000000 --- a/ci/check_enterprise_merge.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash - -# Testing this script locally requires you to set the following environment -# variables: -# CIRCLE_BRANCH, GIT_USERNAME and GIT_TOKEN - -# fail if trying to reference a variable that is not set. -set -u -# exit immediately if a command fails -set -e -# Fail on pipe failures -set -o pipefail - -PR_BRANCH="${CIRCLE_BRANCH}" -ENTERPRISE_REMOTE="https://${GIT_USERNAME}:${GIT_TOKEN}@github.com/citusdata/citus-enterprise" - -# shellcheck disable=SC1091 -source ci/ci_helpers.sh - -# List executed commands. This is done so debugging this script is easier when -# it fails. 
It's explicitly done after git remote add so username and password -# are not shown in CI output (even though it's also filtered out by CircleCI) -set -x - -check_compile () { - echo "INFO: checking if merged code can be compiled" - ./configure --without-libcurl - make -j10 -} - -# Clone current git repo (which should be community) to a temporary working -# directory and go there -GIT_DIR_ROOT="$(git rev-parse --show-toplevel)" -TMP_GIT_DIR="$(mktemp --directory -t citus-merge-check.XXXXXXXXX)" -git clone "$GIT_DIR_ROOT" "$TMP_GIT_DIR" -cd "$TMP_GIT_DIR" - -# Fails in CI without this -git config user.email "citus-bot@microsoft.com" -git config user.name "citus bot" - -# Disable "set -x" temporarily, because $ENTERPRISE_REMOTE contains passwords -{ set +x ; } 2> /dev/null -git remote add enterprise "$ENTERPRISE_REMOTE" -set -x - -git remote set-url --push enterprise no-pushing - -# Fetch enterprise-master -git fetch enterprise enterprise-master - - -git checkout "enterprise/enterprise-master" - -if git merge --no-commit "origin/$PR_BRANCH"; then - echo "INFO: community PR branch could be merged into enterprise-master" - # check that we can compile after the merge - if check_compile; then - exit 0 - fi - - echo "WARN: Failed to compile after community PR branch was merged into enterprise" -fi - -# undo partial merge -git merge --abort - -# If we have a conflict on enterprise merge on the master branch, we have a problem. -# Provide an error message to indicate that enterprise merge is needed to fix this check. -if [[ $PR_BRANCH = master ]]; then - echo "ERROR: Master branch has merge conflicts with enterprise-master." - echo "Try re-running this CI job after merging your changes into enterprise-master." - exit 1 -fi - -if ! git fetch enterprise "$PR_BRANCH" ; then - echo "ERROR: enterprise/$PR_BRANCH was not found and community PR branch could not be merged into enterprise-master" - exit 1 -fi - -# Show the top commit of the enterprise PR branch to make debugging easier -git log -n 1 "enterprise/$PR_BRANCH" - -# Check that this branch contains the top commit of the current community PR -# branch. If it does not it means it's not up to date with the current PR, so -# the enterprise branch should be updated. -if ! git merge-base --is-ancestor "origin/$PR_BRANCH" "enterprise/$PR_BRANCH" ; then - echo "ERROR: enterprise/$PR_BRANCH is not up to date with community PR branch" - exit 1 -fi - -# Now check if we can merge the enterprise PR into enterprise-master without -# issues. -git merge --no-commit "enterprise/$PR_BRANCH" -# check that we can compile after the merge -check_compile From 81aa660b3140ec5312e0107daa920c628ec7a738 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 31 Oct 2023 16:59:16 +0100 Subject: [PATCH 06/12] Fix flaky test detection (#7291) PR #7289 broke flaky test detection. This fixes that.
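For context (an assumption on my part, the patch itself does not explain it): #7289 presumably turned the `pg*_version` pipeline outputs into JSON strings, so the flaky-test job now has to extract the `full` field with `fromJson()` instead of using the raw output. Roughly:

```yaml
# hypothetical shape of the params output assumed here, e.g. '{ "major": "16", "full": "16.0" }';
# only the image line below is taken from the actual diff
image: ${{ needs.params.outputs.fail_test_image_name }}:${{ fromJson(needs.params.outputs.pg16_version).full }}${{ needs.params.outputs.image_suffix }}
```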
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5f087fc08..d285e4f50 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -478,7 +478,7 @@ jobs: name: Test flakyness runs-on: ubuntu-20.04 container: - image: ${{ needs.params.outputs.fail_test_image_name }}:${{ needs.params.outputs.pg16_version }}${{ needs.params.outputs.image_suffix }} + image: ${{ needs.params.outputs.fail_test_image_name }}:${{ fromJson(needs.params.outputs.pg16_version).full }}${{ needs.params.outputs.image_suffix }} options: --user root env: runs: 8 From a76a832553ec4a9896d080c4b8bde8f1971aef0b Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 1 Nov 2023 09:41:28 +0100 Subject: [PATCH 07/12] Fix flaky validate_constraint test (#7293) Sometimes validate constraint would fail like this: ```diff validatable_constraint_8000016 | t (10 rows) DROP TABLE constrained_table; +ERROR: deadlock detected +DETAIL: Process 16602 waits for ShareRowExclusiveLock on relation 56258 of database 16384; blocked by process 16601. +Process 16601 waits for AccessShareLock on relation 56120 of database 16384; blocked by process 16602. +HINT: See server log for query details. DROP TABLE referenced_table CASCADE; DROP TABLE referencing_table; DROP SCHEMA validate_constraint CASCADE; -NOTICE: drop cascades to 3 other objects +NOTICE: drop cascades to 4 other objects DETAIL: drop cascades to type constraint_validity drop cascades to view constraint_validations_in_workers drop cascades to view constraint_validations +drop cascades to table constrained_table SET search_path TO DEFAULT; ``` Source: https://github.com/citusdata/citus/actions/runs/6708383699?pr=7291 This change fixes that by not running it together with the foreign_key_to_reference_table test anymore. In passing it also simplifies dropping of the test's resources.
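The simplified teardown mentioned above amounts to the following (taken from the diff below); the idea is to silence the cascade NOTICE once rather than drop every table individually:

```sql
-- suppress the cascade NOTICE instead of listing each dropped object in the expected output
SET client_min_messages TO WARNING;
DROP SCHEMA validate_constraint CASCADE;
SET search_path TO DEFAULT;
```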
--- src/test/regress/expected/validate_constraint.out | 8 +------- src/test/regress/multi_1_schedule | 3 ++- src/test/regress/multi_schedule_hyperscale | 3 ++- src/test/regress/multi_schedule_hyperscale_superuser | 4 +++- src/test/regress/sql/validate_constraint.sql | 5 +---- 5 files changed, 9 insertions(+), 14 deletions(-) diff --git a/src/test/regress/expected/validate_constraint.out b/src/test/regress/expected/validate_constraint.out index 08b03a2bf..b38e835fd 100644 --- a/src/test/regress/expected/validate_constraint.out +++ b/src/test/regress/expected/validate_constraint.out @@ -133,12 +133,6 @@ ORDER BY 1, 2; validatable_constraint_8000016 | t (10 rows) -DROP TABLE constrained_table; -DROP TABLE referenced_table CASCADE; -DROP TABLE referencing_table; +SET client_min_messages TO WARNING; DROP SCHEMA validate_constraint CASCADE; -NOTICE: drop cascades to 3 other objects -DETAIL: drop cascades to type constraint_validity -drop cascades to view constraint_validations_in_workers -drop cascades to view constraint_validations SET search_path TO DEFAULT; diff --git a/src/test/regress/multi_1_schedule b/src/test/regress/multi_1_schedule index ad70f136e..e824f5cba 100644 --- a/src/test/regress/multi_1_schedule +++ b/src/test/regress/multi_1_schedule @@ -201,7 +201,8 @@ test: citus_copy_shard_placement # multi_utilities cannot be run in parallel with other tests because it checks # global locks test: multi_utilities -test: foreign_key_to_reference_table validate_constraint +test: foreign_key_to_reference_table +test: validate_constraint test: multi_repartition_udt multi_repartitioned_subquery_udf multi_subtransactions test: multi_modifying_xacts diff --git a/src/test/regress/multi_schedule_hyperscale b/src/test/regress/multi_schedule_hyperscale index 8849e81f2..86ac16d4f 100644 --- a/src/test/regress/multi_schedule_hyperscale +++ b/src/test/regress/multi_schedule_hyperscale @@ -154,7 +154,8 @@ test: multi_outer_join # --- test: multi_complex_count_distinct test: multi_upsert multi_simple_queries -test: foreign_key_to_reference_table validate_constraint +test: foreign_key_to_reference_table +test: validate_constraint # --------- # creates hash and range-partitioned tables and performs COPY diff --git a/src/test/regress/multi_schedule_hyperscale_superuser b/src/test/regress/multi_schedule_hyperscale_superuser index 052b93786..f5cddfc05 100644 --- a/src/test/regress/multi_schedule_hyperscale_superuser +++ b/src/test/regress/multi_schedule_hyperscale_superuser @@ -150,7 +150,9 @@ test: multi_outer_join test: multi_create_fdw test: multi_generate_ddl_commands multi_create_shards multi_prune_shard_list test: multi_upsert multi_simple_queries multi_data_types -test: multi_utilities foreign_key_to_reference_table validate_constraint +test: multi_utilities +test: foreign_key_to_reference_table +test: validate_constraint test: multi_repartition_udt multi_repartitioned_subquery_udf # --------- diff --git a/src/test/regress/sql/validate_constraint.sql b/src/test/regress/sql/validate_constraint.sql index 294e9a8b2..bb63f2854 100644 --- a/src/test/regress/sql/validate_constraint.sql +++ b/src/test/regress/sql/validate_constraint.sql @@ -116,9 +116,6 @@ SELECT * FROM constraint_validations_in_workers ORDER BY 1, 2; -DROP TABLE constrained_table; -DROP TABLE referenced_table CASCADE; -DROP TABLE referencing_table; - +SET client_min_messages TO WARNING; DROP SCHEMA validate_constraint CASCADE; SET search_path TO DEFAULT; From 37415ef8f50473c651b11aea807b490fb7926f1d Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?C=C3=A9dric=20Villemain?= <94834419+c2main@users.noreply.github.com> Date: Wed, 1 Nov 2023 10:05:51 +0100 Subject: [PATCH 08/12] Allow citus_*_size on index related to a distributed table (#7271) I just enhanced the existing code to check if the relation is an index belonging to a distributed table. If so, the shardId is appended to the relation (index) name and the *_size functions are executed as before. There is a change in an extern function: `extern StringInfo GenerateSizeQueryOnMultiplePlacements(...)` It's possible to create a new function and deprecate this one later if compatibility is an issue. Fixes https://github.com/citusdata/citus/issues/6496. DESCRIPTION: Allows using Citus size functions on distributed tables' indexes. --------- Co-authored-by: Onur Tirtir --- .../distributed/metadata/metadata_utility.c | 144 ++++++++++----- .../distributed/operations/shard_transfer.c | 5 + src/include/distributed/metadata_utility.h | 1 + src/test/regress/citus_tests/run_test.py | 1 + .../expected/isolation_drop_vs_all.out | 2 +- .../expected/multi_cluster_management.out | 6 + .../regress/expected/multi_size_queries.out | 168 ++++++++++++++++-- .../regress/sql/multi_cluster_management.sql | 7 + src/test/regress/sql/multi_size_queries.sql | 67 ++++++- 9 files changed, 333 insertions(+), 68 deletions(-) diff --git a/src/backend/distributed/metadata/metadata_utility.c b/src/backend/distributed/metadata/metadata_utility.c index ae0f6589a..ec41e4eb2 100644 --- a/src/backend/distributed/metadata/metadata_utility.c +++ b/src/backend/distributed/metadata/metadata_utility.c @@ -24,6 +24,7 @@ #include "access/sysattr.h" #include "access/xact.h" #include "catalog/dependency.h" +#include "catalog/index.h" #include "catalog/indexing.h" #include "catalog/pg_authid.h" #include "catalog/pg_constraint.h" @@ -88,11 +89,11 @@ static uint64 * AllocateUint64(uint64 value); static void RecordDistributedRelationDependencies(Oid distributedRelationId); static GroupShardPlacement * TupleToGroupShardPlacement(TupleDesc tupleDesc, HeapTuple heapTuple); -static bool DistributedTableSize(Oid relationId, SizeQueryType sizeQueryType, - bool failOnError, uint64 *tableSize); -static bool DistributedTableSizeOnWorker(WorkerNode *workerNode, Oid relationId, - SizeQueryType sizeQueryType, bool failOnError, - uint64 *tableSize); +static bool DistributedRelationSize(Oid relationId, SizeQueryType sizeQueryType, + bool failOnError, uint64 *relationSize); +static bool DistributedRelationSizeOnWorker(WorkerNode *workerNode, Oid relationId, + SizeQueryType sizeQueryType, bool failOnError, + uint64 *relationSize); static List * ShardIntervalsOnWorkerGroup(WorkerNode *workerNode, Oid relationId); static char * GenerateShardIdNameValuesForShardList(List *shardIntervalList, bool firstValue); @@ -282,7 +283,7 @@ citus_shard_sizes(PG_FUNCTION_ARGS) /* - * citus_total_relation_size accepts a table name and returns a distributed table + * citus_total_relation_size accepts a distributed table name and returns a distributed table * and its indexes' total relation size.
*/ Datum @@ -294,20 +295,20 @@ citus_total_relation_size(PG_FUNCTION_ARGS) bool failOnError = PG_GETARG_BOOL(1); SizeQueryType sizeQueryType = TOTAL_RELATION_SIZE; - uint64 tableSize = 0; + uint64 relationSize = 0; - if (!DistributedTableSize(relationId, sizeQueryType, failOnError, &tableSize)) + if (!DistributedRelationSize(relationId, sizeQueryType, failOnError, &relationSize)) { Assert(!failOnError); PG_RETURN_NULL(); } - PG_RETURN_INT64(tableSize); + PG_RETURN_INT64(relationSize); } /* - * citus_table_size accepts a table name and returns a distributed table's total + * citus_table_size accepts a distributed table name and returns a distributed table's total * relation size. */ Datum @@ -318,21 +319,24 @@ citus_table_size(PG_FUNCTION_ARGS) Oid relationId = PG_GETARG_OID(0); bool failOnError = true; SizeQueryType sizeQueryType = TABLE_SIZE; - uint64 tableSize = 0; + uint64 relationSize = 0; - if (!DistributedTableSize(relationId, sizeQueryType, failOnError, &tableSize)) + /* We do not check if relation is really a table, like PostgreSQL is doing. */ + if (!DistributedRelationSize(relationId, sizeQueryType, failOnError, &relationSize)) { Assert(!failOnError); PG_RETURN_NULL(); } - PG_RETURN_INT64(tableSize); + PG_RETURN_INT64(relationSize); } /* - * citus_relation_size accept a table name and returns a relation's 'main' + * citus_relation_size accept a distributed relation name and returns a relation's 'main' * fork's size. + * + * Input relation is allowed to be an index on a distributed table too. */ Datum citus_relation_size(PG_FUNCTION_ARGS) @@ -344,7 +348,7 @@ citus_relation_size(PG_FUNCTION_ARGS) SizeQueryType sizeQueryType = RELATION_SIZE; uint64 relationSize = 0; - if (!DistributedTableSize(relationId, sizeQueryType, failOnError, &relationSize)) + if (!DistributedRelationSize(relationId, sizeQueryType, failOnError, &relationSize)) { Assert(!failOnError); PG_RETURN_NULL(); @@ -506,13 +510,16 @@ ReceiveShardIdAndSizeResults(List *connectionList, Tuplestorestate *tupleStore, /* - * DistributedTableSize is helper function for each kind of citus size functions. - * It first checks whether the table is distributed and size query can be run on - * it. Connection to each node has to be established to get the size of the table. + * DistributedRelationSize is helper function for each kind of citus size + * functions. It first checks whether the relation is a distributed table or an + * index belonging to a distributed table and size query can be run on it. + * Connection to each node has to be established to get the size of the + * relation. + * Input relation is allowed to be an index on a distributed table too. 
*/ static bool -DistributedTableSize(Oid relationId, SizeQueryType sizeQueryType, bool failOnError, - uint64 *tableSize) +DistributedRelationSize(Oid relationId, SizeQueryType sizeQueryType, + bool failOnError, uint64 *relationSize) { int logLevel = WARNING; @@ -538,7 +545,7 @@ DistributedTableSize(Oid relationId, SizeQueryType sizeQueryType, bool failOnErr if (relation == NULL) { ereport(logLevel, - (errmsg("could not compute table size: relation does not exist"))); + (errmsg("could not compute relation size: relation does not exist"))); return false; } @@ -553,8 +560,9 @@ DistributedTableSize(Oid relationId, SizeQueryType sizeQueryType, bool failOnErr { uint64 relationSizeOnNode = 0; - bool gotSize = DistributedTableSizeOnWorker(workerNode, relationId, sizeQueryType, - failOnError, &relationSizeOnNode); + bool gotSize = DistributedRelationSizeOnWorker(workerNode, relationId, + sizeQueryType, + failOnError, &relationSizeOnNode); if (!gotSize) { return false; @@ -563,21 +571,22 @@ DistributedTableSize(Oid relationId, SizeQueryType sizeQueryType, bool failOnErr sumOfSizes += relationSizeOnNode; } - *tableSize = sumOfSizes; + *relationSize = sumOfSizes; return true; } /* - * DistributedTableSizeOnWorker gets the workerNode and relationId to calculate + * DistributedRelationSizeOnWorker gets the workerNode and relationId to calculate * size of that relation on the given workerNode by summing up the size of each * shard placement. + * Input relation is allowed to be an index on a distributed table too. */ static bool -DistributedTableSizeOnWorker(WorkerNode *workerNode, Oid relationId, - SizeQueryType sizeQueryType, - bool failOnError, uint64 *tableSize) +DistributedRelationSizeOnWorker(WorkerNode *workerNode, Oid relationId, + SizeQueryType sizeQueryType, + bool failOnError, uint64 *relationSize) { int logLevel = WARNING; @@ -591,6 +600,17 @@ DistributedTableSizeOnWorker(WorkerNode *workerNode, Oid relationId, uint32 connectionFlag = 0; PGresult *result = NULL; + /* if the relation is an index, update relationId and define indexId */ + Oid indexId = InvalidOid; + Oid relKind = get_rel_relkind(relationId); + if (relKind == RELKIND_INDEX || relKind == RELKIND_PARTITIONED_INDEX) + { + indexId = relationId; + + bool missingOk = false; + relationId = IndexGetRelation(indexId, missingOk); + } + List *shardIntervalsOnNode = ShardIntervalsOnWorkerGroup(workerNode, relationId); /* @@ -598,21 +618,22 @@ DistributedTableSizeOnWorker(WorkerNode *workerNode, Oid relationId, * But citus size functions shouldn't include them, like PG. 
*/ bool optimizePartitionCalculations = false; - StringInfo tableSizeQuery = GenerateSizeQueryOnMultiplePlacements( + StringInfo relationSizeQuery = GenerateSizeQueryOnMultiplePlacements( shardIntervalsOnNode, + indexId, sizeQueryType, optimizePartitionCalculations); MultiConnection *connection = GetNodeConnection(connectionFlag, workerNodeName, workerNodePort); - int queryResult = ExecuteOptionalRemoteCommand(connection, tableSizeQuery->data, + int queryResult = ExecuteOptionalRemoteCommand(connection, relationSizeQuery->data, &result); if (queryResult != 0) { ereport(logLevel, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("could not connect to %s:%d to get size of " - "table \"%s\"", + "relation \"%s\"", workerNodeName, workerNodePort, get_rel_name(relationId)))); @@ -626,19 +647,19 @@ DistributedTableSizeOnWorker(WorkerNode *workerNode, Oid relationId, ClearResults(connection, failOnError); ereport(logLevel, (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("cannot parse size of table \"%s\" from %s:%d", + errmsg("cannot parse size of relation \"%s\" from %s:%d", get_rel_name(relationId), workerNodeName, workerNodePort))); return false; } - StringInfo tableSizeStringInfo = (StringInfo) linitial(sizeList); - char *tableSizeString = tableSizeStringInfo->data; + StringInfo relationSizeStringInfo = (StringInfo) linitial(sizeList); + char *relationSizeString = relationSizeStringInfo->data; - if (strlen(tableSizeString) > 0) + if (strlen(relationSizeString) > 0) { - *tableSize = SafeStringToUint64(tableSizeString); + *relationSize = SafeStringToUint64(relationSizeString); } else { @@ -647,7 +668,7 @@ DistributedTableSizeOnWorker(WorkerNode *workerNode, Oid relationId, * being executed. For this case we get an empty string as table size. * We can take that as zero to prevent any unnecessary errors. */ - *tableSize = 0; + *relationSize = 0; } PQclear(result); @@ -732,7 +753,7 @@ ShardIntervalsOnWorkerGroup(WorkerNode *workerNode, Oid relationId) /* * GenerateSizeQueryOnMultiplePlacements generates a select size query to get - * size of multiple tables. Note that, different size functions supported by PG + * size of multiple relations. Note that, different size functions supported by PG * are also supported by this function changing the size query type given as the * last parameter to function. Depending on the sizeQueryType enum parameter, the * generated query will call one of the functions: pg_relation_size, @@ -740,9 +761,13 @@ ShardIntervalsOnWorkerGroup(WorkerNode *workerNode, Oid relationId) * This function uses UDFs named worker_partitioned_*_size for partitioned tables, * if the parameter optimizePartitionCalculations is true. The UDF to be called is * determined by the parameter sizeQueryType. + * + * indexId is provided if we're interested in the size of an index, not the whole + * table. */ StringInfo GenerateSizeQueryOnMultiplePlacements(List *shardIntervalList, + Oid indexId, SizeQueryType sizeQueryType, bool optimizePartitionCalculations) { @@ -766,16 +791,20 @@ GenerateSizeQueryOnMultiplePlacements(List *shardIntervalList, */ continue; } + + /* we need to build the shard relation name, being an index or table */ + Oid objectId = OidIsValid(indexId) ? 
indexId : shardInterval->relationId; + uint64 shardId = shardInterval->shardId; - Oid schemaId = get_rel_namespace(shardInterval->relationId); + Oid schemaId = get_rel_namespace(objectId); char *schemaName = get_namespace_name(schemaId); - char *shardName = get_rel_name(shardInterval->relationId); + char *shardName = get_rel_name(objectId); AppendShardIdToName(&shardName, shardId); char *shardQualifiedName = quote_qualified_identifier(schemaName, shardName); char *quotedShardName = quote_literal_cstr(shardQualifiedName); - /* for partitoned tables, we will call worker_partitioned_... size functions */ + /* for partitioned tables, we will call worker_partitioned_... size functions */ if (optimizePartitionCalculations && PartitionedTable(shardInterval->relationId)) { partitionedShardNames = lappend(partitionedShardNames, quotedShardName); @@ -1010,7 +1039,7 @@ AppendShardIdNameValues(StringInfo selectQuery, ShardInterval *shardInterval) /* - * ErrorIfNotSuitableToGetSize determines whether the table is suitable to find + * ErrorIfNotSuitableToGetSize determines whether the relation is suitable to find * its' size with internal functions. */ static void @@ -1018,11 +1047,32 @@ ErrorIfNotSuitableToGetSize(Oid relationId) { if (!IsCitusTable(relationId)) { - char *relationName = get_rel_name(relationId); - char *escapedQueryString = quote_literal_cstr(relationName); - ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("cannot calculate the size because relation %s is not " - "distributed", escapedQueryString))); + Oid relKind = get_rel_relkind(relationId); + if (relKind != RELKIND_INDEX && relKind != RELKIND_PARTITIONED_INDEX) + { + char *relationName = get_rel_name(relationId); + char *escapedRelationName = quote_literal_cstr(relationName); + ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg( + "cannot calculate the size because relation %s " + "is not distributed", + escapedRelationName))); + } + bool missingOk = false; + Oid indexId = relationId; + relationId = IndexGetRelation(relationId, missingOk); + if (!IsCitusTable(relationId)) + { + char *tableName = get_rel_name(relationId); + char *escapedTableName = quote_literal_cstr(tableName); + char *indexName = get_rel_name(indexId); + char *escapedIndexName = quote_literal_cstr(indexName); + ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg( + "cannot calculate the size because table %s for " + "index %s is not distributed", + escapedTableName, escapedIndexName))); + } } } diff --git a/src/backend/distributed/operations/shard_transfer.c b/src/backend/distributed/operations/shard_transfer.c index 23925a315..79895cc3d 100644 --- a/src/backend/distributed/operations/shard_transfer.c +++ b/src/backend/distributed/operations/shard_transfer.c @@ -792,7 +792,12 @@ ShardListSizeInBytes(List *shardList, char *workerNodeName, uint32 /* we skip child tables of a partitioned table if this boolean variable is true */ bool optimizePartitionCalculations = true; + + /* we're interested in whole table, not a particular index */ + Oid indexId = InvalidOid; + StringInfo tableSizeQuery = GenerateSizeQueryOnMultiplePlacements(shardList, + indexId, TOTAL_RELATION_SIZE, optimizePartitionCalculations); diff --git a/src/include/distributed/metadata_utility.h b/src/include/distributed/metadata_utility.h index 9234adc76..7e50a6af6 100644 --- a/src/include/distributed/metadata_utility.h +++ b/src/include/distributed/metadata_utility.h @@ -342,6 +342,7 @@ extern void LookupTaskPlacementHostAndPort(ShardPlacement 
*taskPlacement, char * int *nodePort); extern bool IsDummyPlacement(ShardPlacement *taskPlacement); extern StringInfo GenerateSizeQueryOnMultiplePlacements(List *shardIntervalList, + Oid indexId, SizeQueryType sizeQueryType, bool optimizePartitionCalculations); extern List * RemoveCoordinatorPlacementIfNotSingleNode(List *placementList); diff --git a/src/test/regress/citus_tests/run_test.py b/src/test/regress/citus_tests/run_test.py index 6528834ae..b28341e5c 100755 --- a/src/test/regress/citus_tests/run_test.py +++ b/src/test/regress/citus_tests/run_test.py @@ -175,6 +175,7 @@ DEPS = { ), "grant_on_schema_propagation": TestDeps("minimal_schedule"), "propagate_extension_commands": TestDeps("minimal_schedule"), + "multi_size_queries": TestDeps("base_schedule", ["multi_copy"]), } diff --git a/src/test/regress/expected/isolation_drop_vs_all.out b/src/test/regress/expected/isolation_drop_vs_all.out index 7dab13615..2c8912c21 100644 --- a/src/test/regress/expected/isolation_drop_vs_all.out +++ b/src/test/regress/expected/isolation_drop_vs_all.out @@ -226,7 +226,7 @@ step s1-drop: DROP TABLE drop_hash; step s2-table-size: SELECT citus_total_relation_size('drop_hash'); step s1-commit: COMMIT; step s2-table-size: <... completed> -ERROR: could not compute table size: relation does not exist +ERROR: could not compute relation size: relation does not exist step s2-commit: COMMIT; step s1-select-count: SELECT COUNT(*) FROM drop_hash; ERROR: relation "drop_hash" does not exist diff --git a/src/test/regress/expected/multi_cluster_management.out b/src/test/regress/expected/multi_cluster_management.out index e58b02937..b92d8d136 100644 --- a/src/test/regress/expected/multi_cluster_management.out +++ b/src/test/regress/expected/multi_cluster_management.out @@ -1258,3 +1258,9 @@ SELECT bool_and(hasmetadata) AND bool_and(metadatasynced) FROM pg_dist_node WHER t (1 row) +-- Grant all on public schema to public +-- +-- That's the default on Postgres versions < 15 and we want to +-- keep permissions compatible accross versions, in regression +-- tests. 
+GRANT ALL ON SCHEMA public TO PUBLIC; diff --git a/src/test/regress/expected/multi_size_queries.out b/src/test/regress/expected/multi_size_queries.out index 2ff8d9c4b..eb1981e64 100644 --- a/src/test/regress/expected/multi_size_queries.out +++ b/src/test/regress/expected/multi_size_queries.out @@ -7,19 +7,25 @@ SET citus.next_shard_id TO 1390000; -- Tests with invalid relation IDs SELECT citus_table_size(1); -ERROR: could not compute table size: relation does not exist +ERROR: could not compute relation size: relation does not exist SELECT citus_relation_size(1); -ERROR: could not compute table size: relation does not exist +ERROR: could not compute relation size: relation does not exist SELECT citus_total_relation_size(1); -ERROR: could not compute table size: relation does not exist +ERROR: could not compute relation size: relation does not exist -- Tests with non-distributed table -CREATE TABLE non_distributed_table (x int); +CREATE TABLE non_distributed_table (x int primary key); SELECT citus_table_size('non_distributed_table'); ERROR: cannot calculate the size because relation 'non_distributed_table' is not distributed SELECT citus_relation_size('non_distributed_table'); ERROR: cannot calculate the size because relation 'non_distributed_table' is not distributed SELECT citus_total_relation_size('non_distributed_table'); ERROR: cannot calculate the size because relation 'non_distributed_table' is not distributed +SELECT citus_table_size('non_distributed_table_pkey'); +ERROR: cannot calculate the size because table 'non_distributed_table' for index 'non_distributed_table_pkey' is not distributed +SELECT citus_relation_size('non_distributed_table_pkey'); +ERROR: cannot calculate the size because table 'non_distributed_table' for index 'non_distributed_table_pkey' is not distributed +SELECT citus_total_relation_size('non_distributed_table_pkey'); +ERROR: cannot calculate the size because table 'non_distributed_table' for index 'non_distributed_table_pkey' is not distributed DROP TABLE non_distributed_table; -- fix broken placements via disabling the node SET client_min_messages TO ERROR; @@ -31,24 +37,70 @@ SELECT replicate_table_shards('lineitem_hash_part', shard_replication_factor:=2, -- Tests on distributed table with replication factor > 1 VACUUM (FULL) lineitem_hash_part; -SELECT citus_table_size('lineitem_hash_part'); - citus_table_size +SELECT citus_relation_size('lineitem_hash_part') <= citus_table_size('lineitem_hash_part'); + ?column? --------------------------------------------------------------------- - 3801088 + t (1 row) -SELECT citus_relation_size('lineitem_hash_part'); - citus_relation_size +SELECT citus_table_size('lineitem_hash_part') <= citus_total_relation_size('lineitem_hash_part'); + ?column? --------------------------------------------------------------------- - 3801088 + t (1 row) -SELECT citus_total_relation_size('lineitem_hash_part'); - citus_total_relation_size +SELECT citus_relation_size('lineitem_hash_part') > 0; + ?column? --------------------------------------------------------------------- - 3801088 + t (1 row) +CREATE INDEX lineitem_hash_part_idx ON lineitem_hash_part(l_orderkey); +VACUUM (FULL) lineitem_hash_part; +SELECT citus_relation_size('lineitem_hash_part') <= citus_table_size('lineitem_hash_part'); + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT citus_table_size('lineitem_hash_part') <= citus_total_relation_size('lineitem_hash_part'); + ?column? 
+--------------------------------------------------------------------- + t +(1 row) + +SELECT citus_relation_size('lineitem_hash_part') > 0; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT citus_relation_size('lineitem_hash_part_idx') <= citus_table_size('lineitem_hash_part_idx'); + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT citus_table_size('lineitem_hash_part_idx') <= citus_total_relation_size('lineitem_hash_part_idx'); + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT citus_relation_size('lineitem_hash_part_idx') > 0; + ?column? +--------------------------------------------------------------------- + t +(1 row) + +SELECT citus_total_relation_size('lineitem_hash_part') >= + citus_table_size('lineitem_hash_part') + citus_table_size('lineitem_hash_part_idx'); + ?column? +--------------------------------------------------------------------- + t +(1 row) + +DROP INDEX lineitem_hash_part_idx; VACUUM (FULL) customer_copy_hash; -- Tests on distributed tables with streaming replication. SELECT citus_table_size('customer_copy_hash'); @@ -72,10 +124,10 @@ SELECT citus_total_relation_size('customer_copy_hash'); -- Make sure we can get multiple sizes in a single query SELECT citus_table_size('customer_copy_hash'), citus_table_size('customer_copy_hash'), - citus_table_size('supplier'); + citus_table_size('customer_copy_hash'); citus_table_size | citus_table_size | citus_table_size --------------------------------------------------------------------- - 548864 | 548864 | 655360 + 548864 | 548864 | 548864 (1 row) CREATE INDEX index_1 on customer_copy_hash(c_custkey); @@ -99,6 +151,24 @@ SELECT citus_total_relation_size('customer_copy_hash'); 2646016 (1 row) +SELECT citus_table_size('index_1'); + citus_table_size +--------------------------------------------------------------------- + 1048576 +(1 row) + +SELECT citus_relation_size('index_1'); + citus_relation_size +--------------------------------------------------------------------- + 1048576 +(1 row) + +SELECT citus_total_relation_size('index_1'); + citus_total_relation_size +--------------------------------------------------------------------- + 1048576 +(1 row) + -- Tests on reference table VACUUM (FULL) supplier; SELECT citus_table_size('supplier'); @@ -139,6 +209,74 @@ SELECT citus_total_relation_size('supplier'); 688128 (1 row) +SELECT citus_table_size('index_2'); + citus_table_size +--------------------------------------------------------------------- + 122880 +(1 row) + +SELECT citus_relation_size('index_2'); + citus_relation_size +--------------------------------------------------------------------- + 122880 +(1 row) + +SELECT citus_total_relation_size('index_2'); + citus_total_relation_size +--------------------------------------------------------------------- + 122880 +(1 row) + +-- Test on partitioned table +CREATE TABLE split_me (dist_col int, partition_col timestamp) PARTITION BY RANGE (partition_col); +CREATE INDEX ON split_me(dist_col); +-- create 2 partitions +CREATE TABLE m PARTITION OF split_me FOR VALUES FROM ('2018-01-01') TO ('2019-01-01'); +CREATE TABLE e PARTITION OF split_me FOR VALUES FROM ('2019-01-01') TO ('2020-01-01'); +INSERT INTO split_me SELECT 1, '2018-01-01'::timestamp + i * interval '1 day' FROM generate_series(1, 360) i; +INSERT INTO split_me SELECT 2, '2019-01-01'::timestamp + i * interval '1 day' FROM generate_series(1, 180) i; +-- before citus 
+SELECT citus_relation_size('split_me'); +ERROR: cannot calculate the size because relation 'split_me' is not distributed +SELECT citus_relation_size('split_me_dist_col_idx'); +ERROR: cannot calculate the size because table 'split_me' for index 'split_me_dist_col_idx' is not distributed +SELECT citus_relation_size('m'); +ERROR: cannot calculate the size because relation 'm' is not distributed +SELECT citus_relation_size('m_dist_col_idx'); +ERROR: cannot calculate the size because table 'm' for index 'm_dist_col_idx' is not distributed +-- distribute the table(s) +SELECT create_distributed_table('split_me', 'dist_col'); + create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +-- after citus +SELECT citus_relation_size('split_me'); + citus_relation_size +--------------------------------------------------------------------- + 0 +(1 row) + +SELECT citus_relation_size('split_me_dist_col_idx'); + citus_relation_size +--------------------------------------------------------------------- + 0 +(1 row) + +SELECT citus_relation_size('m'); + citus_relation_size +--------------------------------------------------------------------- + 32768 +(1 row) + +SELECT citus_relation_size('m_dist_col_idx'); + citus_relation_size +--------------------------------------------------------------------- + 81920 +(1 row) + +DROP TABLE split_me; -- Test inside the transaction BEGIN; ALTER TABLE supplier ALTER COLUMN s_suppkey SET NOT NULL; diff --git a/src/test/regress/sql/multi_cluster_management.sql b/src/test/regress/sql/multi_cluster_management.sql index 9ec0eb28e..ab268939f 100644 --- a/src/test/regress/sql/multi_cluster_management.sql +++ b/src/test/regress/sql/multi_cluster_management.sql @@ -530,3 +530,10 @@ RESET citus.metadata_sync_mode; -- verify that at the end of this file, all primary nodes have metadata synced SELECT bool_and(hasmetadata) AND bool_and(metadatasynced) FROM pg_dist_node WHERE isactive = 't' and noderole = 'primary'; + +-- Grant all on public schema to public +-- +-- That's the default on Postgres versions < 15 and we want to +-- keep permissions compatible accross versions, in regression +-- tests. 
+GRANT ALL ON SCHEMA public TO PUBLIC; diff --git a/src/test/regress/sql/multi_size_queries.sql b/src/test/regress/sql/multi_size_queries.sql index ff8d203f1..fdc3f7892 100644 --- a/src/test/regress/sql/multi_size_queries.sql +++ b/src/test/regress/sql/multi_size_queries.sql @@ -13,10 +13,15 @@ SELECT citus_relation_size(1); SELECT citus_total_relation_size(1); -- Tests with non-distributed table -CREATE TABLE non_distributed_table (x int); +CREATE TABLE non_distributed_table (x int primary key); + SELECT citus_table_size('non_distributed_table'); SELECT citus_relation_size('non_distributed_table'); SELECT citus_total_relation_size('non_distributed_table'); + +SELECT citus_table_size('non_distributed_table_pkey'); +SELECT citus_relation_size('non_distributed_table_pkey'); +SELECT citus_total_relation_size('non_distributed_table_pkey'); DROP TABLE non_distributed_table; -- fix broken placements via disabling the node @@ -26,9 +31,25 @@ SELECT replicate_table_shards('lineitem_hash_part', shard_replication_factor:=2, -- Tests on distributed table with replication factor > 1 VACUUM (FULL) lineitem_hash_part; -SELECT citus_table_size('lineitem_hash_part'); -SELECT citus_relation_size('lineitem_hash_part'); -SELECT citus_total_relation_size('lineitem_hash_part'); +SELECT citus_relation_size('lineitem_hash_part') <= citus_table_size('lineitem_hash_part'); +SELECT citus_table_size('lineitem_hash_part') <= citus_total_relation_size('lineitem_hash_part'); +SELECT citus_relation_size('lineitem_hash_part') > 0; + +CREATE INDEX lineitem_hash_part_idx ON lineitem_hash_part(l_orderkey); +VACUUM (FULL) lineitem_hash_part; + +SELECT citus_relation_size('lineitem_hash_part') <= citus_table_size('lineitem_hash_part'); +SELECT citus_table_size('lineitem_hash_part') <= citus_total_relation_size('lineitem_hash_part'); +SELECT citus_relation_size('lineitem_hash_part') > 0; + +SELECT citus_relation_size('lineitem_hash_part_idx') <= citus_table_size('lineitem_hash_part_idx'); +SELECT citus_table_size('lineitem_hash_part_idx') <= citus_total_relation_size('lineitem_hash_part_idx'); +SELECT citus_relation_size('lineitem_hash_part_idx') > 0; + +SELECT citus_total_relation_size('lineitem_hash_part') >= + citus_table_size('lineitem_hash_part') + citus_table_size('lineitem_hash_part_idx'); + +DROP INDEX lineitem_hash_part_idx; VACUUM (FULL) customer_copy_hash; @@ -40,7 +61,7 @@ SELECT citus_total_relation_size('customer_copy_hash'); -- Make sure we can get multiple sizes in a single query SELECT citus_table_size('customer_copy_hash'), citus_table_size('customer_copy_hash'), - citus_table_size('supplier'); + citus_table_size('customer_copy_hash'); CREATE INDEX index_1 on customer_copy_hash(c_custkey); VACUUM (FULL) customer_copy_hash; @@ -50,6 +71,10 @@ SELECT citus_table_size('customer_copy_hash'); SELECT citus_relation_size('customer_copy_hash'); SELECT citus_total_relation_size('customer_copy_hash'); +SELECT citus_table_size('index_1'); +SELECT citus_relation_size('index_1'); +SELECT citus_total_relation_size('index_1'); + -- Tests on reference table VACUUM (FULL) supplier; @@ -64,6 +89,38 @@ SELECT citus_table_size('supplier'); SELECT citus_relation_size('supplier'); SELECT citus_total_relation_size('supplier'); +SELECT citus_table_size('index_2'); +SELECT citus_relation_size('index_2'); +SELECT citus_total_relation_size('index_2'); + +-- Test on partitioned table +CREATE TABLE split_me (dist_col int, partition_col timestamp) PARTITION BY RANGE (partition_col); +CREATE INDEX ON split_me(dist_col); + +-- create 2 
partitions +CREATE TABLE m PARTITION OF split_me FOR VALUES FROM ('2018-01-01') TO ('2019-01-01'); +CREATE TABLE e PARTITION OF split_me FOR VALUES FROM ('2019-01-01') TO ('2020-01-01'); + +INSERT INTO split_me SELECT 1, '2018-01-01'::timestamp + i * interval '1 day' FROM generate_series(1, 360) i; +INSERT INTO split_me SELECT 2, '2019-01-01'::timestamp + i * interval '1 day' FROM generate_series(1, 180) i; + +-- before citus +SELECT citus_relation_size('split_me'); +SELECT citus_relation_size('split_me_dist_col_idx'); +SELECT citus_relation_size('m'); +SELECT citus_relation_size('m_dist_col_idx'); + +-- distribute the table(s) +SELECT create_distributed_table('split_me', 'dist_col'); + +-- after citus +SELECT citus_relation_size('split_me'); +SELECT citus_relation_size('split_me_dist_col_idx'); +SELECT citus_relation_size('m'); +SELECT citus_relation_size('m_dist_col_idx'); + +DROP TABLE split_me; + -- Test inside the transaction BEGIN; ALTER TABLE supplier ALTER COLUMN s_suppkey SET NOT NULL; From 20ae42e7fa4bf8b29a028fa80b8905a9496f56dd Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 1 Nov 2023 11:12:06 +0100 Subject: [PATCH 09/12] Fix flaky multi_reference_table test (#7294) Sometimes multi_reference_table failed in CI like this: ```diff \c - - - :master_port DROP INDEX reference_schema.reference_index_2; \c - - - :worker_1_port SELECT "Column", "Type", "Modifiers" FROM table_desc WHERE relid='reference_schema.reference_table_ddl_1250019'::regclass; - Column | Type | Modifiers ---------------------------------------------------------------------- - value_2 | double precision | default 25.0 - value_3 | text | not null - value_4 | timestamp without time zone | - value_5 | double precision | -(4 rows) - +ERROR: schema "citus_local_table_queries" does not exist \di reference_schema.reference_index_2* List of relations Schema | Name | Type | Owner | Table ``` Source: https://github.com/citusdata/citus/actions/runs/6707535961/attempts/2#summary-18226879513 Reading from table_desc apparently has an issue: if the schema of one of the items gets deleted while it is being read, we get such an error. This change fixes that by not running multi_reference_table in parallel with citus_local_tables_queries anymore. --- src/test/regress/multi_1_schedule | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/regress/multi_1_schedule b/src/test/regress/multi_1_schedule index e824f5cba..4e2b19795 100644 --- a/src/test/regress/multi_1_schedule +++ b/src/test/regress/multi_1_schedule @@ -298,7 +298,8 @@ test: replicate_reference_tables_to_coordinator test: citus_local_tables test: mixed_relkind_tests test: multi_row_router_insert create_distributed_table_concurrently -test: multi_reference_table citus_local_tables_queries +test: multi_reference_table +test: citus_local_tables_queries test: citus_local_table_triggers test: coordinator_shouldhaveshards test: local_shard_utility_command_execution From 0d83ab57de8c266764fc47810afd115758da5034 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 1 Nov 2023 11:46:01 +0100 Subject: [PATCH 10/12] Fix flaky multi_cluster_management (#7295) One of our most flaky and most annoying tests is multi_cluster_management.
It usually fails like this: ```diff SELECT citus_disable_node('localhost', :worker_2_port); citus_disable_node -------------------- (1 row) SELECT public.wait_until_metadata_sync(60000); +WARNING: waiting for metadata sync timed out wait_until_metadata_sync -------------------------- (1 row) ``` This tries to address that by hardening wait_until_metadata_sync. I believe the reason for this warning is that there is a race condition in wait_until_metadata_sync. It's possible for the pre-check to fail, then have the maintenance daemon send a notification. And only then have the backend start to listen. I tried to fix it in two ways: 1. First run LISTEN, and only then do the pre-check. 2. If we time out, check again just to make sure that we did not miss the notification somehow. And don't show a warning if all metadata is synced after the timeout. It's hard to know for sure that this fixes it because the test is not repeatable and I could not reproduce it locally. Let's just hope for the best. --------- Co-authored-by: Onur Tirtir --- src/backend/distributed/test/metadata_sync.c | 48 +++++++++++-------- .../expected/multi_cluster_management.out | 4 +- .../regress/sql/multi_cluster_management.sql | 4 +- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/src/backend/distributed/test/metadata_sync.c b/src/backend/distributed/test/metadata_sync.c index 46d2303d6..8ad4b15f2 100644 --- a/src/backend/distributed/test/metadata_sync.c +++ b/src/backend/distributed/test/metadata_sync.c @@ -90,6 +90,28 @@ activate_node_snapshot(PG_FUNCTION_ARGS) } + +/* + * IsMetadataSynced checks the workers to see if all workers with metadata are + * synced. + */ +static bool +IsMetadataSynced(void) +{ + List *workerList = ActivePrimaryNonCoordinatorNodeList(NoLock); + + WorkerNode *workerNode = NULL; + foreach_ptr(workerNode, workerList) + { + if (workerNode->hasMetadata && !workerNode->metadataSynced) + { + return false; + } + } + + return true; +} + + /* * wait_until_metadata_sync waits until the maintenance daemon does a metadata * sync, or times out. @@ -99,19 +121,10 @@ wait_until_metadata_sync(PG_FUNCTION_ARGS) { uint32 timeout = PG_GETARG_UINT32(0); - List *workerList = ActivePrimaryNonCoordinatorNodeList(NoLock); - bool waitNotifications = false; - - WorkerNode *workerNode = NULL; - foreach_ptr(workerNode, workerList) - { - /* if already has metadata, no need to do it again */ - if (workerNode->hasMetadata && !workerNode->metadataSynced) - { - waitNotifications = true; - break; - } - } + /* First we start listening. */ + MultiConnection *connection = GetNodeConnection(FORCE_NEW_CONNECTION, + LOCAL_HOST_NAME, PostPortNumber); + ExecuteCriticalRemoteCommand(connection, "LISTEN " METADATA_SYNC_CHANNEL); /* * If all the metadata nodes have already been synced, we should not wait. @@ -119,15 +132,12 @@ wait_until_metadata_sync(PG_FUNCTION_ARGS) * the notification and we'd wait unnecessarily here. Worse, the test outputs * might be inconsistent across executions due to the warning.
*/ - if (!waitNotifications) + if (IsMetadataSynced()) { + CloseConnection(connection); PG_RETURN_VOID(); } - MultiConnection *connection = GetNodeConnection(FORCE_NEW_CONNECTION, - LOCAL_HOST_NAME, PostPortNumber); - ExecuteCriticalRemoteCommand(connection, "LISTEN " METADATA_SYNC_CHANNEL); - int waitFlags = WL_SOCKET_READABLE | WL_TIMEOUT | WL_POSTMASTER_DEATH; int waitResult = WaitLatchOrSocket(NULL, waitFlags, PQsocket(connection->pgConn), timeout, 0); @@ -139,7 +149,7 @@ wait_until_metadata_sync(PG_FUNCTION_ARGS) { ClearResults(connection, true); } - else if (waitResult & WL_TIMEOUT) + else if (waitResult & WL_TIMEOUT && !IsMetadataSynced()) { elog(WARNING, "waiting for metadata sync timed out"); } diff --git a/src/test/regress/expected/multi_cluster_management.out b/src/test/regress/expected/multi_cluster_management.out index b92d8d136..3eb549ab5 100644 --- a/src/test/regress/expected/multi_cluster_management.out +++ b/src/test/regress/expected/multi_cluster_management.out @@ -90,7 +90,7 @@ SELECT citus_disable_node('localhost', :worker_2_port); (1 row) -SELECT public.wait_until_metadata_sync(60000); +SELECT public.wait_until_metadata_sync(20000); wait_until_metadata_sync --------------------------------------------------------------------- @@ -812,7 +812,7 @@ SELECT citus_disable_node('localhost', 9999); (1 row) -SELECT public.wait_until_metadata_sync(60000); +SELECT public.wait_until_metadata_sync(20000); wait_until_metadata_sync --------------------------------------------------------------------- diff --git a/src/test/regress/sql/multi_cluster_management.sql b/src/test/regress/sql/multi_cluster_management.sql index ab268939f..86fbd15b6 100644 --- a/src/test/regress/sql/multi_cluster_management.sql +++ b/src/test/regress/sql/multi_cluster_management.sql @@ -39,7 +39,7 @@ SELECT master_get_active_worker_nodes(); SELECT 1 FROM master_add_node('localhost', :worker_2_port); SELECT citus_disable_node('localhost', :worker_2_port); -SELECT public.wait_until_metadata_sync(60000); +SELECT public.wait_until_metadata_sync(20000); SELECT master_get_active_worker_nodes(); -- add some shard placements to the cluster @@ -328,7 +328,7 @@ SELECT 1 FROM master_add_inactive_node('localhost', 9996, groupid => :worker_2_g SELECT master_add_inactive_node('localhost', 9999, groupid => :worker_2_group, nodecluster => 'olap', noderole => 'secondary'); SELECT master_activate_node('localhost', 9999); SELECT citus_disable_node('localhost', 9999); -SELECT public.wait_until_metadata_sync(60000); +SELECT public.wait_until_metadata_sync(20000); SELECT master_remove_node('localhost', 9999); -- check that you can't manually add two primaries to a group From 2bccb5815770b67cf646a7dc5bb9539d5a29c010 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 1 Nov 2023 13:12:20 +0100 Subject: [PATCH 11/12] Run github actions on main (#7292) We want the nice looking green checkmark on our main branch too. This PR includes running on pushes to release branches too, but that won't come into effect until we have release branches with this workflow file. 
--- .github/workflows/build_and_test.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d285e4f50..d900fe867 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -10,6 +10,10 @@ on: required: false default: false type: boolean + push: + branches: + - "main" + - "release-*" pull_request: types: [opened, reopened,synchronize] jobs: From c83c5567028d2035651c39f737ac5a944a70db16 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 1 Nov 2023 14:44:45 +0100 Subject: [PATCH 12/12] Fix flaky isolation_master_update_node (#7303) Sometimes in CI isolation_master_update_node fails like this: ```diff ------------------ (1 row) step s2-abort: ABORT; step s1-abort: ABORT; FATAL: terminating connection due to administrator command FATAL: terminating connection due to administrator command SSL connection has been closed unexpectedly +server closed the connection unexpectedly master_remove_node ------------------ ``` This just seems like a random error line. The only way to reasonably fix this is by adding an extra output file. So that's what this PR does. --- .../isolation_master_update_node_1.out | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 src/test/regress/expected/isolation_master_update_node_1.out diff --git a/src/test/regress/expected/isolation_master_update_node_1.out b/src/test/regress/expected/isolation_master_update_node_1.out new file mode 100644 index 000000000..474956629 --- /dev/null +++ b/src/test/regress/expected/isolation_master_update_node_1.out @@ -0,0 +1,68 @@ +Parsed test spec with 2 sessions + +starting permutation: s1-begin s1-insert s2-begin s2-update-node-1 s1-abort s2-abort +create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +step s1-begin: BEGIN; +step s1-insert: INSERT INTO t1 SELECT generate_series(1, 100); +step s2-begin: BEGIN; +step s2-update-node-1: + -- update a specific node by address + SELECT master_update_node(nodeid, 'localhost', nodeport + 10) + FROM pg_dist_node + WHERE nodename = 'localhost' + AND nodeport = 57637; + +step s1-abort: ABORT; +step s2-update-node-1: <... completed> +master_update_node +--------------------------------------------------------------------- + +(1 row) + +step s2-abort: ABORT; +master_remove_node +--------------------------------------------------------------------- + + +(2 rows) + + +starting permutation: s1-begin s1-insert s2-begin s2-update-node-1-force s2-abort s1-abort +create_distributed_table +--------------------------------------------------------------------- + +(1 row) + +step s1-begin: BEGIN; +step s1-insert: INSERT INTO t1 SELECT generate_series(1, 100); +step s2-begin: BEGIN; +step s2-update-node-1-force: + -- update a specific node by address (force) + SELECT master_update_node(nodeid, 'localhost', nodeport + 10, force => true, lock_cooldown => 100) + FROM pg_dist_node + WHERE nodename = 'localhost' + AND nodeport = 57637; + +step s2-update-node-1-force: <...
completed> +master_update_node +--------------------------------------------------------------------- + +(1 row) + +step s2-abort: ABORT; +step s1-abort: ABORT; +FATAL: terminating connection due to administrator command +FATAL: terminating connection due to administrator command +SSL connection has been closed unexpectedly +server closed the connection unexpectedly + +master_remove_node +--------------------------------------------------------------------- + + +(2 rows) +