diff --git a/src/backend/distributed/planner/multi_explain.c b/src/backend/distributed/planner/multi_explain.c index e778ea0fa..5ab31a1a1 100644 --- a/src/backend/distributed/planner/multi_explain.c +++ b/src/backend/distributed/planner/multi_explain.c @@ -68,6 +68,7 @@ /* Config variables that enable printing distributed query plans */ bool ExplainDistributedQueries = true; bool ExplainAllTasks = false; +int ExplainAnalyzeSortMethod = EXPLAIN_ANALYZE_SORT_BY_TIME; /* * If enabled, EXPLAIN ANALYZE output & other statistics of last worker task @@ -529,8 +530,34 @@ ExplainMapMergeJob(MapMergeJob *mapMergeJob, ExplainState *es) /* - * ExplainTaskList shows the remote EXPLAIN for the first task in taskList, - * or all tasks if citus.explain_all_tasks is on. + * CompareTasksByFetchedExplainAnalyzeDuration is a helper function to compare two tasks by their execution duration. + */ +static int +CompareTasksByFetchedExplainAnalyzeDuration(const void *leftElement, const + void *rightElement) +{ + const Task *leftTask = *((const Task **) leftElement); + const Task *rightTask = *((const Task **) rightElement); + + double leftTaskExecutionDuration = leftTask->fetchedExplainAnalyzeExecutionDuration; + double rightTaskExecutionDuration = rightTask->fetchedExplainAnalyzeExecutionDuration; + + double diff = leftTaskExecutionDuration - rightTaskExecutionDuration; + if (diff > 0) + { + return -1; + } + else if (diff < 0) + { + return 1; + } + return 0; +} + + +/* + * ExplainTaskList shows the remote EXPLAIN and execution time for the first task + * in taskList, or all tasks if citus.explain_all_tasks is on. */ static void ExplainTaskList(CitusScanState *scanState, List *taskList, ExplainState *es, @@ -540,8 +567,17 @@ ExplainTaskList(CitusScanState *scanState, List *taskList, ExplainState *es, ListCell *remoteExplainCell = NULL; List *remoteExplainList = NIL; - /* make sure that the output is consistent */ - taskList = SortList(taskList, CompareTasksByTaskId); + /* if tasks are executed, we sort them by time; unless we are on a test env */ + if (es->analyze && ExplainAnalyzeSortMethod == EXPLAIN_ANALYZE_SORT_BY_TIME) + { + /* sort by execution duration only in case of ANALYZE */ + taskList = SortList(taskList, CompareTasksByFetchedExplainAnalyzeDuration); + } + else + { + /* make sure that the output is consistent */ + taskList = SortList(taskList, CompareTasksByTaskId); + } foreach(taskCell, taskList) { @@ -1225,13 +1261,15 @@ CreateExplainAnlyzeDestination(Task *task, TupleDestination *taskDest) tupleDestination->originalTaskDestination = taskDest; #if PG_VERSION_NUM >= PG_VERSION_12 - TupleDesc lastSavedExplainAnalyzeTupDesc = CreateTemplateTupleDesc(1); + TupleDesc lastSavedExplainAnalyzeTupDesc = CreateTemplateTupleDesc(2); #else - TupleDesc lastSavedExplainAnalyzeTupDesc = CreateTemplateTupleDesc(1, false); + TupleDesc lastSavedExplainAnalyzeTupDesc = CreateTemplateTupleDesc(2, false); #endif TupleDescInitEntry(lastSavedExplainAnalyzeTupDesc, 1, "explain analyze", TEXTOID, 0, 0); + TupleDescInitEntry(lastSavedExplainAnalyzeTupDesc, 2, "duration", FLOAT8OID, 0, 0); + tupleDestination->lastSavedExplainAnalyzeTupDesc = lastSavedExplainAnalyzeTupDesc; tupleDestination->pub.putTuple = ExplainAnalyzeDestPutTuple; @@ -1271,7 +1309,16 @@ ExplainAnalyzeDestPutTuple(TupleDestination *self, Task *task, return; } + Datum executionDuration = heap_getattr(heapTuple, 2, tupDesc, &isNull); + + if (isNull) + { + ereport(WARNING, (errmsg("received null execution time from worker"))); + return; + } + char *fetchedExplainAnalyzePlan = TextDatumGetCString(explainAnalyze); + double fetchedExplainAnalyzeExecutionDuration = DatumGetFloat8(executionDuration); /* * Allocate fetchedExplainAnalyzePlan in the same context as the Task, since we are @@ -1281,7 +1328,7 @@ ExplainAnalyzeDestPutTuple(TupleDestination *self, Task *task, * calls to CheckNodeCopyAndSerialization() which asserts copy functions of the task * work as expected, which will try to copy this value in a future execution. * - * Why we don't we just allocate this field in executor context and reset it before + * Why don't we just allocate this field in executor context and reset it before * the next execution? Because when an error is raised we can skip pretty much most * of the meaningful places that we can insert the reset. * @@ -1295,6 +1342,8 @@ ExplainAnalyzeDestPutTuple(TupleDestination *self, Task *task, MemoryContextStrdup(taskContext, fetchedExplainAnalyzePlan); tupleDestination->originalTask->fetchedExplainAnalyzePlacementIndex = placementIndex; + tupleDestination->originalTask->fetchedExplainAnalyzeExecutionDuration = + fetchedExplainAnalyzeExecutionDuration; } else { @@ -1385,7 +1434,7 @@ ExplainAnalyzeTaskList(List *originalTaskList, const char *queryString = TaskQueryString(explainAnalyzeTask); char *wrappedQuery = WrapQueryForExplainAnalyze(queryString, tupleDesc); char *fetchQuery = - "SELECT explain_analyze_output FROM worker_last_saved_explain_analyze()"; + "SELECT explain_analyze_output, execution_duration FROM worker_last_saved_explain_analyze()"; SetTaskQueryStringList(explainAnalyzeTask, list_make2(wrappedQuery, fetchQuery)); TupleDestination *originalTaskDest = originalTask->tupleDest ? diff --git a/src/backend/distributed/planner/multi_physical_planner.c b/src/backend/distributed/planner/multi_physical_planner.c index 73d4d3c41..dd4c2ebfd 100644 --- a/src/backend/distributed/planner/multi_physical_planner.c +++ b/src/backend/distributed/planner/multi_physical_planner.c @@ -5570,7 +5570,9 @@ SetPlacementNodeMetadata(ShardPlacement *placement, WorkerNode *workerNode) } -/* Helper function to compare two tasks by their taskId. */ +/* + * CompareTasksByTaskId is a helper function to compare two tasks by their taskId. + */ int CompareTasksByTaskId(const void *leftElement, const void *rightElement) { diff --git a/src/backend/distributed/shared_library_init.c b/src/backend/distributed/shared_library_init.c index b67889c16..6b4673a23 100644 --- a/src/backend/distributed/shared_library_init.c +++ b/src/backend/distributed/shared_library_init.c @@ -191,6 +191,12 @@ static const struct config_enum_entry multi_shard_modify_connection_options[] = { NULL, 0, false } }; +static const struct config_enum_entry explain_analyze_sort_method_options[] = { + { "execution-time", EXPLAIN_ANALYZE_SORT_BY_TIME, false }, + { "taskId", EXPLAIN_ANALYZE_SORT_BY_TASK_ID, false }, + { NULL, 0, false } +}; + /* *INDENT-ON* */ @@ -1487,6 +1493,21 @@ RegisterCitusConfigVariables(void) GUC_NO_SHOW_ALL, NULL, NULL, NULL); + DefineCustomEnumVariable( + "citus.explain_analyze_sort_method", + gettext_noop("Sets the sorting method for EXPLAIN ANALYZE queries."), + gettext_noop("This parameter is intended for testing. It is developed " + "to get consistent regression test outputs. When it is set " + "to 'time', EXPLAIN ANALYZE output is sorted by execution " + "duration on workers. When it is set to 'taskId', it is " + "sorted by task id. By default, it is set to 'time'; but " + "in regression tests, it's set to 'taskId' for consistency."), + &ExplainAnalyzeSortMethod, + EXPLAIN_ANALYZE_SORT_BY_TIME, explain_analyze_sort_method_options, + PGC_USERSET, + GUC_NO_SHOW_ALL, + NULL, NULL, NULL); + /* warn about config items in the citus namespace that are not registered above */ EmitWarningsOnPlaceholders("citus"); } diff --git a/src/backend/distributed/utils/citus_copyfuncs.c b/src/backend/distributed/utils/citus_copyfuncs.c index cecfe2116..24caf6a8d 100644 --- a/src/backend/distributed/utils/citus_copyfuncs.c +++ b/src/backend/distributed/utils/citus_copyfuncs.c @@ -326,6 +326,7 @@ CopyNodeTask(COPYFUNC_ARGS) COPY_SCALAR_FIELD(totalReceivedTupleData); COPY_SCALAR_FIELD(fetchedExplainAnalyzePlacementIndex); COPY_STRING_FIELD(fetchedExplainAnalyzePlan); + COPY_SCALAR_FIELD(fetchedExplainAnalyzeExecutionDuration); } diff --git a/src/backend/distributed/utils/citus_outfuncs.c b/src/backend/distributed/utils/citus_outfuncs.c index 5f30cb860..97d4d47d6 100644 --- a/src/backend/distributed/utils/citus_outfuncs.c +++ b/src/backend/distributed/utils/citus_outfuncs.c @@ -527,6 +527,11 @@ OutTask(OUTFUNC_ARGS) WRITE_NODE_FIELD(rowValuesLists); WRITE_BOOL_FIELD(partiallyLocalOrRemote); WRITE_BOOL_FIELD(parametersInQueryStringResolved); + WRITE_INT_FIELD(queryCount); + WRITE_UINT64_FIELD(totalReceivedTupleData); + WRITE_INT_FIELD(fetchedExplainAnalyzePlacementIndex); + WRITE_STRING_FIELD(fetchedExplainAnalyzePlan); + WRITE_FLOAT_FIELD(fetchedExplainAnalyzeExecutionDuration, "%.2f"); } diff --git a/src/include/distributed/multi_explain.h b/src/include/distributed/multi_explain.h index 01c07b4ad..296634905 100644 --- a/src/include/distributed/multi_explain.h +++ b/src/include/distributed/multi_explain.h @@ -13,9 +13,16 @@ #include "executor/executor.h" #include "tuple_destination.h" +typedef enum +{ + EXPLAIN_ANALYZE_SORT_BY_TIME = 0, + EXPLAIN_ANALYZE_SORT_BY_TASK_ID = 1 +} ExplainAnalyzeSortMethods; + /* Config variables managed via guc.c to explain distributed query plans */ extern bool ExplainDistributedQueries; extern bool ExplainAllTasks; +extern int ExplainAnalyzeSortMethod; extern void FreeSavedExplainPlan(void); extern void CitusExplainOneQuery(Query *query, int cursorOptions, IntoClause *into, diff --git a/src/include/distributed/multi_physical_planner.h b/src/include/distributed/multi_physical_planner.h index d13595d0f..b8a94e929 100644 --- a/src/include/distributed/multi_physical_planner.h +++ b/src/include/distributed/multi_physical_planner.h @@ -321,6 +321,12 @@ typedef struct Task */ char *fetchedExplainAnalyzePlan; int fetchedExplainAnalyzePlacementIndex; + + /* + * Execution Duration fetched from worker. This is saved to be used later by + * ExplainTaskList(). + */ + double fetchedExplainAnalyzeExecutionDuration; } Task; @@ -564,6 +570,8 @@ extern List * RoundRobinAssignTaskList(List *taskList); extern List * RoundRobinReorder(List *placementList); extern void SetPlacementNodeMetadata(ShardPlacement *placement, WorkerNode *workerNode); extern int CompareTasksByTaskId(const void *leftElement, const void *rightElement); +extern int CompareTasksByExecutionDuration(const void *leftElement, const + void *rightElement); /* function declaration for creating Task */ extern List * QueryPushdownSqlTaskList(Query *query, uint64 jobId, diff --git a/src/test/regress/expected/multi_explain.out b/src/test/regress/expected/multi_explain.out index ef39545b3..80a43b26e 100644 --- a/src/test/regress/expected/multi_explain.out +++ b/src/test/regress/expected/multi_explain.out @@ -2924,3 +2924,37 @@ Custom Scan (Citus Adaptive) (actual rows=0 loops=1) Node: host=localhost port=xxxxx dbname=regression -> Seq Scan on users_table_2_570028 users_table_2 (actual rows=0 loops=1) Filter: ((value_1)::text = '00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000X'::text) +-- sorted explain analyze output +CREATE TABLE explain_analyze_execution_time (a int); +INSERT INTO explain_analyze_execution_time VALUES (2); +SELECT create_distributed_table('explain_analyze_execution_time', 'a'); + +-- show that we can sort the output wrt execution time +-- we do the following hack to make the test outputs +-- be consistent. First, ingest a single row then add +-- pg_sleep() call on the query. Postgres will only +-- sleep for the shard that has the single row, so that +-- will definitely be slower +set citus.explain_analyze_sort_method to "taskId"; +EXPLAIN (COSTS FALSE, ANALYZE TRUE, TIMING FALSE, SUMMARY FALSE) select a, CASE WHEN pg_sleep(0.4) IS NULL THEN 'x' END from explain_analyze_execution_time; +Custom Scan (Citus Adaptive) (actual rows=1 loops=1) + Task Count: 2 + Tuple data received from nodes: 1 bytes + Tasks Shown: One of 2 + -> Task + Tuple data received from node: 0 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Seq Scan on explain_analyze_execution_time_570029 explain_analyze_execution_time (actual rows=0 loops=1) +set citus.explain_analyze_sort_method to "execution-time"; +EXPLAIN (COSTS FALSE, ANALYZE TRUE, TIMING FALSE, SUMMARY FALSE) select a, CASE WHEN pg_sleep(0.4) IS NULL THEN 'x' END from explain_analyze_execution_time; +Custom Scan (Citus Adaptive) (actual rows=1 loops=1) + Task Count: 2 + Tuple data received from nodes: 1 bytes + Tasks Shown: One of 2 + -> Task + Tuple data received from node: 1 bytes + Node: host=localhost port=xxxxx dbname=regression + -> Seq Scan on explain_analyze_execution_time_570030 explain_analyze_execution_time (actual rows=1 loops=1) +-- reset back +reset citus.explain_analyze_sort_method; +DROP TABLE explain_analyze_execution_time; diff --git a/src/test/regress/pg_regress_multi.pl b/src/test/regress/pg_regress_multi.pl index 5614bfbec..3e3fdaa39 100755 --- a/src/test/regress/pg_regress_multi.pl +++ b/src/test/regress/pg_regress_multi.pl @@ -416,6 +416,7 @@ push(@pgOptions, '-c', "citus.task_tracker_delay=10ms"); push(@pgOptions, '-c', "citus.remote_task_check_interval=1ms"); push(@pgOptions, '-c', "citus.shard_replication_factor=2"); push(@pgOptions, '-c', "citus.node_connection_timeout=${connectionTimeout}"); +push(@pgOptions, '-c', "citus.explain_analyze_sort_method=taskId"); # we disable slow start by default to encourage parallelism within tests push(@pgOptions, '-c', "citus.executor_slow_start_interval=0ms"); diff --git a/src/test/regress/sql/multi_explain.sql b/src/test/regress/sql/multi_explain.sql index a08bbbecf..2f0c9ac45 100644 --- a/src/test/regress/sql/multi_explain.sql +++ b/src/test/regress/sql/multi_explain.sql @@ -1034,3 +1034,21 @@ EXPLAIN :default_analyze_flags execute p4(20,20); -- simple test to confirm we can fetch long (>4KB) plans EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM users_table_2 WHERE value_1::text = '00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000X'; + +-- sorted explain analyze output +CREATE TABLE explain_analyze_execution_time (a int); +INSERT INTO explain_analyze_execution_time VALUES (2); +SELECT create_distributed_table('explain_analyze_execution_time', 'a'); +-- show that we can sort the output wrt execution time +-- we do the following hack to make the test outputs +-- be consistent. First, ingest a single row then add +-- pg_sleep() call on the query. Postgres will only +-- sleep for the shard that has the single row, so that +-- will definitely be slower +set citus.explain_analyze_sort_method to "taskId"; +EXPLAIN (COSTS FALSE, ANALYZE TRUE, TIMING FALSE, SUMMARY FALSE) select a, CASE WHEN pg_sleep(0.4) IS NULL THEN 'x' END from explain_analyze_execution_time; +set citus.explain_analyze_sort_method to "execution-time"; +EXPLAIN (COSTS FALSE, ANALYZE TRUE, TIMING FALSE, SUMMARY FALSE) select a, CASE WHEN pg_sleep(0.4) IS NULL THEN 'x' END from explain_analyze_execution_time; +-- reset back +reset citus.explain_analyze_sort_method; +DROP TABLE explain_analyze_execution_time;