Merge pull request #4191 from citusdata/sort-explain-analyze-output-by-time

Sort explain analyze output by task time
2020-09-24 14:38:06 +03:00 · 2020-09-24 14:38:06 +03:00 · e892e253b1
parent 64d5ac6a10 abfb79bda6
commit e892e253b1
10 changed files with 155 additions and 9 deletions
--- a/src/backend/distributed/planner/multi_explain.c
+++ b/src/backend/distributed/planner/multi_explain.c
@ -68,6 +68,7 @@
 /* Config variables that enable printing distributed query plans */
 bool ExplainDistributedQueries = true;
 bool ExplainAllTasks = false;
+int ExplainAnalyzeSortMethod = EXPLAIN_ANALYZE_SORT_BY_TIME;

 /*
 * If enabled, EXPLAIN ANALYZE output & other statistics of last worker task
@ -529,8 +530,34 @@ ExplainMapMergeJob(MapMergeJob *mapMergeJob, ExplainState *es)


 /*
- * ExplainTaskList shows the remote EXPLAIN for the first task in taskList,
- * or all tasks if citus.explain_all_tasks is on.
+ * CompareTasksByFetchedExplainAnalyzeDuration is a helper function to compare two tasks by their execution duration.
+ */
+static int
+CompareTasksByFetchedExplainAnalyzeDuration(const void *leftElement, const
+											void *rightElement)
+{
+	const Task *leftTask = *((const Task **) leftElement);
+	const Task *rightTask = *((const Task **) rightElement);
+
+	double leftTaskExecutionDuration = leftTask->fetchedExplainAnalyzeExecutionDuration;
+	double rightTaskExecutionDuration = rightTask->fetchedExplainAnalyzeExecutionDuration;
+
+	double diff = leftTaskExecutionDuration - rightTaskExecutionDuration;
+	if (diff > 0)
+	{
+		return -1;
+	}
+	else if (diff < 0)
+	{
+		return 1;
+	}
+	return 0;
+}
+
+
+/*
+ * ExplainTaskList shows the remote EXPLAIN and execution time for the first task
+ * in taskList, or all tasks if citus.explain_all_tasks is on.
 */
 static void
 ExplainTaskList(CitusScanState *scanState, List *taskList, ExplainState *es,
@ -540,8 +567,17 @@ ExplainTaskList(CitusScanState *scanState, List *taskList, ExplainState *es,
 	ListCell *remoteExplainCell = NULL;
 	List *remoteExplainList = NIL;

-	/* make sure that the output is consistent */
-	taskList = SortList(taskList, CompareTasksByTaskId);
+	/* if tasks are executed, we sort them by time; unless we are on a test env */
+	if (es->analyze && ExplainAnalyzeSortMethod == EXPLAIN_ANALYZE_SORT_BY_TIME)
+	{
+		/* sort by execution duration only in case of ANALYZE */
+		taskList = SortList(taskList, CompareTasksByFetchedExplainAnalyzeDuration);
+	}
+	else
+	{
+		/* make sure that the output is consistent */
+		taskList = SortList(taskList, CompareTasksByTaskId);
+	}

 	foreach(taskCell, taskList)
 	{
@ -1225,13 +1261,15 @@ CreateExplainAnlyzeDestination(Task *task, TupleDestination *taskDest)
 	tupleDestination->originalTaskDestination = taskDest;

 #if PG_VERSION_NUM >= PG_VERSION_12
-	TupleDesc lastSavedExplainAnalyzeTupDesc = CreateTemplateTupleDesc(1);
+	TupleDesc lastSavedExplainAnalyzeTupDesc = CreateTemplateTupleDesc(2);
 #else
-	TupleDesc lastSavedExplainAnalyzeTupDesc = CreateTemplateTupleDesc(1, false);
+	TupleDesc lastSavedExplainAnalyzeTupDesc = CreateTemplateTupleDesc(2, false);
 #endif

 	TupleDescInitEntry(lastSavedExplainAnalyzeTupDesc, 1, "explain analyze", TEXTOID, 0,
 					   0);
+	TupleDescInitEntry(lastSavedExplainAnalyzeTupDesc, 2, "duration", FLOAT8OID, 0, 0);
+
 	tupleDestination->lastSavedExplainAnalyzeTupDesc = lastSavedExplainAnalyzeTupDesc;

 	tupleDestination->pub.putTuple = ExplainAnalyzeDestPutTuple;
@ -1271,7 +1309,16 @@ ExplainAnalyzeDestPutTuple(TupleDestination *self, Task *task,
 			return;
 		}

+		Datum executionDuration = heap_getattr(heapTuple, 2, tupDesc, &isNull);
+
+		if (isNull)
+		{
+			ereport(WARNING, (errmsg("received null execution time from worker")));
+			return;
+		}
+
 		char *fetchedExplainAnalyzePlan = TextDatumGetCString(explainAnalyze);
+		double fetchedExplainAnalyzeExecutionDuration = DatumGetFloat8(executionDuration);

 		/*
 		 * Allocate fetchedExplainAnalyzePlan in the same context as the Task, since we are
@ -1281,7 +1328,7 @@ ExplainAnalyzeDestPutTuple(TupleDestination *self, Task *task,
 		 * calls to CheckNodeCopyAndSerialization() which asserts copy functions of the task
 		 * work as expected, which will try to copy this value in a future execution.
 		 *
-		 * Why we don't we just allocate this field in executor context and reset it before
+		 * Why don't we just allocate this field in executor context and reset it before
 		 * the next execution? Because when an error is raised we can skip pretty much most
 		 * of the meaningful places that we can insert the reset.
 		 *
@ -1295,6 +1342,8 @@ ExplainAnalyzeDestPutTuple(TupleDestination *self, Task *task,
 			MemoryContextStrdup(taskContext, fetchedExplainAnalyzePlan);
 		tupleDestination->originalTask->fetchedExplainAnalyzePlacementIndex =
 			placementIndex;
+		tupleDestination->originalTask->fetchedExplainAnalyzeExecutionDuration =
+			fetchedExplainAnalyzeExecutionDuration;
 	}
 	else
 	{
@ -1385,7 +1434,7 @@ ExplainAnalyzeTaskList(List *originalTaskList,
 		const char *queryString = TaskQueryString(explainAnalyzeTask);
 		char *wrappedQuery = WrapQueryForExplainAnalyze(queryString, tupleDesc);
 		char *fetchQuery =
-			"SELECT explain_analyze_output FROM worker_last_saved_explain_analyze()";
+			"SELECT explain_analyze_output, execution_duration FROM worker_last_saved_explain_analyze()";
 		SetTaskQueryStringList(explainAnalyzeTask, list_make2(wrappedQuery, fetchQuery));

 		TupleDestination *originalTaskDest = originalTask->tupleDest ?
--- a/src/backend/distributed/planner/multi_physical_planner.c
+++ b/src/backend/distributed/planner/multi_physical_planner.c
@ -5570,7 +5570,9 @@ SetPlacementNodeMetadata(ShardPlacement *placement, WorkerNode *workerNode)
 }


-/* Helper function to compare two tasks by their taskId. */
+/*
+ * CompareTasksByTaskId is a helper function to compare two tasks by their taskId.
+ */
 int
 CompareTasksByTaskId(const void *leftElement, const void *rightElement)
 {
--- a/src/backend/distributed/shared_library_init.c
+++ b/src/backend/distributed/shared_library_init.c
@ -191,6 +191,12 @@ static const struct config_enum_entry multi_shard_modify_connection_options[] =
 	{ NULL, 0, false }
 };

+static const struct config_enum_entry explain_analyze_sort_method_options[] = {
+	{ "execution-time", EXPLAIN_ANALYZE_SORT_BY_TIME, false },
+	{ "taskId", EXPLAIN_ANALYZE_SORT_BY_TASK_ID, false },
+	{ NULL, 0, false }
+};
+
 /* *INDENT-ON* */


@ -1487,6 +1493,21 @@ RegisterCitusConfigVariables(void)
 		GUC_NO_SHOW_ALL,
 		NULL, NULL, NULL);

+	DefineCustomEnumVariable(
+		"citus.explain_analyze_sort_method",
+		gettext_noop("Sets the sorting method for EXPLAIN ANALYZE queries."),
+		gettext_noop("This parameter is intended for testing. It is developed "
+					 "to get consistent regression test outputs. When it is set "
+					 "to 'time', EXPLAIN ANALYZE output is sorted by execution "
+					 "duration on workers. When it is set to 'taskId', it is "
+					 "sorted by task id. By default, it is set to 'time'; but "
+					 "in regression tests, it's set to 'taskId' for consistency."),
+		&ExplainAnalyzeSortMethod,
+		EXPLAIN_ANALYZE_SORT_BY_TIME, explain_analyze_sort_method_options,
+		PGC_USERSET,
+		GUC_NO_SHOW_ALL,
+		NULL, NULL, NULL);
+
 	/* warn about config items in the citus namespace that are not registered above */
 	EmitWarningsOnPlaceholders("citus");
 }
--- a/src/backend/distributed/utils/citus_copyfuncs.c
+++ b/src/backend/distributed/utils/citus_copyfuncs.c
@ -326,6 +326,7 @@ CopyNodeTask(COPYFUNC_ARGS)
 	COPY_SCALAR_FIELD(totalReceivedTupleData);
 	COPY_SCALAR_FIELD(fetchedExplainAnalyzePlacementIndex);
 	COPY_STRING_FIELD(fetchedExplainAnalyzePlan);
+	COPY_SCALAR_FIELD(fetchedExplainAnalyzeExecutionDuration);
 }


--- a/src/backend/distributed/utils/citus_outfuncs.c
+++ b/src/backend/distributed/utils/citus_outfuncs.c
@ -527,6 +527,11 @@ OutTask(OUTFUNC_ARGS)
 	WRITE_NODE_FIELD(rowValuesLists);
 	WRITE_BOOL_FIELD(partiallyLocalOrRemote);
 	WRITE_BOOL_FIELD(parametersInQueryStringResolved);
+	WRITE_INT_FIELD(queryCount);
+	WRITE_UINT64_FIELD(totalReceivedTupleData);
+	WRITE_INT_FIELD(fetchedExplainAnalyzePlacementIndex);
+	WRITE_STRING_FIELD(fetchedExplainAnalyzePlan);
+	WRITE_FLOAT_FIELD(fetchedExplainAnalyzeExecutionDuration, "%.2f");
 }


--- a/src/include/distributed/multi_explain.h
+++ b/src/include/distributed/multi_explain.h
@ -13,9 +13,16 @@
 #include "executor/executor.h"
 #include "tuple_destination.h"

+typedef enum
+{
+	EXPLAIN_ANALYZE_SORT_BY_TIME = 0,
+	EXPLAIN_ANALYZE_SORT_BY_TASK_ID = 1
+} ExplainAnalyzeSortMethods;
+
 /* Config variables managed via guc.c to explain distributed query plans */
 extern bool ExplainDistributedQueries;
 extern bool ExplainAllTasks;
+extern int ExplainAnalyzeSortMethod;

 extern void FreeSavedExplainPlan(void);
 extern void CitusExplainOneQuery(Query *query, int cursorOptions, IntoClause *into,
--- a/src/include/distributed/multi_physical_planner.h
+++ b/src/include/distributed/multi_physical_planner.h
@ -321,6 +321,12 @@ typedef struct Task
 	 */
 	char *fetchedExplainAnalyzePlan;
 	int fetchedExplainAnalyzePlacementIndex;
+
+	/*
+	 * Execution Duration fetched from worker. This is saved to be used later by
+	 * ExplainTaskList().
+	 */
+	double fetchedExplainAnalyzeExecutionDuration;
 } Task;


@ -564,6 +570,8 @@ extern List * RoundRobinAssignTaskList(List *taskList);
 extern List * RoundRobinReorder(List *placementList);
 extern void SetPlacementNodeMetadata(ShardPlacement *placement, WorkerNode *workerNode);
 extern int CompareTasksByTaskId(const void *leftElement, const void *rightElement);
+extern int CompareTasksByExecutionDuration(const void *leftElement, const
+										   void *rightElement);

 /* function declaration for creating Task */
 extern List * QueryPushdownSqlTaskList(Query *query, uint64 jobId,
--- a/src/test/regress/expected/multi_explain.out
+++ b/src/test/regress/expected/multi_explain.out
@ -2924,3 +2924,37 @@ Custom Scan (Citus Adaptive) (actual rows=0 loops=1)
        Node: host=localhost port=xxxxx dbname=regression
        ->  Seq Scan on users_table_2_570028 users_table_2 (actual rows=0 loops=1)
              Filter: ((value_1)::text = '00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000X'::text)
+-- sorted explain analyze output
+CREATE TABLE explain_analyze_execution_time (a int);
+INSERT INTO explain_analyze_execution_time VALUES (2);
+SELECT create_distributed_table('explain_analyze_execution_time', 'a');
+
+-- show that we can sort the output wrt execution time
+-- we do the following hack to make the test outputs
+-- be consistent. First, ingest a single row then add
+-- pg_sleep() call on the query. Postgres will only
+-- sleep for the shard that has the single row, so that
+-- will definitely be slower
+set citus.explain_analyze_sort_method to "taskId";
+EXPLAIN (COSTS FALSE, ANALYZE TRUE, TIMING FALSE, SUMMARY FALSE) select a, CASE WHEN pg_sleep(0.4) IS NULL THEN  'x' END from explain_analyze_execution_time;
+Custom Scan (Citus Adaptive) (actual rows=1 loops=1)
+  Task Count: 2
+  Tuple data received from nodes: 1 bytes
+  Tasks Shown: One of 2
+  ->  Task
+        Tuple data received from node: 0 bytes
+        Node: host=localhost port=xxxxx dbname=regression
+        ->  Seq Scan on explain_analyze_execution_time_570029 explain_analyze_execution_time (actual rows=0 loops=1)
+set citus.explain_analyze_sort_method to "execution-time";
+EXPLAIN (COSTS FALSE, ANALYZE TRUE, TIMING FALSE, SUMMARY FALSE) select a, CASE WHEN pg_sleep(0.4) IS NULL THEN  'x' END from explain_analyze_execution_time;
+Custom Scan (Citus Adaptive) (actual rows=1 loops=1)
+  Task Count: 2
+  Tuple data received from nodes: 1 bytes
+  Tasks Shown: One of 2
+  ->  Task
+        Tuple data received from node: 1 bytes
+        Node: host=localhost port=xxxxx dbname=regression
+        ->  Seq Scan on explain_analyze_execution_time_570030 explain_analyze_execution_time (actual rows=1 loops=1)
+-- reset back
+reset citus.explain_analyze_sort_method;
+DROP TABLE explain_analyze_execution_time;
--- a/src/test/regress/pg_regress_multi.pl
+++ b/src/test/regress/pg_regress_multi.pl
@ -416,6 +416,7 @@ push(@pgOptions, '-c', "citus.task_tracker_delay=10ms");
 push(@pgOptions, '-c', "citus.remote_task_check_interval=1ms");
 push(@pgOptions, '-c', "citus.shard_replication_factor=2");
 push(@pgOptions, '-c', "citus.node_connection_timeout=${connectionTimeout}");
+push(@pgOptions, '-c', "citus.explain_analyze_sort_method=taskId");

 # we disable slow start by default to encourage parallelism within tests
 push(@pgOptions, '-c', "citus.executor_slow_start_interval=0ms");
--- a/src/test/regress/sql/multi_explain.sql
+++ b/src/test/regress/sql/multi_explain.sql
@ -1034,3 +1034,21 @@ EXPLAIN :default_analyze_flags execute p4(20,20);

 -- simple test to confirm we can fetch long (>4KB) plans
 EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM users_table_2 WHERE value_1::text = '00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000X';
+
+-- sorted explain analyze output
+CREATE TABLE explain_analyze_execution_time (a int);
+INSERT INTO explain_analyze_execution_time VALUES (2);
+SELECT create_distributed_table('explain_analyze_execution_time', 'a');
+-- show that we can sort the output wrt execution time
+-- we do the following hack to make the test outputs
+-- be consistent. First, ingest a single row then add
+-- pg_sleep() call on the query. Postgres will only
+-- sleep for the shard that has the single row, so that
+-- will definitely be slower
+set citus.explain_analyze_sort_method to "taskId";
+EXPLAIN (COSTS FALSE, ANALYZE TRUE, TIMING FALSE, SUMMARY FALSE) select a, CASE WHEN pg_sleep(0.4) IS NULL THEN  'x' END from explain_analyze_execution_time;
+set citus.explain_analyze_sort_method to "execution-time";
+EXPLAIN (COSTS FALSE, ANALYZE TRUE, TIMING FALSE, SUMMARY FALSE) select a, CASE WHEN pg_sleep(0.4) IS NULL THEN  'x' END from explain_analyze_execution_time;
+-- reset back
+reset citus.explain_analyze_sort_method;
+DROP TABLE explain_analyze_execution_time;