Add citus.distributed_data_dump to only return results from local shards

marcocitus/distributed-data-dump
Marco Slot 2023-08-17 09:46:01 +02:00
parent 682dca1f12
commit 738b41c055
7 changed files with 200 additions and 1 deletion


@@ -315,6 +315,13 @@ PublishDistributedTableChanges(LogicalDecodingContext *ctx, ReorderBufferTXN *tx
return;
}
/*
* TODO: consider replicated shards
*
* We should only emit from one of the replicas, and it should align with the
* IsDistributedDataDump logic.
*/
/* translate and publish from shard relation to distributed table relation for CDC. */
TranslateAndPublishRelationForCDC(ctx, txn, relation, change, shardId,
distRelationId);


@@ -92,6 +92,7 @@
#include "distributed/resource_lock.h" #include "distributed/resource_lock.h"
#include "distributed/shard_pruning.h" #include "distributed/shard_pruning.h"
#include "distributed/shared_connection_stats.h" #include "distributed/shared_connection_stats.h"
#include "distributed/task_execution_utils.h"
#include "distributed/version_compat.h" #include "distributed/version_compat.h"
#include "distributed/worker_protocol.h" #include "distributed/worker_protocol.h"
#include "distributed/local_multi_copy.h" #include "distributed/local_multi_copy.h"
@@ -3010,11 +3011,33 @@ CitusCopyTo(CopyStmt *copyStatement, QueryCompletion *completionTag)
ShardInterval *shardInterval = lfirst(shardIntervalCell);
List *shardPlacementList = ActiveShardPlacementList(shardInterval->shardId);
ListCell *shardPlacementCell = NULL;
- int placementIndex = 0;
+ int placementIndex = -1;
StringInfo copyCommand = ConstructCopyStatement(copyStatement,
shardInterval->shardId);
/*
* When citus.distributed_data_dump is enabled, only emit from local shards.
*
* That way, users can run COPY table TO STDOUT on all nodes to get a full
* copy of the data with much higher bandwidth than running it via the
* coordinator. Moreover, each command can use a snapshot that aligns with
* a specific replication slot.
*/
if (IsDistributedDataDump)
{
List *newPlacementList =
TaskPlacementListForDistributedDataDump(shardPlacementList);
if (newPlacementList == NIL)
{
/* shard does not have local placements */
continue;
}
shardPlacementList = newPlacementList;
}
foreach(shardPlacementCell, shardPlacementList)
{
ShardPlacement *shardPlacement = lfirst(shardPlacementCell);
@@ -3022,6 +3045,8 @@ CitusCopyTo(CopyStmt *copyStatement, QueryCompletion *completionTag)
char *userName = NULL;
const bool raiseErrors = true;
placementIndex++;
MultiConnection *connection = GetPlacementConnection(connectionFlags,
shardPlacement,
userName);
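
A minimal usage sketch for the COPY path (the table name my_table is hypothetical): with the GUC enabled, running the same COPY against every node and concatenating the outputs yields exactly one full copy of the distributed table, without funneling the data through the coordinator.

    SET citus.distributed_data_dump TO on;
    COPY my_table TO STDOUT;
    -- shards without a local placement are skipped, so the per-node
    -- outputs are disjoint and together cover the whole table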


@@ -161,6 +161,7 @@
#include "distributed/resource_lock.h" #include "distributed/resource_lock.h"
#include "distributed/shared_connection_stats.h" #include "distributed/shared_connection_stats.h"
#include "distributed/subplan_execution.h" #include "distributed/subplan_execution.h"
#include "distributed/task_execution_utils.h"
#include "distributed/transaction_management.h" #include "distributed/transaction_management.h"
#include "distributed/transaction_identifier.h" #include "distributed/transaction_identifier.h"
#include "distributed/tuple_destination.h" #include "distributed/tuple_destination.h"
@@ -854,6 +855,47 @@ AdaptiveExecutor(CitusScanState *scanState)
MarkUnreferencedExternParams((Node *) job->jobQuery, paramListInfo);
}
/*
* When citus.distributed_data_dump is enabled, only emit from local shards.
*
* That way, users can run SELECT * FROM table on all nodes to get a full
* copy of the data with much higher bandwidth than running it via the
* coordinator. Moreover, each command can use a snapshot that aligns with
* a specific replication slot.
*/
if (IsDistributedDataDump)
{
/*
* Throw errors for writes and complex queries, to avoid confusion and
* data integrity issues.
*/
if (job->jobQuery->commandType != CMD_SELECT)
{
ereport(ERROR, (errmsg("can only use citus.distributed_data_dump for "
"SELECT queries")));
}
if (distributedPlan->modLevel != ROW_MODIFY_READONLY)
{
ereport(ERROR, (errmsg("can only use citus.distributed_data_dump for "
"read-only queries")));
}
if (list_length(distributedPlan->relationIdList) != 1)
{
ereport(ERROR, (errmsg("can only use citus.distributed_data_dump for "
"single-table queries")));
}
if (hasDependentJobs)
{
ereport(ERROR, (errmsg("cannot use citus.distributed_data_dump for "
"re-partition joins")));
}
taskList = TaskListForDistributedDataDump(taskList);
}
DistributedExecution *execution = CreateDistributedExecution(
distributedPlan->modLevel,
taskList,
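
A sketch of the snapshot-alignment scenario the comment above describes (the snapshot ID and table name are hypothetical; the snapshot is assumed to have been exported while creating a logical replication slot over the replication protocol):

    BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
    SET TRANSACTION SNAPSHOT '00000003-00000002-1';
    SET citus.distributed_data_dump TO on;
    SELECT * FROM my_table;  -- only rows from shards local to this node
    COMMIT;

Run on every node, this produces an initial copy that lines up with the changes each node's replication slot will stream afterwards.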


@@ -1166,6 +1166,22 @@ RegisterCitusConfigVariables(void)
GUC_STANDARD,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"citus.distributed_data_dump",
gettext_noop("When enabled, queries only return data from local shards"),
gettext_noop("When you need a full copy of a set of Citus tables, it can be "
"useful to only return data from local shards, or from the first "
"replica of replicated shards (incl. reference tables). That way, "
"you can pull from all nodes concurrently and construct a complete "
"snapshot. This is also necessary for logical replications "
"scenarios, since the snapshot of the data on each node needs to "
"be aligned with the replication slot on that node."),
&IsDistributedDataDump,
false,
PGC_USERSET,
GUC_STANDARD,
NULL, NULL, NULL);
DefineCustomRealVariable(
"citus.distributed_deadlock_detection_factor",
gettext_noop("Sets the time to wait before checking for distributed "


@@ -57,6 +57,9 @@ static Task * TaskHashEnter(HTAB *taskHash, Task *task);
static Task * TaskHashLookup(HTAB *trackerHash, TaskType taskType, uint64 jobId,
uint32 taskId);
/* citus.distributed_data_dump GUC */
bool IsDistributedDataDump;
/*
* CreateTaskListForJobTree visits all tasks in the job tree (by following dependentTaskList),
* starting with the given job's task list. The function then returns the list.
@@ -194,3 +197,104 @@ TaskHashLookup(HTAB *taskHash, TaskType taskType, uint64 jobId, uint32 taskId)
return task;
}
/*
* TaskListForDistributedDataDump returns a task list for use when
* citus.distributed_data_dump is enabled, keeping only the tasks that
* have a local placement to emit from.
*/
List *
TaskListForDistributedDataDump(List *taskList)
{
List *newTaskList = NIL;
Task *task = NULL;
foreach_ptr(task, taskList)
{
List *newPlacementList =
TaskPlacementListForDistributedDataDump(task->taskPlacementList);
if (newPlacementList == NIL)
{
/* skip task if there are no placements */
continue;
}
task->taskPlacementList = newPlacementList;
newTaskList = lappend(newTaskList, task);
}
return newTaskList;
}
/*
* TaskPlacementListForDistributedDataDump implements the logic for finding
* task placements to use when doing a distributed data dump.
*
* It returns a new task placement list that only includes a local
* placement (if any). In case of a replicated placement (e.g. reference
* table), it only returns the placement on the coordinator or the one with
* the lowest ID.
*
* This logic should align with how CDC handles replicated placements.
*/
List *
TaskPlacementListForDistributedDataDump(List *taskPlacementList)
{
	List *newPlacementList = NIL;
	ShardPlacement *localPlacement = NULL;
	ShardPlacement *minPlacement = NULL;
	bool hasCoordinatorPlacement = false;
	bool isCoordinator = IsCoordinator();
	int localGroupId = GetLocalGroupId();
	ShardPlacement *taskPlacement = NULL;
	foreach_ptr(taskPlacement, taskPlacementList)
	{
		if (taskPlacement->groupId == localGroupId)
		{
			/* found the placement on the local node (at most one per shard) */
			localPlacement = taskPlacement;
		}
		if (taskPlacement->groupId == COORDINATOR_GROUP_ID)
		{
			/* remember whether the coordinator holds a placement */
			hasCoordinatorPlacement = true;
		}
		if (minPlacement == NULL ||
			taskPlacement->placementId < minPlacement->placementId)
		{
			/* track the placement with the lowest ID across all nodes */
			minPlacement = taskPlacement;
		}
	}
	if (localPlacement == NULL)
	{
		/* shard has no local placement, another node emits it */
		return NIL;
	}
	if (isCoordinator)
	{
		/*
		 * If the coordinator has a placement, we prefer to emit from
		 * the coordinator. This is mainly to align with
		 * PublishDistributedTableChanges, which only emits changes
		 * to reference tables from the coordinator.
		 */
		newPlacementList = lappend(newPlacementList, localPlacement);
	}
	else if (hasCoordinatorPlacement)
	{
		/*
		 * We are not the coordinator, but there is a coordinator placement.
		 * The coordinator should emit instead, so we return an empty list.
		 */
	}
	else if (localPlacement == minPlacement)
	{
		/*
		 * For replicated shards that do not have a coordinator placement,
		 * emit the local placement only if it is the one with the lowest
		 * placement ID.
		 */
		newPlacementList = lappend(newPlacementList, localPlacement);
	}
	return newPlacementList;
}
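
For example, a reference table with placements on the coordinator (placement ID 10), worker A (placement ID 11), and worker B (placement ID 12) is emitted only by the coordinator. If the same table only had placements on worker A and worker B, only worker A would emit it, since it holds the lowest placement ID. (Placement IDs here are illustrative.)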


@@ -72,6 +72,9 @@ extern int ExecutorSlowStartInterval;
extern bool SortReturning;
extern int ExecutorLevel;
/* citus.distributed_data_dump GUC */
extern bool IsDistributedDataDump;
extern void CitusExecutorStart(QueryDesc *queryDesc, int eflags);
extern void CitusExecutorRun(QueryDesc *queryDesc, ScanDirection direction, uint64 count,


@@ -2,5 +2,7 @@
#define TASK_EXECUTION_UTILS_H
extern List * CreateTaskListForJobTree(List *jobTaskList);
extern List * TaskListForDistributedDataDump(List *taskList);
extern List * TaskPlacementListForDistributedDataDump(List *taskPlacementList);
#endif /* TASK_EXECUTION_UTILS_H */