citus/src/include/distributed/multi_physical_planner.h

/*-------------------------------------------------------------------------
 *
 * multi_physical_planner.h
 *    Type and function declarations used in creating the distributed
 *    execution plan.
 *
 * Copyright (c) 2012-2016, Citus Data, Inc.
 *
 * $Id$
 *
 *-------------------------------------------------------------------------
 */
#ifndef MULTI_PHYSICAL_PLANNER_H
#define MULTI_PHYSICAL_PLANNER_H

#include "postgres.h"
#include "c.h"
#include "datatype/timestamp.h"
#include "distributed/citus_nodes.h"
#include "distributed/errormessage.h"
#include "distributed/master_metadata_utility.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/distributed_planner.h"
#include "lib/stringinfo.h"
#include "nodes/parsenodes.h"
#include "utils/array.h"
/* Definitions local to the physical planner */
#define ARRAY_OUT_FUNC_ID 751
#define NON_PRUNABLE_JOIN -1
#define RESERVED_HASHED_COLUMN_ID MaxAttrNumber
#define MERGE_COLUMN_FORMAT "merge_column_%u"
#define MAP_OUTPUT_FETCH_COMMAND "SELECT worker_fetch_partition_file \
(" UINT64_FORMAT ", %u, %u, %u, '%s', %u)"
#define RANGE_PARTITION_COMMAND "SELECT worker_range_partition_table \
(" UINT64_FORMAT ", %d, %s, '%s', '%s'::regtype, %s)"
#define HASH_PARTITION_COMMAND "SELECT worker_hash_partition_table \
(" UINT64_FORMAT ", %d, %s, '%s', '%s'::regtype, %s)"
#define MERGE_FILES_INTO_TABLE_COMMAND "SELECT worker_merge_files_into_table \
(" UINT64_FORMAT ", %d, '%s', '%s')"
#define MERGE_FILES_AND_RUN_QUERY_COMMAND \
"SELECT worker_merge_files_and_run_query(" UINT64_FORMAT ", %d, %s, %s)"

/*
 * CitusRTEKind mirrors the PostgreSQL RTEKind values and extends them with
 * two Citus-specific kinds for shard and remote query references.
 */
typedef enum CitusRTEKind
{
    CITUS_RTE_RELATION = RTE_RELATION,  /* ordinary relation reference */
    CITUS_RTE_SUBQUERY = RTE_SUBQUERY,  /* subquery in FROM */
    CITUS_RTE_JOIN = RTE_JOIN,          /* join */
    CITUS_RTE_FUNCTION = RTE_FUNCTION,  /* function in FROM */
#if (PG_VERSION_NUM >= 100000)
    CITUS_RTE_TABLEFUNC = RTE_TABLEFUNC,  /* TableFunc(.., column list) */
#endif
    CITUS_RTE_VALUES = RTE_VALUES,      /* VALUES (<exprlist>), (<exprlist>), ... */
    CITUS_RTE_CTE = RTE_CTE,            /* common table expr (WITH list element) */
#if (PG_VERSION_NUM >= 100000)
    CITUS_RTE_NAMEDTUPLESTORE = RTE_NAMEDTUPLESTORE,  /* tuplestore, e.g. for triggers */
#endif
    CITUS_RTE_SHARD,                    /* Citus-specific: reference to a shard */
    CITUS_RTE_REMOTE_QUERY              /* Citus-specific: reference to a remote query */
} CitusRTEKind;

/* Enumeration that defines the partition type for a remote job */
typedef enum
{
    PARTITION_INVALID_FIRST = 0,
    RANGE_PARTITION_TYPE = 1,
    SINGLE_HASH_PARTITION_TYPE = 2,
    DUAL_HASH_PARTITION_TYPE = 3
} PartitionType;

/* Enumeration that defines different task types */
typedef enum
{
    TASK_TYPE_INVALID_FIRST = 0,
    SQL_TASK = 1,
    MAP_TASK = 2,
    MERGE_TASK = 3,
    MAP_OUTPUT_FETCH_TASK = 4,
    MERGE_FETCH_TASK = 5,
    MODIFY_TASK = 6,
    ROUTER_TASK = 7,
    DDL_TASK = 8,
    VACUUM_ANALYZE_TASK = 9
} TaskType;

/* Enumeration that defines the task assignment policy to use */
typedef enum
{
    TASK_ASSIGNMENT_INVALID_FIRST = 0,
    TASK_ASSIGNMENT_GREEDY = 1,
    TASK_ASSIGNMENT_ROUND_ROBIN = 2,
    TASK_ASSIGNMENT_FIRST_REPLICA = 3
} TaskAssignmentPolicyType;

/* Enumeration that defines different job types */
typedef enum
{
    JOB_INVALID_FIRST = 0,
    JOIN_MAP_MERGE_JOB = 1,
    SUBQUERY_MAP_MERGE_JOB = 2,
    TOP_LEVEL_WORKER_JOB = 3
} BoundaryNodeJobType;

/*
 * Job represents a logical unit of work that contains one set of data transfers
 * in our physical plan. The physical planner maps each SQL query into one or
 * more jobs depending on the query's complexity, and sets dependencies between
 * these jobs. Each job consists of multiple executable tasks; these tasks
 * operate either on base shards or on repartitioned tables.
 */
typedef struct Job
{
    CitusNode type;
    uint64 jobId;
    Query *jobQuery;
    List *taskList;
    List *dependedJobList;
    bool subqueryPushdown;
    bool requiresMasterEvaluation;  /* only applies to modify jobs */
    bool deferredPruning;
} Job;
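
/*
 * A minimal sketch of walking a job tree, assuming the standard pg_list.h
 * API: a Job depends on the jobs in dependedJobList, so callers typically
 * recurse over the dependencies. For example, counting every task in a job
 * tree could look like this:
 *
 *   static uint32
 *   JobTreeTaskCount(Job *job)
 *   {
 *       uint32 taskCount = list_length(job->taskList);
 *       ListCell *dependedJobCell = NULL;
 *
 *       foreach(dependedJobCell, job->dependedJobList)
 *       {
 *           Job *dependedJob = (Job *) lfirst(dependedJobCell);
 *           taskCount += JobTreeTaskCount(dependedJob);
 *       }
 *
 *       return taskCount;
 *   }
 */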

/* Defines a repartitioning job and holds additional related data. */
typedef struct MapMergeJob
{
    Job job;
    Query *reduceQuery;
    PartitionType partitionType;
    Var *partitionColumn;
    uint32 partitionCount;
    int sortedShardIntervalArrayLength;
    ShardInterval **sortedShardIntervalArray;  /* only applies to range partitioning */
    List *mapTaskList;
    List *mergeTaskList;
} MapMergeJob;
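
/*
 * A rough sketch of the fan-out a MapMergeJob implies: each map task writes
 * partitionCount partition files, and each merge task reads its partition
 * from every map task. Assuming the standard pg_list.h API, the number of
 * map output fetches is therefore:
 *
 *   static uint32
 *   MapOutputFetchCount(MapMergeJob *mapMergeJob)
 *   {
 *       uint32 mapTaskCount = list_length(mapMergeJob->mapTaskList);
 *       uint32 mergeTaskCount = list_length(mapMergeJob->mergeTaskList);
 *
 *       return mapTaskCount * mergeTaskCount;
 *   }
 */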

/*
 * Task represents an executable unit of work. We divide our tasks into compute
 * and data fetch task types: SQL, map, and merge tasks are compute tasks, while
 * map output fetch and merge fetch tasks are data fetch tasks. We also forward
 * declare the task execution struct here to avoid including the executor header
 * files.
 *
 * We currently do not take the replication model into account for tasks other
 * than modifications. When replicationModel is REPLICATION_MODEL_2PC, the
 * modification task is executed with two-phase commit. Set it to
 * REPLICATION_MODEL_INVALID if it is not relevant for the task.
 *
 * NB: Changing this struct also requires changing _outTask in citus_outfuncs
 * and _readTask in citus_readfuncs so that it is (de)serialized correctly.
 *
 * INSERT ... SELECT queries and modify queries with subqueries or multiple
 * tables set modifyWithSubquery to true. We use this flag to take the locks
 * necessary to get consistent results for the subqueries.
 */
typedef struct TaskExecution TaskExecution;

typedef struct Task
{
    CitusNode type;
    TaskType taskType;
    uint64 jobId;
    uint32 taskId;
    char *queryString;
    uint64 anchorShardId;       /* only applies to compute tasks */
    List *taskPlacementList;    /* only applies to compute tasks */
    List *dependedTaskList;     /* only applies to compute tasks */
    uint32 partitionId;
    uint32 upstreamTaskId;      /* only applies to data fetch tasks */
    ShardInterval *shardInterval;   /* only applies to merge tasks */
    bool assignmentConstrained;     /* only applies to merge tasks */
    TaskExecution *taskExecution;   /* used by task tracker executor */
    bool upsertQuery;           /* only applies to modify tasks */
    char replicationModel;      /* only applies to modify tasks */
    bool modifyWithSubquery;
    List *relationShardList;
    List *rowValuesLists;       /* rows to use when building multi-row INSERT */
} Task;
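
/*
 * A minimal sketch of constructing a compute task with CreateBasicTask()
 * (declared below); the jobId, taskId, shardId, placementList, and query
 * string here are made-up placeholders:
 *
 *   Task *task = CreateBasicTask(jobId, taskId, SQL_TASK,
 *                                "SELECT count(*) FROM lineitem_102008");
 *   task->anchorShardId = shardId;
 *   task->taskPlacementList = placementList;
 *
 * Fields that only apply to other task types (e.g. shardInterval for merge
 * tasks) are typically left as CreateBasicTask() initialized them.
 */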

/*
 * RangeTableFragment represents a fragment of a range table. This fragment
 * could be a regular shard or a merged table formed in a MapMerge job.
 */
typedef struct RangeTableFragment
{
    CitusRTEKind fragmentType;
    void *fragmentReference;
    uint32 rangeTableId;
} RangeTableFragment;

/*
 * JoinSequenceNode represents a range table in an ordered sequence of tables
 * joined together. This representation helps build combinations of all range
 * table fragments during task generation.
 */
typedef struct JoinSequenceNode
{
    uint32 rangeTableId;
    int32 joiningRangeTableId;
} JoinSequenceNode;
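
/*
 * As an illustration (the range table ids are made up): for a query that
 * joins range tables 1, 2, and 3 in that order, the join sequence could be
 * encoded as the array below. Each entry's joiningRangeTableId names the
 * earlier range table it joins against; NON_PRUNABLE_JOIN marks an entry
 * whose join does not allow fragment pruning, such as the first table in
 * the sequence:
 *
 *   JoinSequenceNode joinSequence[] = {
 *       { 1, NON_PRUNABLE_JOIN },
 *       { 2, 1 },
 *       { 3, 2 }
 *   };
 */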

/*
 * DistributedPlan contains all information necessary to execute a
 * distributed query.
 */
typedef struct DistributedPlan
{
    CitusNode type;

    /* unique identifier of the plan within the session */
    uint64 planId;

    /* type of command to execute (SELECT/INSERT/...) */
    CmdType operation;

    /* specifies whether a DML command has a RETURNING clause */
    bool hasReturning;

    /* job tree containing the tasks to be executed on workers */
    Job *workerJob;

    /* local query that merges results from the workers */
    Query *masterQuery;

    /* a router executable query is executed entirely on a worker */
    bool routerExecutable;

    /* which relations are accessed by this distributed plan */
    List *relationIdList;

    /* SELECT query in an INSERT ... SELECT via the coordinator */
    Query *insertSelectSubquery;

    /* target list of an INSERT ... SELECT via the coordinator */
    List *insertTargetList;

    /* target relation of an INSERT ... SELECT via the coordinator */
    Oid targetRelationId;

    /* list of subplans to execute before the distributed query */
    List *subPlanList;

    /*
     * NULL if this is a valid plan, an error description otherwise. This will
     * e.g. be set if SQL features are present that the planner doesn't support,
     * or if prepared statement parameters prevented successful planning.
     */
    DeferredErrorMessage *planningError;
} DistributedPlan;
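
/*
 * A minimal sketch of how a caller consumes a DistributedPlan, assuming
 * RaiseDeferredError() from distributed/errormessage.h is used to surface a
 * deferred planning failure:
 *
 *   if (distributedPlan->planningError != NULL)
 *   {
 *       RaiseDeferredError(distributedPlan->planningError, ERROR);
 *   }
 *
 * Otherwise, workerJob->taskList holds the tasks to run on the workers and
 * masterQuery combines their results on the coordinator.
 */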

/*
 * DistributedSubPlan contains a subplan of a distributed plan. Subplans are
 * executed before the distributed query and their results are written to
 * temporary files. This is used to execute CTEs and subquery joins that
 * cannot be distributed.
 */
typedef struct DistributedSubPlan
{
    CitusNode type;
    uint32 subPlanId;
    PlannedStmt *plan;
} DistributedSubPlan;

/* OperatorCacheEntry contains information for each element in OperatorCache */
typedef struct OperatorCacheEntry
{
    /* cache key consists of typeId, accessMethodId and strategyNumber */
    Oid typeId;
    Oid accessMethodId;
    int16 strategyNumber;
    Oid operatorId;
    Oid operatorClassInputType;
    char typeType;
} OperatorCacheEntry;

/* Config variables managed via guc.c */
extern int TaskAssignmentPolicy;
extern bool EnableUniqueJobIds;

/* Function declarations for building physical plans and constructing queries */
extern DistributedPlan * CreatePhysicalDistributedPlan(MultiTreeRoot *multiTree,
                                                        PlannerRestrictionContext *
                                                        plannerRestrictionContext);
extern Task * CreateBasicTask(uint64 jobId, uint32 taskId, TaskType taskType,
                              char *queryString);
extern OpExpr * MakeOpExpression(Var *variable, int16 strategyNumber);

/*
 * Function declarations for building and updating constraints, and for
 * checking simple operator expressions.
 */
extern Node * BuildBaseConstraint(Var *column);
extern void UpdateConstraint(Node *baseConstraint, ShardInterval *shardInterval);
extern bool SimpleOpExpression(Expr *clause);
extern bool OpExpressionContainsColumn(OpExpr *operatorExpression, Var *partitionColumn);
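
/*
 * A rough sketch of how these are combined (partitionColumn and shardInterval
 * are placeholders): BuildBaseConstraint() builds a "column >= min AND
 * column <= max" skeleton for the given column, and UpdateConstraint() fills
 * in the bounds of a concrete shard interval before the constraint is
 * compared against the query's restrictions:
 *
 *   Node *baseConstraint = BuildBaseConstraint(partitionColumn);
 *   UpdateConstraint(baseConstraint, shardInterval);
 *
 * SimpleOpExpression() and OpExpressionContainsColumn() check whether a
 * query clause is simple enough to be useful for this kind of pruning.
 */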

/* helper functions */
extern Var * MakeInt4Column(void);
extern Const * MakeInt4Constant(Datum constantValue);
extern int CompareShardPlacements(const void *leftElement, const void *rightElement);
extern bool ShardIntervalsOverlap(ShardInterval *firstInterval,
                                  ShardInterval *secondInterval);
extern bool CoPartitionedTables(Oid firstRelationId, Oid secondRelationId);
extern ShardInterval ** GenerateSyntheticShardIntervalArray(int partitionCount);

/* function declarations for Task and Task list operations */
extern bool TasksEqual(const Task *a, const Task *b);
extern List * TaskListAppendUnique(List *list, Task *task);
extern List * TaskListConcatUnique(List *list1, List *list2);
extern bool TaskListMember(const List *taskList, const Task *task);
extern List * TaskListDifference(const List *list1, const List *list2);
extern List * AssignAnchorShardTaskList(List *taskList);
extern List * FirstReplicaAssignTaskList(List *taskList);
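
/*
 * A small usage sketch (taskList and newTask are placeholders): the *Unique
 * helpers rely on TasksEqual(), which compares tasks by their identity rather
 * than by pointer equality, so appending the same logical task twice keeps a
 * single copy:
 *
 *   taskList = TaskListAppendUnique(taskList, newTask);
 *
 *   if (TaskListMember(taskList, newTask))
 *   {
 *       ... the task is in the list exactly once ...
 *   }
 */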

/* function declaration for creating a query pushdown SQL task list */
extern List * QueryPushdownSqlTaskList(Query *query, uint64 jobId,
                                       RelationRestrictionContext *
                                       relationRestrictionContext,
                                       List *prunedRelationShardList, TaskType taskType,
                                       bool modifyRequiresMasterEvaluation);

#endif   /* MULTI_PHYSICAL_PLANNER_H */