/*-------------------------------------------------------------------------
 *
 * multi_router_planner.c
 *
 * This file contains functions to plan multiple shard queries without any
 * aggregation step, including distributed table modifications.
 *
 * Copyright (c) Citus Data, Inc.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "distributed/pg_version_constants.h"

#include <stddef.h>

#include "access/stratnum.h"
#include "access/xact.h"
#include "catalog/pg_opfamily.h"
#include "catalog/pg_type.h"
#include "distributed/colocation_utils.h"
#include "distributed/citus_clauses.h"
#include "distributed/citus_nodes.h"
#include "distributed/citus_nodefuncs.h"
#include "distributed/deparse_shard_query.h"
#include "distributed/distribution_column.h"
#include "distributed/errormessage.h"
#include "distributed/log_utils.h"
#include "distributed/insert_select_planner.h"
#include "distributed/intermediate_results.h"
#include "distributed/intermediate_result_pruning.h"
#include "distributed/metadata_utility.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_join_order.h"
#include "distributed/multi_logical_planner.h"
#include "distributed/multi_logical_optimizer.h"
#include "distributed/multi_partitioning_utils.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/multi_router_planner.h"
#include "distributed/multi_server_executor.h"
#include "distributed/listutils.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/query_pushdown_planning.h"
#include "distributed/query_utils.h"
#include "distributed/reference_table_utils.h"
#include "distributed/relation_restriction_equivalence.h"
#include "distributed/relay_utility.h"
#include "distributed/resource_lock.h"
#include "distributed/shardinterval_utils.h"
#include "distributed/shard_pruning.h"
#include "executor/execdesc.h"
#include "lib/stringinfo.h"
#include "nodes/makefuncs.h"
#include "nodes/nodeFuncs.h"
#include "nodes/nodes.h"
#include "nodes/parsenodes.h"
#include "nodes/pg_list.h"
#include "nodes/primnodes.h"
#include "optimizer/clauses.h"
#include "optimizer/joininfo.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#if PG_VERSION_NUM >= PG_VERSION_12
#include "optimizer/optimizer.h"
#else
#include "optimizer/var.h"
#include "optimizer/predtest.h"
#endif
#include "optimizer/restrictinfo.h"
#include "parser/parsetree.h"
#include "parser/parse_oper.h"
#include "postmaster/postmaster.h"
#include "storage/lock.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/errcodes.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/typcache.h"
#include "catalog/pg_proc.h"
#include "optimizer/planmain.h"


/* intermediate value for INSERT processing */
typedef struct InsertValues
{
	Expr *partitionValueExpr;   /* partition value provided in INSERT row */
	List *rowValues;            /* full values list of INSERT row, possibly NIL */
	int64 shardId;              /* target shard for this row, possibly invalid */
	Index listIndex;            /* index to make our sorting stable */
} InsertValues;


/*
 * A ModifyRoute encapsulates the information needed to route modifications
 * to the appropriate shard. For a single-shard modification, only one route
 * is needed, but in the case of e.g. a multi-row INSERT, lists of these values
 * will help divide the rows by their destination shards, permitting later
 * shard-and-row-specific extension of the original SQL.
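 *
 * Illustrative example (shard ids and the value-to-shard mapping below are
 * hypothetical): a statement such as
 *   INSERT INTO dist_table VALUES (1, 'a'), (2, 'b'), (17, 'c');
 * might be split into two ModifyRoutes, one carrying the rows for keys 1 and
 * 17 and another carrying the row for key 2, depending on which shards those
 * partition values hash to.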
*/ typedef struct ModifyRoute { int64 shardId; /* identifier of target shard */ List *rowValuesLists; /* for multi-row INSERTs, list of rows to be inserted */ } ModifyRoute; typedef struct WalkerState { bool containsVar; bool varArgument; bool badCoalesce; } WalkerState; bool EnableRouterExecution = true; /* planner functions forward declarations */ static void CreateSingleTaskRouterSelectPlan(DistributedPlan *distributedPlan, Query *originalQuery, Query *query, PlannerRestrictionContext * plannerRestrictionContext); static Oid ResultRelationOidForQuery(Query *query); static bool IsTidColumn(Node *node); static DeferredErrorMessage * ModifyPartialQuerySupported(Query *queryTree, bool multiShardQuery, Oid *distributedTableId); static bool NodeIsFieldStore(Node *node); static DeferredErrorMessage * DeferErrorIfUnsupportedModifyQueryWithLocalTable( Query *query); static DeferredErrorMessage * DeferErrorIfUnsupportedModifyQueryWithCitusLocalTable( RTEListProperties *rteListProperties, Oid targetRelationId); static DeferredErrorMessage * DeferErrorIfUnsupportedModifyQueryWithPostgresLocalTable( RTEListProperties *rteListProperties, Oid targetRelationId); static DeferredErrorMessage * MultiShardUpdateDeleteSupported(Query *originalQuery, PlannerRestrictionContext * plannerRestrictionContext); static DeferredErrorMessage * SingleShardUpdateDeleteSupported(Query *originalQuery, PlannerRestrictionContext * plannerRestrictionContext); static bool HasDangerousJoinUsing(List *rtableList, Node *jtnode); static bool MasterIrreducibleExpression(Node *expression, bool *varArgument, bool *badCoalesce); static bool MasterIrreducibleExpressionWalker(Node *expression, WalkerState *state); static bool MasterIrreducibleExpressionFunctionChecker(Oid func_id, void *context); static bool TargetEntryChangesValue(TargetEntry *targetEntry, Var *column, FromExpr *joinTree); static Job * RouterInsertJob(Query *originalQuery); static void ErrorIfNoShardsExist(CitusTableCacheEntry *cacheEntry); static DeferredErrorMessage * DeferErrorIfModifyView(Query *queryTree); static Job * CreateJob(Query *query); static Task * CreateTask(TaskType taskType); static Job * RouterJob(Query *originalQuery, PlannerRestrictionContext *plannerRestrictionContext, DeferredErrorMessage **planningError); static bool RelationPrunesToMultipleShards(List *relationShardList); static void NormalizeMultiRowInsertTargetList(Query *query); static void AppendNextDummyColReference(Alias *expendedReferenceNames); static Value * MakeDummyColumnString(int dummyColumnId); static List * BuildRoutesForInsert(Query *query, DeferredErrorMessage **planningError); static List * GroupInsertValuesByShardId(List *insertValuesList); static List * ExtractInsertValuesList(Query *query, Var *partitionColumn); static DeferredErrorMessage * DeferErrorIfUnsupportedRouterPlannableSelectQuery( Query *query); static DeferredErrorMessage * ErrorIfQueryHasUnroutableModifyingCTE(Query *queryTree); static bool SelectsFromDistributedTable(List *rangeTableList, Query *query); static ShardPlacement * CreateDummyPlacement(bool hasLocalRelation); static ShardPlacement * CreateLocalDummyPlacement(); static List * get_all_actual_clauses(List *restrictinfo_list); static int CompareInsertValuesByShardId(const void *leftElement, const void *rightElement); static List * SingleShardTaskList(Query *query, uint64 jobId, List *relationShardList, List *placementList, uint64 shardId, bool parametersInQueryResolved); static bool RowLocksOnRelations(Node *node, List **rtiLockList); static void 
ReorderTaskPlacementsByTaskAssignmentPolicy(Job *job, TaskAssignmentPolicyType taskAssignmentPolicy, List *placementList); /* * CreateRouterPlan attempts to create a router executor plan for the given * SELECT statement. ->planningError is set if planning fails. */ DistributedPlan * CreateRouterPlan(Query *originalQuery, Query *query, PlannerRestrictionContext *plannerRestrictionContext) { DistributedPlan *distributedPlan = CitusMakeNode(DistributedPlan); distributedPlan->planningError = DeferErrorIfUnsupportedRouterPlannableSelectQuery( query); if (distributedPlan->planningError == NULL) { CreateSingleTaskRouterSelectPlan(distributedPlan, originalQuery, query, plannerRestrictionContext); } distributedPlan->fastPathRouterPlan = plannerRestrictionContext->fastPathRestrictionContext->fastPathRouterQuery; return distributedPlan; } /* * CreateModifyPlan attempts to create a plan for the given modification * statement. If planning fails ->planningError is set to a description of * the failure. */ DistributedPlan * CreateModifyPlan(Query *originalQuery, Query *query, PlannerRestrictionContext *plannerRestrictionContext) { Job *job = NULL; DistributedPlan *distributedPlan = CitusMakeNode(DistributedPlan); bool multiShardQuery = false; Assert(originalQuery->commandType != CMD_SELECT); distributedPlan->modLevel = RowModifyLevelForQuery(query); distributedPlan->planningError = ModifyQuerySupported(query, originalQuery, multiShardQuery, plannerRestrictionContext); if (distributedPlan->planningError != NULL) { return distributedPlan; } if (UpdateOrDeleteQuery(query)) { job = RouterJob(originalQuery, plannerRestrictionContext, &distributedPlan->planningError); } else { job = RouterInsertJob(originalQuery); } if (distributedPlan->planningError != NULL) { return distributedPlan; } ereport(DEBUG2, (errmsg("Creating router plan"))); distributedPlan->workerJob = job; distributedPlan->combineQuery = NULL; distributedPlan->expectResults = originalQuery->returningList != NIL; distributedPlan->targetRelationId = ResultRelationOidForQuery(query); distributedPlan->fastPathRouterPlan = plannerRestrictionContext->fastPathRestrictionContext->fastPathRouterQuery; return distributedPlan; } /* * CreateSingleTaskRouterPlan creates a physical plan for given SELECT query. * The returned plan is a router task that returns query results from a single worker. * If not router plannable, the returned plan's planningError describes the problem. */ static void CreateSingleTaskRouterSelectPlan(DistributedPlan *distributedPlan, Query *originalQuery, Query *query, PlannerRestrictionContext *plannerRestrictionContext) { Assert(query->commandType == CMD_SELECT); distributedPlan->modLevel = RowModifyLevelForQuery(query); Job *job = RouterJob(originalQuery, plannerRestrictionContext, &distributedPlan->planningError); if (distributedPlan->planningError != NULL) { /* query cannot be handled by this planner */ return; } ereport(DEBUG2, (errmsg("Creating router plan"))); distributedPlan->workerJob = job; distributedPlan->combineQuery = NULL; distributedPlan->expectResults = true; } /* * ShardIntervalOpExpressions returns a list of OpExprs with exactly two * items in it. The list consists of shard interval ranges with partition columns * such as (partitionColumn >= shardMinValue) and (partitionColumn <= shardMaxValue). * * The function returns hashed columns generated by MakeInt4Column() for the hash * partitioned tables in place of partition columns. 
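 *
 * Illustrative example (the shard range below is hypothetical): for a
 * hash-distributed table, a shard covering the hash range
 * [-2147483648, -1073741825] yields constraints of the form
 * (hashed_column >= -2147483648) and (hashed_column <= -1073741825), where
 * hashed_column stands for the synthetic int4 column built by MakeInt4Column().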
* * The function returns NIL if shard interval does not belong to a hash, * range and append distributed tables. * * NB: If you update this, also look at PrunableExpressionsWalker(). */ List * ShardIntervalOpExpressions(ShardInterval *shardInterval, Index rteIndex) { Oid relationId = shardInterval->relationId; Var *partitionColumn = NULL; if (IsCitusTableType(relationId, HASH_DISTRIBUTED)) { partitionColumn = MakeInt4Column(); } else if (IsCitusTableType(relationId, RANGE_DISTRIBUTED) || IsCitusTableType( relationId, APPEND_DISTRIBUTED)) { Assert(rteIndex > 0); partitionColumn = PartitionColumn(relationId, rteIndex); } else { /* do not add any shard range interval for reference tables */ return NIL; } /* build the base expression for constraint */ Node *baseConstraint = BuildBaseConstraint(partitionColumn); /* walk over shard list and check if shards can be pruned */ if (shardInterval->minValueExists && shardInterval->maxValueExists) { UpdateConstraint(baseConstraint, shardInterval); } return list_make1(baseConstraint); } /* * AddPartitionKeyNotNullFilterToSelect adds the following filters to a subquery: * * partitionColumn IS NOT NULL * * The function expects and asserts that subquery's target list contains a partition * column value. Thus, this function should never be called with reference tables. */ void AddPartitionKeyNotNullFilterToSelect(Query *subqery) { List *targetList = subqery->targetList; ListCell *targetEntryCell = NULL; Var *targetPartitionColumnVar = NULL; /* iterate through the target entries */ foreach(targetEntryCell, targetList) { TargetEntry *targetEntry = lfirst(targetEntryCell); if (IsPartitionColumn(targetEntry->expr, subqery) && IsA(targetEntry->expr, Var)) { targetPartitionColumnVar = (Var *) targetEntry->expr; break; } } /* we should have found target partition column */ Assert(targetPartitionColumnVar != NULL); /* create expression for partition_column IS NOT NULL */ NullTest *nullTest = makeNode(NullTest); nullTest->nulltesttype = IS_NOT_NULL; nullTest->arg = (Expr *) targetPartitionColumnVar; nullTest->argisrow = false; /* finally add the quals */ if (subqery->jointree->quals == NULL) { subqery->jointree->quals = (Node *) nullTest; } else { subqery->jointree->quals = make_and_qual(subqery->jointree->quals, (Node *) nullTest); } } /* * ExtractSelectRangeTableEntry returns the range table entry of the subquery. * Note that the function expects and asserts that the input query be * an INSERT...SELECT query. */ RangeTblEntry * ExtractSelectRangeTableEntry(Query *query) { Assert(InsertSelectIntoCitusTable(query) || InsertSelectIntoLocalTable(query)); /* * Since we already asserted InsertSelectIntoCitusTable() it is safe to access * both lists */ List *fromList = query->jointree->fromlist; RangeTblRef *reference = linitial(fromList); RangeTblEntry *subqueryRte = rt_fetch(reference->rtindex, query->rtable); return subqueryRte; } /* * ModifyQueryResultRelationId returns the result relation's Oid * for the given modification query. * * The function errors out if the input query is not a * modify query (e.g., INSERT, UPDATE or DELETE). So, this * function is not expected to be called on SELECT queries. 
*/ Oid ModifyQueryResultRelationId(Query *query) { /* only modify queries have result relations */ if (!IsModifyCommand(query)) { ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("input query is not a modification query"))); } RangeTblEntry *resultRte = ExtractResultRelationRTE(query); Assert(OidIsValid(resultRte->relid)); return resultRte->relid; } /* * ResultRelationOidForQuery returns the OID of the relation this is modified * by a given query. */ static Oid ResultRelationOidForQuery(Query *query) { RangeTblEntry *resultRTE = rt_fetch(query->resultRelation, query->rtable); return resultRTE->relid; } /* * ExtractResultRelationRTE returns the table's resultRelation range table * entry. This returns NULL when there's no resultRelation, such as in a SELECT * query. */ RangeTblEntry * ExtractResultRelationRTE(Query *query) { if (query->resultRelation > 0) { return rt_fetch(query->resultRelation, query->rtable); } return NULL; } /* * ExtractResultRelationRTEOrError returns the table's resultRelation range table * entry and errors out if there's no result relation at all, e.g. like in a * SELECT query. * * This is a separate function (instead of using missingOk), so static analysis * reasons about NULL returns correctly. */ RangeTblEntry * ExtractResultRelationRTEOrError(Query *query) { RangeTblEntry *relation = ExtractResultRelationRTE(query); if (relation == NULL) { ereport(ERROR, (errmsg("no result relation could be found for the query"), errhint("is this a SELECT query?"))); } return relation; } /* * IsTidColumn gets a node and returns true if the node is a Var type of TID. */ static bool IsTidColumn(Node *node) { if (IsA(node, Var)) { Var *column = (Var *) node; if (column->vartype == TIDOID) { return true; } } return false; } /* * ModifyPartialQuerySupported implements a subset of what ModifyQuerySupported checks, * that subset being what's necessary to check modifying CTEs for. */ static DeferredErrorMessage * ModifyPartialQuerySupported(Query *queryTree, bool multiShardQuery, Oid *distributedTableIdOutput) { DeferredErrorMessage *deferredError = DeferErrorIfModifyView(queryTree); if (deferredError != NULL) { return deferredError; } deferredError = DeferErrorIfUnsupportedModifyQueryWithLocalTable(queryTree); if (deferredError != NULL) { return deferredError; } /* * Reject subqueries which are in SELECT or WHERE clause. * Queries which include subqueries in FROM clauses are rejected below. */ if (queryTree->hasSubLinks == true) { /* we support subqueries for INSERTs only via INSERT INTO ... SELECT */ if (!UpdateOrDeleteQuery(queryTree)) { Assert(queryTree->commandType == CMD_INSERT); return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "subqueries are not supported within INSERT queries", NULL, "Try rewriting your queries with 'INSERT " "INTO ... SELECT' syntax."); } } /* reject queries which include CommonTableExpr which aren't routable */ if (queryTree->cteList != NIL) { ListCell *cteCell = NULL; foreach(cteCell, queryTree->cteList) { CommonTableExpr *cte = (CommonTableExpr *) lfirst(cteCell); Query *cteQuery = (Query *) cte->ctequery; if (cteQuery->commandType != CMD_SELECT) { /* Modifying CTEs still not supported for INSERTs & multi shard queries. 
*/ if (queryTree->commandType == CMD_INSERT) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "Router planner doesn't support non-select common table expressions with non-select queries.", NULL, NULL); } if (multiShardQuery) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "Router planner doesn't support non-select common table expressions with multi shard queries.", NULL, NULL); } } /* Modifying CTEs exclude both INSERT CTEs & INSERT queries. */ if (cteQuery->commandType == CMD_INSERT) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "Router planner doesn't support INSERT common table expressions.", NULL, NULL); } if (cteQuery->hasForUpdate && FindNodeMatchingCheckFunctionInRangeTableList(cteQuery->rtable, IsReferenceTableRTE)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "Router planner doesn't support SELECT FOR UPDATE" " in common table expressions involving reference tables.", NULL, NULL); } if (FindNodeMatchingCheckFunction((Node *) cteQuery, CitusIsVolatileFunction)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "Router planner doesn't support VOLATILE functions" " in common table expressions.", NULL, NULL); } if (cteQuery->commandType == CMD_SELECT) { DeferredErrorMessage *cteError = DeferErrorIfUnsupportedRouterPlannableSelectQuery(cteQuery); if (cteError) { return cteError; } } } } Oid distributedTableId = ModifyQueryResultRelationId(queryTree); uint32 rangeTableId = 1; Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId); CmdType commandType = queryTree->commandType; if (commandType == CMD_INSERT || commandType == CMD_UPDATE || commandType == CMD_DELETE) { bool hasVarArgument = false; /* A STABLE function is passed a Var argument */ bool hasBadCoalesce = false; /* CASE/COALESCE passed a mutable function */ FromExpr *joinTree = queryTree->jointree; ListCell *targetEntryCell = NULL; foreach(targetEntryCell, queryTree->targetList) { TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell); bool targetEntryPartitionColumn = false; /* reference tables do not have partition column */ if (partitionColumn == NULL) { targetEntryPartitionColumn = false; } else if (targetEntry->resno == partitionColumn->varattno) { targetEntryPartitionColumn = true; } /* skip resjunk entries: UPDATE adds some for ctid, etc. 
*/ if (targetEntry->resjunk) { continue; } if (commandType == CMD_UPDATE && FindNodeMatchingCheckFunction((Node *) targetEntry->expr, CitusIsVolatileFunction)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "functions used in UPDATE queries on distributed " "tables must not be VOLATILE", NULL, NULL); } if (commandType == CMD_UPDATE && targetEntryPartitionColumn && TargetEntryChangesValue(targetEntry, partitionColumn, queryTree->jointree)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "modifying the partition value of rows is not " "allowed", NULL, NULL); } if (commandType == CMD_UPDATE && MasterIrreducibleExpression((Node *) targetEntry->expr, &hasVarArgument, &hasBadCoalesce)) { Assert(hasVarArgument || hasBadCoalesce); } if (FindNodeMatchingCheckFunction((Node *) targetEntry->expr, NodeIsFieldStore)) { /* DELETE cannot do field indirection already */ Assert(commandType == CMD_UPDATE || commandType == CMD_INSERT); return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "inserting or modifying composite type fields is not " "supported", NULL, "Use the column name to insert or update the composite " "type as a single value"); } } if (joinTree != NULL) { if (FindNodeMatchingCheckFunction((Node *) joinTree->quals, CitusIsVolatileFunction)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "functions used in the WHERE clause of modification " "queries on distributed tables must not be VOLATILE", NULL, NULL); } else if (MasterIrreducibleExpression(joinTree->quals, &hasVarArgument, &hasBadCoalesce)) { Assert(hasVarArgument || hasBadCoalesce); } } if (hasVarArgument) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "STABLE functions used in UPDATE queries " "cannot be called with column references", NULL, NULL); } if (hasBadCoalesce) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "non-IMMUTABLE functions are not allowed in CASE or " "COALESCE statements", NULL, NULL); } if (contain_mutable_functions((Node *) queryTree->returningList)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "non-IMMUTABLE functions are not allowed in the " "RETURNING clause", NULL, NULL); } if (queryTree->jointree->quals != NULL && nodeTag(queryTree->jointree->quals) == T_CurrentOfExpr) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot run DML queries with cursors", NULL, NULL); } } deferredError = ErrorIfOnConflictNotSupported(queryTree); if (deferredError != NULL) { return deferredError; } /* set it for caller to use when we don't return any errors */ *distributedTableIdOutput = distributedTableId; return NULL; } /* * NodeIsFieldStore returns true if given Node is a FieldStore object. */ static bool NodeIsFieldStore(Node *node) { return node && IsA(node, FieldStore); } /* * DeferErrorIfUnsupportedModifyQueryWithLocalTable returns DeferredErrorMessage * for unsupported modify queries that cannot be planned by router planner due to * unsupported usage of postgres local or citus local tables. 
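 *
 * Illustrative example (table names are hypothetical): an UPDATE of a
 * distributed table that joins in a plain Postgres table, such as
 *   UPDATE dist_table SET value = l.value FROM local_table l WHERE ...;
 * receives a deferred error here rather than being router planned.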
*/ static DeferredErrorMessage * DeferErrorIfUnsupportedModifyQueryWithLocalTable(Query *query) { RTEListProperties *rteListProperties = GetRTEListPropertiesForQuery(query); Oid targetRelationId = ModifyQueryResultRelationId(query); DeferredErrorMessage *deferredErrorMessage = DeferErrorIfUnsupportedModifyQueryWithCitusLocalTable(rteListProperties, targetRelationId); if (deferredErrorMessage) { return deferredErrorMessage; } deferredErrorMessage = DeferErrorIfUnsupportedModifyQueryWithPostgresLocalTable( rteListProperties, targetRelationId); return deferredErrorMessage; } /* * DeferErrorIfUnsupportedModifyQueryWithCitusLocalTable is a helper function * that takes RTEListProperties & targetRelationId and returns deferred error * if query is not supported due to unsupported usage of citus local tables. */ static DeferredErrorMessage * DeferErrorIfUnsupportedModifyQueryWithCitusLocalTable( RTEListProperties *rteListProperties, Oid targetRelationId) { if (rteListProperties->hasDistributedTable && rteListProperties->hasCitusLocalTable) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot plan modifications with citus local tables and " "distributed tables", NULL, LOCAL_TABLE_SUBQUERY_CTE_HINT); } if (IsCitusTableType(targetRelationId, REFERENCE_TABLE) && rteListProperties->hasCitusLocalTable) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot plan modifications of reference tables with citus " "local tables", NULL, LOCAL_TABLE_SUBQUERY_CTE_HINT); } return NULL; } /* * DeferErrorIfUnsupportedModifyQueryWithPostgresLocalTable is a helper * function that takes RTEListProperties & targetRelationId and returns * deferred error if query is not supported due to unsupported usage of * postgres local tables. */ static DeferredErrorMessage * DeferErrorIfUnsupportedModifyQueryWithPostgresLocalTable( RTEListProperties *rteListProperties, Oid targetRelationId) { if (rteListProperties->hasPostgresLocalTable && rteListProperties->hasCitusTable) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot plan modifications with local tables involving " "citus tables", NULL, LOCAL_TABLE_SUBQUERY_CTE_HINT); } if (!IsCitusTable(targetRelationId)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot plan modifications of local tables involving " "distributed tables", NULL, NULL); } return NULL; } /* * ModifyQuerySupported returns NULL if the query only contains supported * features, otherwise it returns an error description. * Note that we need both the original query and the modified one because * different checks need different versions. In particular, we cannot * perform the ContainsReadIntermediateResultFunction check on the * rewritten query because it may have been replaced by a subplan, * while some of the checks for setting the partition column value rely * on the rewritten query. */ DeferredErrorMessage * ModifyQuerySupported(Query *queryTree, Query *originalQuery, bool multiShardQuery, PlannerRestrictionContext *plannerRestrictionContext) { Oid distributedTableId = InvalidOid; DeferredErrorMessage *error = ModifyPartialQuerySupported(queryTree, multiShardQuery, &distributedTableId); if (error) { return error; } List *rangeTableList = NIL; uint32 queryTableCount = 0; CmdType commandType = queryTree->commandType; bool fastPathRouterQuery = plannerRestrictionContext->fastPathRestrictionContext->fastPathRouterQuery; /* * Here, we check if a recursively planned query tries to modify * rows based on the ctid column. 
This is a bad idea because ctid of * the rows could be changed before the modification part of * the query is executed. * * We can exclude fast path queries since they cannot have intermediate * results by definition. */ if (!fastPathRouterQuery && ContainsReadIntermediateResultFunction((Node *) originalQuery)) { bool hasTidColumn = FindNodeMatchingCheckFunction( (Node *) originalQuery->jointree, IsTidColumn); if (hasTidColumn) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot perform distributed planning for the given " "modification", "Recursively planned distributed modifications " "with ctid on where clause are not supported.", NULL); } } /* * Extract range table entries for queries that are not fast path. We can skip fast * path queries because their definition is a single RTE entry, which is a relation, * so the following check doesn't apply for fast-path queries. */ if (!fastPathRouterQuery) { ExtractRangeTableEntryWalker((Node *) originalQuery, &rangeTableList); } RangeTblEntry *rangeTableEntry = NULL; foreach_ptr(rangeTableEntry, rangeTableList) { if (rangeTableEntry->rtekind == RTE_RELATION) { /* we do not expect to see a view in modify query */ if (rangeTableEntry->relkind == RELKIND_VIEW) { /* * we already check if modify is run on a view in DeferErrorIfModifyView * function call. In addition, since Postgres replaced views in FROM * clause with subqueries, encountering with a view should not be a problem here. */ } else if (rangeTableEntry->relkind == RELKIND_MATVIEW) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "materialized views in modify queries are not supported", NULL, NULL); } /* for other kinds of relations, check if its distributed */ else { Oid relationId = rangeTableEntry->relid; if (!IsCitusTable(relationId)) { StringInfo errorMessage = makeStringInfo(); char *relationName = get_rel_name(rangeTableEntry->relid); appendStringInfo(errorMessage, "relation %s is not distributed", relationName); return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, errorMessage->data, NULL, NULL); } } queryTableCount++; } else if (rangeTableEntry->rtekind == RTE_VALUES #if PG_VERSION_NUM >= PG_VERSION_12 || rangeTableEntry->rtekind == RTE_RESULT #endif ) { /* do nothing, this type is supported */ } else { char *rangeTableEntryErrorDetail = NULL; /* * We support UPDATE and DELETE with subqueries and joins unless * they are multi shard queries. */ if (UpdateOrDeleteQuery(queryTree)) { continue; } /* * Error out for rangeTableEntries that we do not support. * We do not explicitly specify "in FROM clause" in the error detail * for the features that we do not support at all (SUBQUERY, JOIN). 
			 */
			if (rangeTableEntry->rtekind == RTE_SUBQUERY)
			{
				StringInfo errorHint = makeStringInfo();
				CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(
					distributedTableId);
				char *partitionKeyString = cacheEntry->partitionKeyString;
				char *partitionColumnName = ColumnToColumnName(distributedTableId,
															   partitionKeyString);

				appendStringInfo(errorHint, "Consider using an equality filter on "
											"partition column \"%s\" to target a single shard.",
								 partitionColumnName);

				return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
									 "subqueries are not "
									 "supported in modifications across multiple shards",
									 errorHint->data, NULL);
			}
			else if (rangeTableEntry->rtekind == RTE_JOIN)
			{
				rangeTableEntryErrorDetail = "Joins are not supported in distributed"
											 " modifications.";
			}
			else if (rangeTableEntry->rtekind == RTE_FUNCTION)
			{
				rangeTableEntryErrorDetail = "Functions must not appear in the FROM"
											 " clause of a distributed modification.";
			}
			else if (rangeTableEntry->rtekind == RTE_CTE)
			{
				rangeTableEntryErrorDetail = "Common table expressions are not supported"
											 " in distributed modifications.";
			}
			else
			{
				rangeTableEntryErrorDetail = "Unrecognized range table entry.";
			}

			return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
								 "cannot perform distributed planning for the given "
								 "modifications",
								 rangeTableEntryErrorDetail,
								 NULL);
		}
	}

	if (commandType != CMD_INSERT)
	{
		DeferredErrorMessage *errorMessage = NULL;

		if (multiShardQuery)
		{
			errorMessage = MultiShardUpdateDeleteSupported(originalQuery,
														   plannerRestrictionContext);
		}
		else
		{
			errorMessage = SingleShardUpdateDeleteSupported(originalQuery,
															plannerRestrictionContext);
		}

		if (errorMessage != NULL)
		{
			return errorMessage;
		}
	}

	return NULL;
}


/*
 * DeferErrorIfModifyView returns a deferred error if the modify statement
 * targets a view; modify statements on simple updatable views are not
 * supported yet.
 *
 * We need the original query (the query before postgres' pg_rewrite_query)
 * to detect whether the view sitting in rtable is being updated or is merely
 * used in the FROM clause. Tracing the postgres source code, we deduced that
 * postgres puts the relation to be modified into the first entry of rtable.
 * So if the first element of the range table list is a simple updatable view
 * and that view does not come from the FROM clause (inFromCl = false), then
 * the update runs "on" that view.
 */
static DeferredErrorMessage *
DeferErrorIfModifyView(Query *queryTree)
{
	if (queryTree->rtable != NIL)
	{
		RangeTblEntry *firstRangeTableElement = (RangeTblEntry *) linitial(
			queryTree->rtable);

		if (firstRangeTableElement->rtekind == RTE_RELATION &&
			firstRangeTableElement->relkind == RELKIND_VIEW &&
			firstRangeTableElement->inFromCl == false)
		{
			return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED,
								 "cannot modify views over distributed tables",
								 NULL, NULL);
		}
	}

	return NULL;
}


/*
 * ErrorIfOnConflictNotSupported returns an error if an INSERT query has an
 * unsupported ON CONFLICT clause. In particular, changing the partition
 * column value or using volatile functions is not allowed.
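 *
 * Illustrative example (column names are hypothetical):
 *   ... ON CONFLICT (key) DO UPDATE SET value = EXCLUDED.value
 * is accepted, whereas assigning a new expression to the partition column or
 * using something like DO UPDATE SET value = random() is rejected here.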
*/ DeferredErrorMessage * ErrorIfOnConflictNotSupported(Query *queryTree) { uint32 rangeTableId = 1; ListCell *setTargetCell = NULL; bool specifiesPartitionValue = false; CmdType commandType = queryTree->commandType; if (commandType != CMD_INSERT || queryTree->onConflict == NULL) { return NULL; } Oid distributedTableId = ExtractFirstCitusTableId(queryTree); Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId); List *onConflictSet = queryTree->onConflict->onConflictSet; Node *arbiterWhere = queryTree->onConflict->arbiterWhere; Node *onConflictWhere = queryTree->onConflict->onConflictWhere; /* * onConflictSet is expanded via expand_targetlist() on the standard planner. * This ends up adding all the columns to the onConflictSet even if the user * does not explicitly state the columns in the query. * * The following loop simply allows "DO UPDATE SET part_col = table.part_col" * types of elements in the target list, which are added by expand_targetlist(). * Any other attempt to update partition column value is forbidden. */ foreach(setTargetCell, onConflictSet) { TargetEntry *setTargetEntry = (TargetEntry *) lfirst(setTargetCell); bool setTargetEntryPartitionColumn = false; /* reference tables do not have partition column */ if (partitionColumn == NULL) { setTargetEntryPartitionColumn = false; } else if (setTargetEntry->resno == partitionColumn->varattno) { setTargetEntryPartitionColumn = true; } if (setTargetEntryPartitionColumn) { Expr *setExpr = setTargetEntry->expr; if (IsA(setExpr, Var) && ((Var *) setExpr)->varattno == partitionColumn->varattno) { specifiesPartitionValue = false; } else { specifiesPartitionValue = true; } } else { /* * Similarly, allow "DO UPDATE SET col_1 = table.col_1" types of * target list elements. Note that, the following check allows * "DO UPDATE SET col_1 = table.col_2", which is not harmful. */ if (IsA(setTargetEntry->expr, Var)) { continue; } else if (contain_mutable_functions((Node *) setTargetEntry->expr)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "functions used in the DO UPDATE SET clause of " "INSERTs on distributed tables must be marked " "IMMUTABLE", NULL, NULL); } } } /* error if either arbiter or on conflict WHERE contains a mutable function */ if (contain_mutable_functions((Node *) arbiterWhere) || contain_mutable_functions((Node *) onConflictWhere)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "functions used in the WHERE clause of the " "ON CONFLICT clause of INSERTs on distributed " "tables must be marked IMMUTABLE", NULL, NULL); } if (specifiesPartitionValue) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "modifying the partition value of rows is not " "allowed", NULL, NULL); } return NULL; } /* * MultiShardUpdateDeleteSupported returns the error message if the update/delete is * not pushdownable, otherwise it returns NULL. 
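 *
 * Illustrative example (table names are hypothetical): a multi-shard statement
 * such as UPDATE dist_table SET value = 0 WHERE value < 10 can typically be
 * pushed down, while the same statement containing a VOLATILE function such as
 * random() is rejected below.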
*/ static DeferredErrorMessage * MultiShardUpdateDeleteSupported(Query *originalQuery, PlannerRestrictionContext *plannerRestrictionContext) { DeferredErrorMessage *errorMessage = NULL; RangeTblEntry *resultRangeTable = ExtractResultRelationRTE(originalQuery); Oid resultRelationOid = resultRangeTable->relid; if (HasDangerousJoinUsing(originalQuery->rtable, (Node *) originalQuery->jointree)) { errorMessage = DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "a join with USING causes an internal naming conflict, use " "ON instead", NULL, NULL); } else if (FindNodeMatchingCheckFunction((Node *) originalQuery, CitusIsVolatileFunction)) { errorMessage = DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "functions used in UPDATE queries on distributed " "tables must not be VOLATILE", NULL, NULL); } else if (IsCitusTableType(resultRelationOid, REFERENCE_TABLE)) { errorMessage = DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "only reference tables may be queried when targeting " "a reference table with multi shard UPDATE/DELETE queries " "with multiple tables ", NULL, NULL); } else { errorMessage = DeferErrorIfUnsupportedSubqueryPushdown(originalQuery, plannerRestrictionContext); } return errorMessage; } /* * SingleShardUpdateDeleteSupported returns the error message if the update/delete query is * not routable, otherwise it returns NULL. */ static DeferredErrorMessage * SingleShardUpdateDeleteSupported(Query *originalQuery, PlannerRestrictionContext *plannerRestrictionContext) { DeferredErrorMessage *errorMessage = NULL; /* * We currently do not support volatile functions in update/delete statements because * the function evaluation logic does not know how to distinguish volatile functions * (that need to be evaluated per row) from stable functions (that need to be evaluated per query), * and it is also not safe to push the volatile functions down on replicated tables. */ if (FindNodeMatchingCheckFunction((Node *) originalQuery, CitusIsVolatileFunction)) { errorMessage = DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "functions used in UPDATE queries on distributed " "tables must not be VOLATILE", NULL, NULL); } return errorMessage; } /* * HasDangerousJoinUsing search jointree for unnamed JOIN USING. Check the * implementation of has_dangerous_join_using in ruleutils. */ static bool HasDangerousJoinUsing(List *rtableList, Node *joinTreeNode) { if (IsA(joinTreeNode, RangeTblRef)) { /* nothing to do here */ } else if (IsA(joinTreeNode, FromExpr)) { FromExpr *fromExpr = (FromExpr *) joinTreeNode; ListCell *listCell; foreach(listCell, fromExpr->fromlist) { if (HasDangerousJoinUsing(rtableList, (Node *) lfirst(listCell))) { return true; } } } else if (IsA(joinTreeNode, JoinExpr)) { JoinExpr *joinExpr = (JoinExpr *) joinTreeNode; /* Is it an unnamed JOIN with USING? */ if (joinExpr->alias == NULL && joinExpr->usingClause) { /* * Yes, so check each join alias var to see if any of them are not * simple references to underlying columns. If so, we have a * dangerous situation and must pick unique aliases. 
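			 *
			 * (Illustrative: in a FULL JOIN ... USING (col), the merged column is
			 * represented as a COALESCE over the two input columns rather than a
			 * plain Var, which is exactly the case this loop detects.)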
*/ RangeTblEntry *joinRTE = rt_fetch(joinExpr->rtindex, rtableList); ListCell *listCell; foreach(listCell, joinRTE->joinaliasvars) { Var *aliasVar = (Var *) lfirst(listCell); if (aliasVar != NULL && !IsA(aliasVar, Var)) { return true; } } } /* Nope, but inspect children */ if (HasDangerousJoinUsing(rtableList, joinExpr->larg)) { return true; } if (HasDangerousJoinUsing(rtableList, joinExpr->rarg)) { return true; } } else { elog(ERROR, "unrecognized node type: %d", (int) nodeTag(joinTreeNode)); } return false; } /* * UpdateOrDeleteQuery checks if the given query is an UPDATE or DELETE command. * If it is, it returns true otherwise it returns false. */ bool UpdateOrDeleteQuery(Query *query) { return query->commandType == CMD_UPDATE || query->commandType == CMD_DELETE; } /* * If the expression contains STABLE functions which accept any parameters derived from a * Var returns true and sets varArgument. * * If the expression contains a CASE or COALESCE which invoke non-IMMUTABLE functions * returns true and sets badCoalesce. * * Assumes the expression contains no VOLATILE functions. * * Var's are allowed, but only if they are passed solely to IMMUTABLE functions * * We special-case CASE/COALESCE because those are evaluated lazily. We could evaluate * CASE/COALESCE expressions which don't reference Vars, or partially evaluate some * which do, but for now we just error out. That makes both the code and user-education * easier. */ static bool MasterIrreducibleExpression(Node *expression, bool *varArgument, bool *badCoalesce) { WalkerState data; data.containsVar = data.varArgument = data.badCoalesce = false; bool result = MasterIrreducibleExpressionWalker(expression, &data); *varArgument |= data.varArgument; *badCoalesce |= data.badCoalesce; return result; } static bool MasterIrreducibleExpressionWalker(Node *expression, WalkerState *state) { char volatileFlag = 0; WalkerState childState = { false, false, false }; bool containsDisallowedFunction = false; bool hasVolatileFunction PG_USED_FOR_ASSERTS_ONLY = false; if (expression == NULL) { return false; } if (IsA(expression, CoalesceExpr)) { CoalesceExpr *expr = (CoalesceExpr *) expression; if (contain_mutable_functions((Node *) (expr->args))) { state->badCoalesce = true; return true; } else { /* * There's no need to recurse. Since there are no STABLE functions * varArgument will never be set. */ return false; } } if (IsA(expression, CaseExpr)) { if (contain_mutable_functions(expression)) { state->badCoalesce = true; return true; } return false; } if (IsA(expression, Var)) { state->containsVar = true; return false; } /* * In order for statement replication to give us consistent results it's important * that we either disallow or evaluate on the coordinator anything which has a * volatility category above IMMUTABLE. Newer versions of postgres might add node * types which should be checked in this function. * * Look through contain_mutable_functions_walker or future PG's equivalent for new * node types before bumping this version number to fix compilation; e.g. for any * PostgreSQL after 9.5, see check_functions_in_node. Review * MasterIrreducibleExpressionFunctionChecker for any changes in volatility * permissibility ordering. * * Once you've added them to this check, make sure you also evaluate them in the * executor! 
*/ hasVolatileFunction = check_functions_in_node(expression, MasterIrreducibleExpressionFunctionChecker, &volatileFlag); /* the caller should have already checked for this */ Assert(!hasVolatileFunction); Assert(volatileFlag != PROVOLATILE_VOLATILE); if (volatileFlag == PROVOLATILE_STABLE) { containsDisallowedFunction = expression_tree_walker(expression, MasterIrreducibleExpressionWalker, &childState); if (childState.containsVar) { state->varArgument = true; } state->badCoalesce |= childState.badCoalesce; state->varArgument |= childState.varArgument; return (containsDisallowedFunction || childState.containsVar); } /* keep traversing */ return expression_tree_walker(expression, MasterIrreducibleExpressionWalker, state); } /* * MasterIrreducibleExpressionFunctionChecker returns true if a provided function * oid corresponds to a volatile function. It also updates provided context if * the current volatility flag is more permissive than the provided one. It is * only called from check_functions_in_node as checker function. */ static bool MasterIrreducibleExpressionFunctionChecker(Oid func_id, void *context) { char volatileFlag = func_volatile(func_id); char *volatileContext = (char *) context; if (volatileFlag == PROVOLATILE_VOLATILE || *volatileContext == PROVOLATILE_VOLATILE) { *volatileContext = PROVOLATILE_VOLATILE; } else if (volatileFlag == PROVOLATILE_STABLE || *volatileContext == PROVOLATILE_STABLE) { *volatileContext = PROVOLATILE_STABLE; } else { *volatileContext = PROVOLATILE_IMMUTABLE; } return (volatileFlag == PROVOLATILE_VOLATILE); } /* * TargetEntryChangesValue determines whether the given target entry may * change the value in a given column, given a join tree. The result is * true unless the expression refers directly to the column, or the * expression is a value that is implied by the qualifiers of the join * tree, or the target entry sets a different column. */ static bool TargetEntryChangesValue(TargetEntry *targetEntry, Var *column, FromExpr *joinTree) { bool isColumnValueChanged = true; Expr *setExpr = targetEntry->expr; if (targetEntry->resno != column->varattno) { /* target entry of the form SET some_other_col = */ isColumnValueChanged = false; } else if (IsA(setExpr, Var)) { Var *newValue = (Var *) setExpr; if (newValue->varattno == column->varattno) { /* target entry of the form SET col = table.col */ isColumnValueChanged = false; } } else if (IsA(setExpr, Const)) { Const *newValue = (Const *) setExpr; List *restrictClauseList = WhereClauseList(joinTree); OpExpr *equalityExpr = MakeOpExpression(column, BTEqualStrategyNumber); Node *rightOp = get_rightop((Expr *) equalityExpr); Assert(rightOp != NULL); Assert(IsA(rightOp, Const)); Const *rightConst = (Const *) rightOp; rightConst->constvalue = newValue->constvalue; rightConst->constisnull = newValue->constisnull; rightConst->constbyval = newValue->constbyval; bool predicateIsImplied = predicate_implied_by(list_make1(equalityExpr), restrictClauseList, false); if (predicateIsImplied) { /* target entry of the form SET col = WHERE col = AND ... */ isColumnValueChanged = false; } } return isColumnValueChanged; } /* * RouterInsertJob builds a Job to represent an insertion performed by the provided * query. For inserts we always defer shard pruning and generating the task list to * the executor. 
*/ static Job * RouterInsertJob(Query *originalQuery) { Assert(originalQuery->commandType == CMD_INSERT); bool isMultiRowInsert = IsMultiRowInsert(originalQuery); if (isMultiRowInsert) { /* add default expressions to RTE_VALUES in multi-row INSERTs */ NormalizeMultiRowInsertTargetList(originalQuery); } Job *job = CreateJob(originalQuery); job->requiresCoordinatorEvaluation = RequiresCoordinatorEvaluation(originalQuery); job->deferredPruning = true; job->partitionKeyValue = ExtractInsertPartitionKeyValue(originalQuery); return job; } /* * CreateJob returns a new Job for the given query. */ static Job * CreateJob(Query *query) { Job *job = CitusMakeNode(Job); job->jobId = UniqueJobId(); job->jobQuery = query; job->taskList = NIL; job->dependentJobList = NIL; job->subqueryPushdown = false; job->requiresCoordinatorEvaluation = false; job->deferredPruning = false; return job; } /* * ErrorIfNoShardsExist throws an error if the given table has no shards. */ static void ErrorIfNoShardsExist(CitusTableCacheEntry *cacheEntry) { int shardCount = cacheEntry->shardIntervalArrayLength; if (shardCount == 0) { Oid distributedTableId = cacheEntry->relationId; char *relationName = get_rel_name(distributedTableId); ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not find any shards"), errdetail("No shards exist for distributed table \"%s\".", relationName), errhint("Run master_create_worker_shards to create shards " "and try again."))); } } /* * RouterInsertTaskList generates a list of tasks for performing an INSERT on * a distributed table via the router executor. */ List * RouterInsertTaskList(Query *query, bool parametersInQueryResolved, DeferredErrorMessage **planningError) { List *insertTaskList = NIL; Oid distributedTableId = ExtractFirstCitusTableId(query); CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(distributedTableId); ErrorIfNoShardsExist(cacheEntry); Assert(query->commandType == CMD_INSERT); List *modifyRouteList = BuildRoutesForInsert(query, planningError); if (*planningError != NULL) { return NIL; } ModifyRoute *modifyRoute = NULL; foreach_ptr(modifyRoute, modifyRouteList) { Task *modifyTask = CreateTask(MODIFY_TASK); modifyTask->anchorShardId = modifyRoute->shardId; modifyTask->replicationModel = cacheEntry->replicationModel; modifyTask->rowValuesLists = modifyRoute->rowValuesLists; RelationShard *relationShard = CitusMakeNode(RelationShard); relationShard->shardId = modifyRoute->shardId; relationShard->relationId = distributedTableId; modifyTask->relationShardList = list_make1(relationShard); modifyTask->taskPlacementList = ShardPlacementList(modifyRoute->shardId); modifyTask->parametersInQueryStringResolved = parametersInQueryResolved; insertTaskList = lappend(insertTaskList, modifyTask); } return insertTaskList; } /* * CreateTask returns a new Task with the given type. 
*/ static Task * CreateTask(TaskType taskType) { Task *task = CitusMakeNode(Task); task->taskType = taskType; task->jobId = INVALID_JOB_ID; task->taskId = INVALID_TASK_ID; SetTaskQueryString(task, NULL); task->anchorShardId = INVALID_SHARD_ID; task->taskPlacementList = NIL; task->dependentTaskList = NIL; task->partitionId = 0; task->upstreamTaskId = INVALID_TASK_ID; task->shardInterval = NULL; task->assignmentConstrained = false; task->replicationModel = REPLICATION_MODEL_INVALID; task->relationRowLockList = NIL; task->modifyWithSubquery = false; task->partiallyLocalOrRemote = false; task->relationShardList = NIL; return task; } /* * ExtractFirstCitusTableId takes a given query, and finds the relationId * for the first distributed table in that query. If the function cannot find a * distributed table, it returns InvalidOid. * * We only use this function for modifications and fast path queries, which * should have the first distributed table in the top-level rtable. */ Oid ExtractFirstCitusTableId(Query *query) { List *rangeTableList = query->rtable; ListCell *rangeTableCell = NULL; Oid distributedTableId = InvalidOid; foreach(rangeTableCell, rangeTableList) { RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell); if (IsCitusTable(rangeTableEntry->relid)) { distributedTableId = rangeTableEntry->relid; break; } } return distributedTableId; } /* * RouterJob builds a Job to represent a single shard select/update/delete and * multiple shard update/delete queries. */ static Job * RouterJob(Query *originalQuery, PlannerRestrictionContext *plannerRestrictionContext, DeferredErrorMessage **planningError) { uint64 shardId = INVALID_SHARD_ID; List *placementList = NIL; List *relationShardList = NIL; List *prunedShardIntervalListList = NIL; bool isMultiShardModifyQuery = false; Const *partitionKeyValue = NULL; /* router planner should create task even if it doesn't hit a shard at all */ bool replacePrunedQueryWithDummy = true; /* check if this query requires coordinator evaluation */ bool requiresCoordinatorEvaluation = RequiresCoordinatorEvaluation(originalQuery); FastPathRestrictionContext *fastPathRestrictionContext = plannerRestrictionContext->fastPathRestrictionContext; /* * We prefer to defer shard pruning/task generation to the * execution when the parameter on the distribution key * cannot be resolved. */ if (fastPathRestrictionContext->fastPathRouterQuery && fastPathRestrictionContext->distributionKeyHasParam) { Job *job = CreateJob(originalQuery); job->deferredPruning = true; ereport(DEBUG2, (errmsg("Deferred pruning for a fast-path router " "query"))); return job; } else { (*planningError) = PlanRouterQuery(originalQuery, plannerRestrictionContext, &placementList, &shardId, &relationShardList, &prunedShardIntervalListList, replacePrunedQueryWithDummy, &isMultiShardModifyQuery, &partitionKeyValue); } if (*planningError) { return NULL; } Job *job = CreateJob(originalQuery); job->partitionKeyValue = partitionKeyValue; if (originalQuery->resultRelation > 0) { RangeTblEntry *updateOrDeleteRTE = ExtractResultRelationRTE(originalQuery); /* * If all of the shards are pruned, we replace the relation RTE into * subquery RTE that returns no results. However, this is not useful * for UPDATE and DELETE queries. Therefore, if we detect a UPDATE or * DELETE RTE with subquery type, we just set task list to empty and return * the job. 
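		 *
		 * (Illustrative: DELETE FROM dist_table WHERE false prunes away every
		 * shard, so the result relation RTE has already been replaced by an
		 * empty subquery and there is nothing left to execute.)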
		 */
		if (updateOrDeleteRTE->rtekind == RTE_SUBQUERY)
		{
			job->taskList = NIL;
			return job;
		}
	}

	if (isMultiShardModifyQuery)
	{
		job->taskList = QueryPushdownSqlTaskList(originalQuery, job->jobId,
												 plannerRestrictionContext->
												 relationRestrictionContext,
												 prunedShardIntervalListList,
												 MODIFY_TASK,
												 requiresCoordinatorEvaluation);
	}
	else
	{
		GenerateSingleShardRouterTaskList(job, relationShardList,
										  placementList, shardId);
	}

	job->requiresCoordinatorEvaluation = requiresCoordinatorEvaluation;
	return job;
}


/*
 * GenerateSingleShardRouterTaskList is a wrapper around the task list
 * generation functions specific to single shard selects and modifications.
 *
 * The function updates the input job's taskList in-place.
 */
void
GenerateSingleShardRouterTaskList(Job *job, List *relationShardList,
								  List *placementList, uint64 shardId)
{
	Query *originalQuery = job->jobQuery;

	if (originalQuery->commandType == CMD_SELECT)
	{
		job->taskList = SingleShardTaskList(originalQuery, job->jobId,
											relationShardList, placementList,
											shardId,
											job->parametersInJobQueryResolved);

		/*
		 * Queries to reference tables, or distributed tables with multiple
		 * replicas, have their task placements reordered according to the
		 * configured task_assignment_policy. This is only applicable to select
		 * queries as the modify queries will _always_ be executed on all
		 * placements.
		 *
		 * We also ignore queries that are targeting only intermediate results
		 * (e.g., no valid anchorShardId).
		 */
		if (shardId != INVALID_SHARD_ID)
		{
			ReorderTaskPlacementsByTaskAssignmentPolicy(job, TaskAssignmentPolicy,
														placementList);
		}
	}
	else if (shardId == INVALID_SHARD_ID)
	{
		/* modification that prunes to 0 shards */
		job->taskList = NIL;
	}
	else
	{
		job->taskList = SingleShardTaskList(originalQuery, job->jobId,
											relationShardList, placementList,
											shardId,
											job->parametersInJobQueryResolved);
	}
}


/*
 * ReorderTaskPlacementsByTaskAssignmentPolicy applies selective reordering for
 * supported TaskAssignmentPolicyTypes.
 *
 * Supported Types
 * - TASK_ASSIGNMENT_ROUND_ROBIN: round-robin schedules queries among placements
 *
 * By default it does not reorder the task list, implying a first-replica strategy.
 */
static void
ReorderTaskPlacementsByTaskAssignmentPolicy(Job *job,
											TaskAssignmentPolicyType taskAssignmentPolicy,
											List *placementList)
{
	if (taskAssignmentPolicy == TASK_ASSIGNMENT_ROUND_ROBIN)
	{
		/*
		 * We hit a single shard on router plans, and there should be only
		 * one task in the task list
		 */
		Assert(list_length(job->taskList) == 1);
		Task *task = (Task *) linitial(job->taskList);

		/*
		 * For round-robin SELECT queries, we don't want to include the coordinator
		 * because the user is trying to distribute the load across nodes via the
		 * round-robin policy. Otherwise, the local execution would prioritize
		 * executing the local tasks and especially for reference tables on the
		 * coordinator this would prevent load balancing across nodes.
		 *
		 * For other worker nodes in Citus MX, we let local execution kick in even
		 * for the round-robin policy, because we expect the clients to evenly
		 * connect to the worker nodes.
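		 *
		 * (Illustrative: with SET citus.task_assignment_policy TO 'round-robin',
		 * repeated SELECTs against a reference table rotate over the available
		 * placements instead of always hitting the first one.)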
*/ Assert(ReadOnlyTask(task->taskType)); placementList = RemoveCoordinatorPlacementIfNotSingleNode(placementList); /* reorder the placement list */ List *reorderedPlacementList = RoundRobinReorder(placementList); task->taskPlacementList = reorderedPlacementList; ShardPlacement *primaryPlacement = (ShardPlacement *) linitial( reorderedPlacementList); ereport(DEBUG3, (errmsg("assigned task %u to node %s:%u", task->taskId, primaryPlacement->nodeName, primaryPlacement->nodePort))); } } /* * RemoveCoordinatorPlacementIfNotSingleNode gets a task placement list and returns the list * by removing the placement belonging to the coordinator (if any). * * If the list has a single element or no placements on the coordinator, the list * returned is unmodified. */ List * RemoveCoordinatorPlacementIfNotSingleNode(List *placementList) { ListCell *placementCell = NULL; if (list_length(placementList) < 2) { return placementList; } foreach(placementCell, placementList) { ShardPlacement *placement = (ShardPlacement *) lfirst(placementCell); if (placement->groupId == COORDINATOR_GROUP_ID) { return list_delete_ptr(placementList, placement); } } return placementList; } /* * SingleShardTaskList generates a task for single shard query * and returns it as a list. */ static List * SingleShardTaskList(Query *query, uint64 jobId, List *relationShardList, List *placementList, uint64 shardId, bool parametersInQueryResolved) { TaskType taskType = READ_TASK; char replicationModel = 0; if (query->commandType != CMD_SELECT) { List *rangeTableList = NIL; ExtractRangeTableEntryWalker((Node *) query, &rangeTableList); RangeTblEntry *updateOrDeleteRTE = ExtractResultRelationRTE(query); Assert(updateOrDeleteRTE != NULL); CitusTableCacheEntry *modificationTableCacheEntry = GetCitusTableCacheEntry( updateOrDeleteRTE->relid); if (IsCitusTableTypeCacheEntry(modificationTableCacheEntry, REFERENCE_TABLE) && SelectsFromDistributedTable(rangeTableList, query)) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot perform select on a distributed table " "and modify a reference table"))); } taskType = MODIFY_TASK; replicationModel = modificationTableCacheEntry->replicationModel; } if (taskType == READ_TASK && query->hasModifyingCTE) { /* assume ErrorIfQueryHasUnroutableModifyingCTE checked query already */ CommonTableExpr *cte = NULL; foreach_ptr(cte, query->cteList) { Query *cteQuery = (Query *) cte->ctequery; if (cteQuery->commandType != CMD_SELECT) { RangeTblEntry *updateOrDeleteRTE = ExtractResultRelationRTE(cteQuery); CitusTableCacheEntry *modificationTableCacheEntry = GetCitusTableCacheEntry( updateOrDeleteRTE->relid); taskType = MODIFY_TASK; replicationModel = modificationTableCacheEntry->replicationModel; break; } } } Task *task = CreateTask(taskType); List *relationRowLockList = NIL; RowLocksOnRelations((Node *) query, &relationRowLockList); /* * For performance reasons, we skip generating the queryString. For local * execution this is not needed, so we wait until the executor determines * that the query cannot be executed locally. */ task->taskPlacementList = placementList; SetTaskQueryIfShouldLazyDeparse(task, query); task->anchorShardId = shardId; task->jobId = jobId; task->relationShardList = relationShardList; task->relationRowLockList = relationRowLockList; task->replicationModel = replicationModel; task->parametersInQueryStringResolved = parametersInQueryResolved; return list_make1(task); } /* * RowLocksOnRelations forms the list for range table IDs and corresponding * row lock modes. 
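 *
 * Illustrative example (the table name is hypothetical): for
 *   SELECT * FROM dist_table WHERE key = 1 FOR UPDATE;
 * the list gains a RelationRowLock for dist_table with rowLockStrength set to
 * LCS_FORUPDATE.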
 */
static bool
RowLocksOnRelations(Node *node, List **relationRowLockList)
{
	if (node == NULL)
	{
		return false;
	}

	if (IsA(node, Query))
	{
		Query *query = (Query *) node;
		ListCell *rowMarkCell = NULL;

		foreach(rowMarkCell, query->rowMarks)
		{
			RowMarkClause *rowMarkClause = (RowMarkClause *) lfirst(rowMarkCell);
			RangeTblEntry *rangeTable = rt_fetch(rowMarkClause->rti, query->rtable);
			Oid relationId = rangeTable->relid;

			if (IsCitusTable(relationId))
			{
				RelationRowLock *relationRowLock = CitusMakeNode(RelationRowLock);
				relationRowLock->relationId = relationId;
				relationRowLock->rowLockStrength = rowMarkClause->strength;
				*relationRowLockList = lappend(*relationRowLockList, relationRowLock);
			}
		}

		return query_tree_walker(query, RowLocksOnRelations, relationRowLockList, 0);
	}
	else
	{
		return expression_tree_walker(node, RowLocksOnRelations, relationRowLockList);
	}
}


/*
 * SelectsFromDistributedTable checks if there is a select on a distributed
 * table by looking into range table entries.
 */
static bool
SelectsFromDistributedTable(List *rangeTableList, Query *query)
{
	ListCell *rangeTableCell = NULL;
	RangeTblEntry *resultRangeTableEntry = NULL;

	if (query->resultRelation > 0)
	{
		resultRangeTableEntry = ExtractResultRelationRTE(query);
	}

	foreach(rangeTableCell, rangeTableList)
	{
		RangeTblEntry *rangeTableEntry = (RangeTblEntry *) lfirst(rangeTableCell);

		if (rangeTableEntry->relid == InvalidOid)
		{
			continue;
		}

		if (rangeTableEntry->relkind == RELKIND_VIEW ||
			rangeTableEntry->relkind == RELKIND_MATVIEW)
		{
			/*
			 * Skip over views, which would error out in GetCitusTableCacheEntry.
			 * Distributed tables within (regular) views are already in rangeTableList.
			 */
			continue;
		}

		CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(
			rangeTableEntry->relid);
		if (IsCitusTableTypeCacheEntry(cacheEntry, DISTRIBUTED_TABLE) &&
			(resultRangeTableEntry == NULL ||
			 resultRangeTableEntry->relid != rangeTableEntry->relid))
		{
			return true;
		}
	}

	return false;
}


/*
 * PlanRouterQuery runs router pruning logic for SELECT, UPDATE and DELETE
 * queries. If there are shards present and the query is routable, all RTEs
 * have been updated to point to the relevant shards in the originalQuery.
 * Also, placementList is filled with the list of worker nodes that have all
 * the required shard placements for the query execution. anchorShardId is set
 * to the first pruned shardId of the given query. Finally, relationShardList
 * is filled with the list of relation-to-shard mappings for the query.
 *
 * If the given query is not routable, it fills planningError with the related
 * DeferredErrorMessage. The caller can check this error message to see if the
 * query is routable or not.
 *
 * Note: If the query prunes down to 0 shards due to filters (e.g. WHERE false),
 * or the query has only read_intermediate_result calls (no relations left after
 * recursively planning CTEs and subqueries), then it will be assigned to an
 * arbitrary worker node in a round-robin fashion.
 *
 * Relations that prune down to 0 shards are replaced by subqueries returning
 * 0 values in UpdateRelationToShardNames.
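 *
 * Illustrative example (table and column names are hypothetical):
 *   SELECT * FROM dist_table WHERE dist_col = 42;
 * prunes to a single shard, so placementList receives that shard's placements
 * and anchorShardId is set to that shard's id.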
*/ DeferredErrorMessage * PlanRouterQuery(Query *originalQuery, PlannerRestrictionContext *plannerRestrictionContext, List **placementList, uint64 *anchorShardId, List **relationShardList, List **prunedShardIntervalListList, bool replacePrunedQueryWithDummy, bool *multiShardModifyQuery, Const **partitionValueConst) { bool isMultiShardQuery = false; DeferredErrorMessage *planningError = NULL; bool shardsPresent = false; CmdType commandType = originalQuery->commandType; bool fastPathRouterQuery = plannerRestrictionContext->fastPathRestrictionContext->fastPathRouterQuery; *placementList = NIL; /* * When FastPathRouterQuery() returns true, we know that standard_planner() has * not been called. Thus, restriction information is not available and we do the * shard pruning based on the distribution column in the quals of the query. */ if (fastPathRouterQuery) { Const *distributionKeyValue = plannerRestrictionContext->fastPathRestrictionContext->distributionKeyValue; List *shardIntervalList = TargetShardIntervalForFastPathQuery(originalQuery, &isMultiShardQuery, distributionKeyValue, partitionValueConst); /* * This could only happen when there is a parameter on the distribution key. * We defer the error here; later the planner is forced to use a generic plan * by assigning an arbitrarily high cost to the plan. */ if (UpdateOrDeleteQuery(originalQuery) && isMultiShardQuery) { planningError = DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "Router planner cannot handle multi-shard " "modify queries", NULL, NULL); return planningError; } *prunedShardIntervalListList = shardIntervalList; if (!isMultiShardQuery) { ereport(DEBUG2, (errmsg("Distributed planning for a fast-path router " "query"))); } } else { *prunedShardIntervalListList = TargetShardIntervalsForRestrictInfo(plannerRestrictionContext-> relationRestrictionContext, &isMultiShardQuery, partitionValueConst); } if (isMultiShardQuery) { /* * If multiShardQuery is true and this is a SELECT query, then * return a deferred error. We do not support multi-shard SELECT queries * with this code path. */ if (commandType == CMD_SELECT) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "Router planner cannot handle multi-shard select queries", NULL, NULL); } Assert(UpdateOrDeleteQuery(originalQuery)); planningError = ModifyQuerySupported(originalQuery, originalQuery, isMultiShardQuery, plannerRestrictionContext); if (planningError != NULL) { return planningError; } else { *multiShardModifyQuery = true; return planningError; } } *relationShardList = RelationShardListForShardIntervalList(*prunedShardIntervalListList, &shardsPresent); if (!shardsPresent && !replacePrunedQueryWithDummy) { /* * For INSERT ... SELECT, this query could still be valid for some other target * shard intervals. Thus, we should return an empty list if there aren't any matching * workers, so that the caller can decide what to do with this task. */ return NULL; } /* * We bail out if there are RTEs that prune to multiple shards above, but * there can also be multiple RTEs that reference the same relation.
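 *
 * A simplified sketch of what the check below rejects (the shard ids and the
 * relation OID are hypothetical): a self-join in which two range table
 * entries of the same table prune to different shards, for instance
 *
 *     RelationShard *left = CitusMakeNode(RelationShard);
 *     RelationShard *right = CitusMakeNode(RelationShard);
 *     left->relationId = right->relationId = distributedTableOid;
 *     left->shardId = 102008;
 *     right->shardId = 102009;
 *     Assert(RelationPrunesToMultipleShards(list_make2(left, right)));
 *
 * Such a query cannot be answered by a single shard-level task, so we defer
 * with an error below.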
*/ if (RelationPrunesToMultipleShards(*relationShardList)) { planningError = DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot run command which targets " "multiple shards", NULL, NULL); return planningError; } /* we need an anchor shard id for select queries with the router planner */ uint64 shardId = GetAnchorShardId(*prunedShardIntervalListList); /* both Postgres tables and materialized views are locally available */ RTEListProperties *rteProperties = GetRTEListPropertiesForQuery(originalQuery); bool hasPostgresLocalRelation = rteProperties->hasPostgresLocalTable || rteProperties->hasMaterializedView; List *taskPlacementList = CreateTaskPlacementListForShardIntervals(*prunedShardIntervalListList, shardsPresent, replacePrunedQueryWithDummy, hasPostgresLocalRelation); if (taskPlacementList == NIL) { planningError = DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "found no worker with all shard placements", NULL, NULL); return planningError; } /* * If this is an UPDATE or DELETE query which requires coordinator evaluation, * don't try to update shard names; postpone that to the execution phase. */ bool isUpdateOrDelete = UpdateOrDeleteQuery(originalQuery); if (!(isUpdateOrDelete && RequiresCoordinatorEvaluation(originalQuery))) { UpdateRelationToShardNames((Node *) originalQuery, *relationShardList); } *multiShardModifyQuery = false; *placementList = taskPlacementList; *anchorShardId = shardId; return planningError; } /* * CreateTaskPlacementListForShardIntervals returns a list of shard placements * on which it can access all shards in shardIntervalListList, which contains * a list of shards for each relation in the query. * * If the query contains a local table then hasLocalRelation should be set to * true. In that case, CreateTaskPlacementListForShardIntervals only returns * a placement for the local node, or an empty list if the shards cannot be * accessed locally. * * If generateDummyPlacement is true and there are no shards that need to be * accessed to answer the query (shardsPresent is false), then a single * placement is returned that is either local or follows a round-robin policy. * A typical example is a router query that only reads an intermediate result. * This will happen on the coordinator, unless the user wants to balance the * load by setting citus.task_assignment_policy. */ List * CreateTaskPlacementListForShardIntervals(List *shardIntervalListList, bool shardsPresent, bool generateDummyPlacement, bool hasLocalRelation) { List *placementList = NIL; if (shardsPresent) { /* * Determine the workers that have all shard placements, if any. */ List *shardPlacementList = PlacementsForWorkersContainingAllShards(shardIntervalListList); if (hasLocalRelation) { ShardPlacement *taskPlacement = NULL; /* * If there is a local table, we only allow the local placement to * be used. If there is none, we disallow the query. */ foreach_ptr(taskPlacement, shardPlacementList) { if (taskPlacement->groupId == GetLocalGroupId()) { placementList = lappend(placementList, taskPlacement); } } } else { placementList = shardPlacementList; } } else if (generateDummyPlacement) { ShardPlacement *dummyPlacement = CreateDummyPlacement(hasLocalRelation); placementList = list_make1(dummyPlacement); } return placementList; } /* * CreateLocalDummyPlacement creates a dummy placement for the local node that * can be used for queries that don't involve any shards. The typical examples * are: * (a) queries that consist of only intermediate results * (b) queries that hit zero shards (...
WHERE false;) */ static ShardPlacement * CreateLocalDummyPlacement() { ShardPlacement *dummyPlacement = CitusMakeNode(ShardPlacement); dummyPlacement->nodeId = LOCAL_NODE_ID; dummyPlacement->nodeName = LOCAL_HOST_NAME; dummyPlacement->nodePort = PostPortNumber; dummyPlacement->groupId = GetLocalGroupId(); return dummyPlacement; } /* * CreateDummyPlacement creates a dummy placement that can be used for queries * that don't involve any shards. The typical examples are: * (a) queries that consist of only intermediate results * (b) queries that hit zero shards (... WHERE false;) * * If round robin policy is set, the placement could be on any node in pg_dist_node. * Else, the local node is set for the placement. * * Queries can also involve local tables. In that case we always use the local * node. */ static ShardPlacement * CreateDummyPlacement(bool hasLocalRelation) { static uint32 zeroShardQueryRoundRobin = 0; if (TaskAssignmentPolicy != TASK_ASSIGNMENT_ROUND_ROBIN || hasLocalRelation) { return CreateLocalDummyPlacement(); } List *workerNodeList = ActiveReadableNonCoordinatorNodeList(); if (workerNodeList == NIL) { /* * We want to round-robin over the workers, but there are no workers. * To make sure the query can still succeed we fall back to returning * a local dummy placement. */ return CreateLocalDummyPlacement(); } int workerNodeCount = list_length(workerNodeList); int workerNodeIndex = zeroShardQueryRoundRobin % workerNodeCount; WorkerNode *workerNode = (WorkerNode *) list_nth(workerNodeList, workerNodeIndex); ShardPlacement *dummyPlacement = CitusMakeNode(ShardPlacement); SetPlacementNodeMetadata(dummyPlacement, workerNode); zeroShardQueryRoundRobin++; return dummyPlacement; } /* * RelationShardListForShardIntervalList is a utility function which gets a list of * shardInterval, and returns a list of RelationShard. */ List * RelationShardListForShardIntervalList(List *shardIntervalList, bool *shardsPresent) { List *relationShardList = NIL; ListCell *shardIntervalListCell = NULL; foreach(shardIntervalListCell, shardIntervalList) { List *prunedShardIntervalList = (List *) lfirst(shardIntervalListCell); /* no shard is present or all shards are pruned out case will be handled later */ if (prunedShardIntervalList == NIL) { continue; } *shardsPresent = true; ListCell *shardIntervalCell = NULL; foreach(shardIntervalCell, prunedShardIntervalList) { ShardInterval *shardInterval = (ShardInterval *) lfirst(shardIntervalCell); RelationShard *relationShard = CitusMakeNode(RelationShard); relationShard->relationId = shardInterval->relationId; relationShard->shardId = shardInterval->shardId; relationShard->shardIndex = shardInterval->shardIndex; relationShardList = lappend(relationShardList, relationShard); } } return relationShardList; } /* * GetAnchorShardId returns the anchor shard id given relation shard list. * The desired anchor shard is found as follows: * * - Return the first distributed table shard id in the relationShardList if * there is any. 
* - Return a random reference table shard id if all the shards belong to * reference tables * - Return INVALID_SHARD_ID on empty lists */ uint64 GetAnchorShardId(List *prunedShardIntervalListList) { ListCell *prunedShardIntervalListCell = NULL; uint64 referenceShardId = INVALID_SHARD_ID; foreach(prunedShardIntervalListCell, prunedShardIntervalListList) { List *prunedShardIntervalList = (List *) lfirst(prunedShardIntervalListCell); /* no shard is present or all shards are pruned out case will be handled later */ if (prunedShardIntervalList == NIL) { continue; } ShardInterval *shardInterval = linitial(prunedShardIntervalList); if (ReferenceTableShardId(shardInterval->shardId)) { referenceShardId = shardInterval->shardId; } else { return shardInterval->shardId; } } return referenceShardId; } /* * TargetShardIntervalForFastPathQuery gets a query which is in * the form defined by FastPathRouterQuery() and returns exactly * one list of a single shard interval (see FastPathRouterQuery() * for the details). * * If the caller requested the distributionKey value that this function * yields, it is set in outputPartitionValueConst. */ List * TargetShardIntervalForFastPathQuery(Query *query, bool *isMultiShardQuery, Const *inputDistributionKeyValue, Const **outputPartitionValueConst) { Oid relationId = ExtractFirstCitusTableId(query); if (IsCitusTableType(relationId, CITUS_TABLE_WITH_NO_DIST_KEY)) { /* we don't need to do shard pruning for non-distributed tables */ return list_make1(LoadShardIntervalList(relationId)); } if (inputDistributionKeyValue && !inputDistributionKeyValue->constisnull) { CitusTableCacheEntry *cache = GetCitusTableCacheEntry(relationId); Var *distributionKey = cache->partitionColumn; /* * We currently don't allow implicitly coerced values to be handled by the * fast-path planner. Still, let's be defensive against any future changes. */ if (inputDistributionKeyValue->consttype != distributionKey->vartype) { bool missingOk = false; inputDistributionKeyValue = TransformPartitionRestrictionValue(distributionKey, inputDistributionKeyValue, missingOk); } ShardInterval *cachedShardInterval = FindShardInterval(inputDistributionKeyValue->constvalue, cache); if (cachedShardInterval == NULL) { ereport(ERROR, (errmsg( "could not find shardinterval to which to send the query"))); } if (outputPartitionValueConst != NULL) { /* set the outgoing partition column value if requested */ *outputPartitionValueConst = inputDistributionKeyValue; } ShardInterval *shardInterval = CopyShardInterval(cachedShardInterval); List *shardIntervalList = list_make1(shardInterval); return list_make1(shardIntervalList); } Node *quals = query->jointree->quals; int relationIndex = 1; /* * We couldn't do the shard pruning based on inputDistributionKeyValue as it might * be passed as NULL. Still, we can search the quals for the distribution key. */ Const *distributionKeyValueInQuals = NULL; List *prunedShardIntervalList = PruneShards(relationId, relationIndex, make_ands_implicit((Expr *) quals), &distributionKeyValueInQuals); if (!distributionKeyValueInQuals || distributionKeyValueInQuals->constisnull) { /* * If the distribution key equals NULL, we prefer to treat it as a zero-shard * query as it cannot return any rows.
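 *
 * (A brief sketch of the downstream effect, assuming the caller asked for a
 * dummy placement and no local relation is involved: the NIL returned below
 * leaves shardsPresent false in PlanRouterQuery, which then effectively calls
 *
 *     List *placements =
 *         CreateTaskPlacementListForShardIntervals(NIL, false, true, false);
 *
 * and ends up with a single local or round-robin dummy placement, so the
 * query still runs and simply returns zero rows.)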
*/ return NIL; } /* we're only expecting single shard from a single table */ Node *distKey PG_USED_FOR_ASSERTS_ONLY = NULL; Assert(FastPathRouterQuery(query, &distKey) || !EnableFastPathRouterPlanner); if (list_length(prunedShardIntervalList) > 1) { *isMultiShardQuery = true; } else if (list_length(prunedShardIntervalList) == 1 && outputPartitionValueConst != NULL) { /* set the outgoing partition column value if requested */ *outputPartitionValueConst = distributionKeyValueInQuals; } return list_make1(prunedShardIntervalList); } /* * TargetShardIntervalsForRestrictInfo performs shard pruning for all referenced * relations in the relation restriction context and returns list of shards per * relation. Shard pruning is done based on provided restriction context per relation. * The function sets multiShardQuery to true if any of the relations pruned down to * more than one active shard. It also records pruned shard intervals in relation * restriction context to be used later on. Some queries may have contradiction * clauses like 'and false' or 'and 1=0', such queries are treated as if all of * the shards of joining relations are pruned out. */ List * TargetShardIntervalsForRestrictInfo(RelationRestrictionContext *restrictionContext, bool *multiShardQuery, Const **partitionValueConst) { List *prunedShardIntervalListList = NIL; ListCell *restrictionCell = NULL; bool multiplePartitionValuesExist = false; Const *queryPartitionValueConst = NULL; Assert(restrictionContext != NULL); foreach(restrictionCell, restrictionContext->relationRestrictionList) { RelationRestriction *relationRestriction = (RelationRestriction *) lfirst(restrictionCell); Oid relationId = relationRestriction->relationId; int shardCount = 0; if (IsCitusTable(relationId)) { CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(relationId); shardCount = cacheEntry->shardIntervalArrayLength; } else if (IsDistributedIntermediateResultRTE(relationRestriction->rte)) { char *resultId = FindDistributedResultId(relationRestriction->rte); DistributedResult *distributedResult = GetNamedDistributedResult(resultId); int colocationId = distributedResult->colocationId; relationId = ColocatedTableId(colocationId); shardCount = distributedResult->shardCount; } else { /* ignore local tables for shard pruning purposes */ continue; } Index tableId = relationRestriction->index; List *baseRestrictionList = relationRestriction->relOptInfo->baserestrictinfo; List *restrictClauseList = get_all_actual_clauses(baseRestrictionList); List *prunedShardIntervalList = NIL; List *joinInfoList = relationRestriction->relOptInfo->joininfo; List *pseudoRestrictionList = extract_actual_clauses(joinInfoList, true); /* * Queries may have contradiction clauses like 'false', or '1=0' in * their filters. Such queries would have pseudo constant 'false' * inside relOptInfo->joininfo list. We treat such cases as if all * shards of the table are pruned out. 
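 *
 * In condensed form (a simplified restatement of the branch below, with the
 * partition value capture omitted), the effect is:
 *
 *     List *prunedShardIntervalList = NIL;
 *     if (!ContainsFalseClause(pseudoRestrictionList) && shardCount > 0)
 *     {
 *         prunedShardIntervalList = PruneShards(relationId, tableId,
 *                                               restrictClauseList, NULL);
 *     }
 *
 * so a contradiction clause leaves the relation with an empty pruned shard
 * list, exactly as if every shard had been pruned away.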
*/ bool whereFalseQuery = ContainsFalseClause(pseudoRestrictionList); if (!whereFalseQuery && shardCount > 0) { Const *restrictionPartitionValueConst = NULL; prunedShardIntervalList = PruneShards(relationId, tableId, restrictClauseList, &restrictionPartitionValueConst); if (list_length(prunedShardIntervalList) > 1) { (*multiShardQuery) = true; } if (restrictionPartitionValueConst != NULL && queryPartitionValueConst == NULL) { queryPartitionValueConst = restrictionPartitionValueConst; } else if (restrictionPartitionValueConst != NULL && !equal(queryPartitionValueConst, restrictionPartitionValueConst)) { multiplePartitionValuesExist = true; } } prunedShardIntervalListList = lappend(prunedShardIntervalListList, prunedShardIntervalList); } /* * Different restrictions might have different partition columns. * We report the partition column value only if there is exactly one. */ if (multiplePartitionValuesExist) { queryPartitionValueConst = NULL; } /* set the outgoing partition column value if requested */ if (partitionValueConst != NULL) { *partitionValueConst = queryPartitionValueConst; } return prunedShardIntervalListList; } /* * RelationPrunesToMultipleShards returns true if the given list of * relation-to-shard mappings contains at least two mappings with * the same relation, but different shards. */ static bool RelationPrunesToMultipleShards(List *relationShardList) { ListCell *relationShardCell = NULL; RelationShard *previousRelationShard = NULL; relationShardList = SortList(relationShardList, CompareRelationShards); foreach(relationShardCell, relationShardList) { RelationShard *relationShard = (RelationShard *) lfirst(relationShardCell); if (previousRelationShard != NULL && relationShard->relationId == previousRelationShard->relationId && relationShard->shardId != previousRelationShard->shardId) { return true; } previousRelationShard = relationShard; } return false; } /* * PlacementsForWorkersContainingAllShards returns the list of shard placements for workers * that contain all shard intervals in the given list of shard interval lists. */ List * PlacementsForWorkersContainingAllShards(List *shardIntervalListList) { bool firstShard = true; List *currentPlacementList = NIL; List *shardIntervalList = NIL; foreach_ptr(shardIntervalList, shardIntervalListList) { if (shardIntervalList == NIL) { continue; } Assert(list_length(shardIntervalList) == 1); ShardInterval *shardInterval = (ShardInterval *) linitial(shardIntervalList); uint64 shardId = shardInterval->shardId; /* retrieve all active shard placements for this shard */ List *newPlacementList = ActiveShardPlacementList(shardId); if (firstShard) { firstShard = false; currentPlacementList = newPlacementList; } else { /* keep placements that still exist for this shard */ currentPlacementList = IntersectPlacementList(currentPlacementList, newPlacementList); } /* * Bail out if the placement list becomes empty. This means there is no worker * containing all shards referenced by the query, hence we cannot forward * this query directly to any worker. */ if (currentPlacementList == NIL) { break; } } return currentPlacementList; } /* * BuildRoutesForInsert returns a list of ModifyRoute objects for an INSERT * query, or an empty list if the partition column value is defined as an * expression that still needs to be evaluated. If any partition column value * falls within zero or multiple (overlapping) shards, the planning error is set. * * Multi-row INSERTs are handled by grouping their rows by target shard.
These * groups are returned in ascending order by shard id, ready for later deparsing * into shard-specific SQL. */ static List * BuildRoutesForInsert(Query *query, DeferredErrorMessage **planningError) { Oid distributedTableId = ExtractFirstCitusTableId(query); CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(distributedTableId); List *modifyRouteList = NIL; ListCell *insertValuesCell = NULL; Assert(query->commandType == CMD_INSERT); /* reference tables and citus local tables can only have one shard */ if (IsCitusTableTypeCacheEntry(cacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY)) { List *shardIntervalList = LoadShardIntervalList(distributedTableId); int shardCount = list_length(shardIntervalList); if (shardCount != 1) { if (IsCitusTableTypeCacheEntry(cacheEntry, REFERENCE_TABLE)) { ereport(ERROR, (errmsg("reference table cannot have %d shards", shardCount))); } else if (IsCitusTableTypeCacheEntry(cacheEntry, CITUS_LOCAL_TABLE)) { ereport(ERROR, (errmsg("citus local table cannot have %d shards", shardCount))); } } ShardInterval *shardInterval = linitial(shardIntervalList); ModifyRoute *modifyRoute = palloc(sizeof(ModifyRoute)); modifyRoute->shardId = shardInterval->shardId; RangeTblEntry *valuesRTE = ExtractDistributedInsertValuesRTE(query); if (valuesRTE != NULL) { /* add the values list for a multi-row INSERT */ modifyRoute->rowValuesLists = valuesRTE->values_lists; } else { modifyRoute->rowValuesLists = NIL; } modifyRouteList = lappend(modifyRouteList, modifyRoute); return modifyRouteList; } Var *partitionColumn = cacheEntry->partitionColumn; /* get the full list of insert values and iterate over them to prune */ List *insertValuesList = ExtractInsertValuesList(query, partitionColumn); foreach(insertValuesCell, insertValuesList) { InsertValues *insertValues = (InsertValues *) lfirst(insertValuesCell); List *prunedShardIntervalList = NIL; Node *partitionValueExpr = (Node *) insertValues->partitionValueExpr; /* * We only support constant partition values at this point. Sometimes * they are wrapped in an implicit coercion though. Most notably * FuncExpr coercions for casts created with CREATE CAST ... WITH * FUNCTION .. AS IMPLICIT. To support this, we first strip them here. * Then we do the coercion manually below using * TransformPartitionRestrictionValue, if the types are not the same. * * NOTE: eval_const_expressions below would do some of these removals * too, but it's unclear if it would do all of them. It is possible * that there are no cases where this strip_implicit_coercions call is * really necessary at all, but currently that's hard to rule out. * So to be on the safe side we call strip_implicit_coercions too, to * be sure we support as much as possible. */ partitionValueExpr = strip_implicit_coercions(partitionValueExpr); /* * By evaluating constant expressions, an expression such as 2 + 4 * will become the const 6. That way we can use them as a partition column * value. Normally the planner evaluates constant expressions, but we * may be working on the original query tree here. So we do it here * explicitly before checking that the partition value is a const. * * NOTE: We do not use expression_planner here, since all it does * apart from calling eval_const_expressions is call fix_opfuncids. * This is not needed here, since it's a no-op for T_Const nodes and we * error out below in all other cases.
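 *
 * A small sketch of that folding step (simplified; the error path for
 * non-foldable expressions is omitted): for an INSERT whose partition column
 * expression is 2 + 4, the call below behaves like
 *
 *     Node *folded = eval_const_expressions(NULL, partitionValueExpr);
 *     Assert(IsA(folded, Const));
 *     Datum partitionValue = ((Const *) folded)->constvalue;
 *
 * yielding the constant 6 that shard pruning can work with; expressions that
 * do not fold to a Const fail the IsA check and are rejected with an error
 * right below.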
*/ partitionValueExpr = eval_const_expressions(NULL, partitionValueExpr); if (!IsA(partitionValueExpr, Const)) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("failed to evaluate partition key in insert"), errhint("try using constant values for partition column"))); } Const *partitionValueConst = (Const *) partitionValueExpr; if (partitionValueConst->constisnull) { ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("cannot perform an INSERT with NULL in the partition " "column"))); } /* actually do the coercions that we skipped before, if fails throw an * error */ if (partitionValueConst->consttype != partitionColumn->vartype) { bool missingOk = false; partitionValueConst = TransformPartitionRestrictionValue(partitionColumn, partitionValueConst, missingOk); } if (IsCitusTableTypeCacheEntry(cacheEntry, HASH_DISTRIBUTED) || IsCitusTableTypeCacheEntry(cacheEntry, RANGE_DISTRIBUTED)) { Datum partitionValue = partitionValueConst->constvalue; ShardInterval *shardInterval = FindShardInterval(partitionValue, cacheEntry); if (shardInterval != NULL) { prunedShardIntervalList = list_make1(shardInterval); } } else { Index tableId = 1; OpExpr *equalityExpr = MakeOpExpression(partitionColumn, BTEqualStrategyNumber); Node *rightOp = get_rightop((Expr *) equalityExpr); Assert(rightOp != NULL); Assert(IsA(rightOp, Const)); Const *rightConst = (Const *) rightOp; rightConst->constvalue = partitionValueConst->constvalue; rightConst->constisnull = partitionValueConst->constisnull; rightConst->constbyval = partitionValueConst->constbyval; List *restrictClauseList = list_make1(equalityExpr); prunedShardIntervalList = PruneShards(distributedTableId, tableId, restrictClauseList, NULL); } int prunedShardIntervalCount = list_length(prunedShardIntervalList); if (prunedShardIntervalCount != 1) { char *partitionKeyString = cacheEntry->partitionKeyString; char *partitionColumnName = ColumnToColumnName(distributedTableId, partitionKeyString); StringInfo errorMessage = makeStringInfo(); StringInfo errorHint = makeStringInfo(); const char *targetCountType = NULL; if (prunedShardIntervalCount == 0) { targetCountType = "no"; } else { targetCountType = "multiple"; } if (prunedShardIntervalCount == 0) { appendStringInfo(errorHint, "Make sure you have created a shard which " "can receive this partition column value."); } else { appendStringInfo(errorHint, "Make sure the value for partition column " "\"%s\" falls into a single shard.", partitionColumnName); } appendStringInfo(errorMessage, "cannot run INSERT command which targets %s " "shards", targetCountType); (*planningError) = DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, errorMessage->data, NULL, errorHint->data); return NIL; } ShardInterval *targetShard = (ShardInterval *) linitial(prunedShardIntervalList); insertValues->shardId = targetShard->shardId; } modifyRouteList = GroupInsertValuesByShardId(insertValuesList); return modifyRouteList; } /* * IsMultiRowInsert returns whether the given query is a multi-row INSERT. * * It does this by determining whether the query is an INSERT that has an * RTE_VALUES. Single-row INSERTs will have their RTE_VALUES optimised away * in transformInsertStmt, and instead use the target list. */ bool IsMultiRowInsert(Query *query) { return ExtractDistributedInsertValuesRTE(query) != NULL; } /* * ExtractDistributedInsertValuesRTE does precisely that. If the provided * query is not an INSERT, or if the INSERT does not have a VALUES RTE * (i.e. it is not a multi-row INSERT), this function returns NULL. 
* If all those conditions are met, an RTE representing the multiple values * of a multi-row INSERT is returned. */ RangeTblEntry * ExtractDistributedInsertValuesRTE(Query *query) { ListCell *rteCell = NULL; if (query->commandType != CMD_INSERT) { return NULL; } foreach(rteCell, query->rtable) { RangeTblEntry *rte = (RangeTblEntry *) lfirst(rteCell); if (rte->rtekind == RTE_VALUES) { return rte; } } return NULL; } /* * NormalizeMultiRowInsertTargetList ensures all elements of multi-row INSERT target * lists are Vars. In multi-row INSERTs, most target list entries contain a Var * expression pointing to a position within the values_lists field of a VALUES * RTE, but non-NULL default columns are handled differently. Instead of adding * the default expression to each row, a single expression encoding the DEFAULT * appears in the target list. For consistency, we move these expressions into * values lists and replace them with an appropriately constructed Var. */ static void NormalizeMultiRowInsertTargetList(Query *query) { ListCell *valuesListCell = NULL; ListCell *targetEntryCell = NULL; int targetEntryNo = 0; RangeTblEntry *valuesRTE = ExtractDistributedInsertValuesRTE(query); if (valuesRTE == NULL) { return; } foreach(valuesListCell, valuesRTE->values_lists) { List *valuesList = (List *) lfirst(valuesListCell); Expr **valuesArray = (Expr **) PointerArrayFromList(valuesList); List *expandedValuesList = NIL; foreach(targetEntryCell, query->targetList) { TargetEntry *targetEntry = (TargetEntry *) lfirst(targetEntryCell); Expr *targetExpr = targetEntry->expr; if (IsA(targetExpr, Var)) { /* expression from the VALUES section */ Var *targetListVar = (Var *) targetExpr; targetExpr = valuesArray[targetListVar->varattno - 1]; } else { /* copy the column's default expression */ targetExpr = copyObject(targetExpr); } expandedValuesList = lappend(expandedValuesList, targetExpr); } SetListCellPtr(valuesListCell, (void *) expandedValuesList); } /* reset coltypes, coltypmods, colcollations and rebuild them below */ valuesRTE->coltypes = NIL; valuesRTE->coltypmods = NIL; valuesRTE->colcollations = NIL; foreach(targetEntryCell, query->targetList) { TargetEntry *targetEntry = lfirst(targetEntryCell); Node *targetExprNode = (Node *) targetEntry->expr; /* RTE_VALUES comes 2nd, after destination table */ Index valuesVarno = 2; targetEntryNo++; Oid targetType = exprType(targetExprNode); int32 targetTypmod = exprTypmod(targetExprNode); Oid targetColl = exprCollation(targetExprNode); valuesRTE->coltypes = lappend_oid(valuesRTE->coltypes, targetType); valuesRTE->coltypmods = lappend_int(valuesRTE->coltypmods, targetTypmod); valuesRTE->colcollations = lappend_oid(valuesRTE->colcollations, targetColl); if (IsA(targetExprNode, Var)) { Var *targetVar = (Var *) targetExprNode; targetVar->varattno = targetEntryNo; continue; } /* replace the original expression with a Var referencing values_lists */ Var *syntheticVar = makeVar(valuesVarno, targetEntryNo, targetType, targetTypmod, targetColl, 0); targetEntry->expr = (Expr *) syntheticVar; /* * Postgres appends a dummy column reference into valuesRTE->eref->colnames * list in addRangeTableEntryForValues for each column specified in VALUES * clause. Now that we replaced DEFAULT column with a synthetic Var, we also * need to add a dummy column reference for that column. */ AppendNextDummyColReference(valuesRTE->eref); } } /* * AppendNextDummyColReference appends a new dummy column reference to colnames * list of given Alias object. 
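 *
 * For example (with illustrative contents): if the alias currently carries
 * colnames {"column1", "column2"}, the call appends the String "column3",
 * matching the naming scheme used by addRangeTableEntryForValues:
 *
 *     AppendNextDummyColReference(valuesRTE->eref);
 *     Assert(list_length(valuesRTE->eref->colnames) == 3);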
*/ static void AppendNextDummyColReference(Alias *expendedReferenceNames) { int existingColReferences = list_length(expendedReferenceNames->colnames); int nextColReferenceId = existingColReferences + 1; Value *missingColumnString = MakeDummyColumnString(nextColReferenceId); expendedReferenceNames->colnames = lappend(expendedReferenceNames->colnames, missingColumnString); } /* * MakeDummyColumnString returns a String (Value) object by appending given * integer to end of the "column" string. */ static Value * MakeDummyColumnString(int dummyColumnId) { StringInfo dummyColumnStringInfo = makeStringInfo(); appendStringInfo(dummyColumnStringInfo, "column%d", dummyColumnId); Value *dummyColumnString = makeString(dummyColumnStringInfo->data); return dummyColumnString; } /* * IntersectPlacementList performs placement pruning based on matching on * nodeName:nodePort fields of shard placement data. We start pruning from all * placements of the first relation's shard. Then for each relation's shard, we * compute intersection of the new shards placement with existing placement list. * This operation could have been done using other methods, but since we do not * expect very high replication factor, iterating over a list and making string * comparisons should be sufficient. */ List * IntersectPlacementList(List *lhsPlacementList, List *rhsPlacementList) { ListCell *lhsPlacementCell = NULL; List *placementList = NIL; /* Keep existing placement in the list if it is also present in new placement list */ foreach(lhsPlacementCell, lhsPlacementList) { ShardPlacement *lhsPlacement = (ShardPlacement *) lfirst(lhsPlacementCell); ListCell *rhsPlacementCell = NULL; foreach(rhsPlacementCell, rhsPlacementList) { ShardPlacement *rhsPlacement = (ShardPlacement *) lfirst(rhsPlacementCell); if (rhsPlacement->nodePort == lhsPlacement->nodePort && strncmp(rhsPlacement->nodeName, lhsPlacement->nodeName, WORKER_LENGTH) == 0) { placementList = lappend(placementList, rhsPlacement); /* * We don't need to add the same placement over and over again. This * could happen if both placements of a shard appear on the same node. */ break; } } } return placementList; } /* * GroupInsertValuesByShardId takes care of grouping the rows from a multi-row * INSERT by target shard. At this point, all pruning has taken place and we * need only to build sets of rows for each destination. This is done by a * simple sort (by shard identifier) and gather step. The sort has the side- * effect of getting things in ascending order to avoid unnecessary deadlocks * during Task execution. */ static List * GroupInsertValuesByShardId(List *insertValuesList) { ModifyRoute *route = NULL; ListCell *insertValuesCell = NULL; List *modifyRouteList = NIL; insertValuesList = SortList(insertValuesList, CompareInsertValuesByShardId); foreach(insertValuesCell, insertValuesList) { InsertValues *insertValues = (InsertValues *) lfirst(insertValuesCell); int64 shardId = insertValues->shardId; bool foundSameShardId = false; if (route != NULL) { if (route->shardId == shardId) { foundSameShardId = true; } else { /* new shard id seen; current aggregation done; add to list */ modifyRouteList = lappend(modifyRouteList, route); } } if (foundSameShardId) { /* * Our current value has the same shard id as our aggregate object, * so append the rowValues. 
*/ route->rowValuesLists = lappend(route->rowValuesLists, insertValues->rowValues); } else { /* we encountered a new shard id; build a new aggregate object */ route = (ModifyRoute *) palloc(sizeof(ModifyRoute)); route->shardId = insertValues->shardId; route->rowValuesLists = list_make1(insertValues->rowValues); } } /* left holding one final aggregate object; add to list */ modifyRouteList = lappend(modifyRouteList, route); return modifyRouteList; } /* * ExtractInsertValuesList extracts the partition column value for an INSERT * command and returns it within an InsertValues struct. For single-row INSERTs * this is simply a value extracted from the target list, but multi-row INSERTs * will generate a List of InsertValues, each with full row values in addition * to the partition value. If a partition value is NULL or missing altogether, * this function errors. */ static List * ExtractInsertValuesList(Query *query, Var *partitionColumn) { List *insertValuesList = NIL; TargetEntry *targetEntry = get_tle_by_resno(query->targetList, partitionColumn->varattno); if (targetEntry == NULL) { ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("cannot perform an INSERT without a partition column " "value"))); } /* * We've got a multi-row INSERT. PostgreSQL internally represents such * commands by linking Vars in the target list to lists of values within * a special VALUES range table entry. By extracting the right positional * expression from each list within that RTE, we will extract the partition * values for each row within the multi-row INSERT. */ if (IsA(targetEntry->expr, Var)) { Var *partitionVar = (Var *) targetEntry->expr; ListCell *valuesListCell = NULL; Index ivIndex = 0; RangeTblEntry *referencedRTE = rt_fetch(partitionVar->varno, query->rtable); foreach(valuesListCell, referencedRTE->values_lists) { InsertValues *insertValues = (InsertValues *) palloc(sizeof(InsertValues)); insertValues->rowValues = (List *) lfirst(valuesListCell); insertValues->partitionValueExpr = list_nth(insertValues->rowValues, (partitionVar->varattno - 1)); insertValues->shardId = INVALID_SHARD_ID; insertValues->listIndex = ivIndex; insertValuesList = lappend(insertValuesList, insertValues); ivIndex++; } } /* nothing's been found yet; this is a simple single-row INSERT */ if (insertValuesList == NIL) { InsertValues *insertValues = (InsertValues *) palloc(sizeof(InsertValues)); insertValues->rowValues = NIL; insertValues->partitionValueExpr = targetEntry->expr; insertValues->shardId = INVALID_SHARD_ID; insertValuesList = lappend(insertValuesList, insertValues); } return insertValuesList; } /* * ExtractInsertPartitionKeyValue extracts the partition column value * from an INSERT query. If the expression in the partition column is * non-constant or it is a multi-row INSERT with multiple different partition * column values, the function returns NULL. 
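 *
 * A minimal caller-side sketch (the variable names are hypothetical): the
 * result is either a copied Const that every row agrees on, or NULL:
 *
 *     Const *partitionValue = ExtractInsertPartitionKeyValue(query);
 *     bool hasSinglePartitionValue = (partitionValue != NULL);
 *
 * where hasSinglePartitionValue is false both for non-constant partition
 * expressions and for multi-row INSERTs that mix different partition values.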
*/ Const * ExtractInsertPartitionKeyValue(Query *query) { Oid distributedTableId = ExtractFirstCitusTableId(query); uint32 rangeTableId = 1; Const *singlePartitionValueConst = NULL; if (IsCitusTableType(distributedTableId, CITUS_TABLE_WITH_NO_DIST_KEY)) { return NULL; } Var *partitionColumn = PartitionColumn(distributedTableId, rangeTableId); TargetEntry *targetEntry = get_tle_by_resno(query->targetList, partitionColumn->varattno); if (targetEntry == NULL) { /* partition column value not specified */ return NULL; } Node *targetExpression = strip_implicit_coercions((Node *) targetEntry->expr); /* * Multi-row INSERTs have a Var in the target list that points to * an RTE_VALUES. */ if (IsA(targetExpression, Var)) { Var *partitionVar = (Var *) targetExpression; ListCell *valuesListCell = NULL; RangeTblEntry *referencedRTE = rt_fetch(partitionVar->varno, query->rtable); foreach(valuesListCell, referencedRTE->values_lists) { List *rowValues = (List *) lfirst(valuesListCell); Node *partitionValueNode = list_nth(rowValues, partitionVar->varattno - 1); Expr *partitionValueExpr = (Expr *) strip_implicit_coercions( partitionValueNode); if (!IsA(partitionValueExpr, Const)) { /* non-constant value in the partition column */ singlePartitionValueConst = NULL; break; } Const *partitionValueConst = (Const *) partitionValueExpr; if (singlePartitionValueConst == NULL) { /* first row has a constant in the partition column, looks promising! */ singlePartitionValueConst = partitionValueConst; } else if (!equal(partitionValueConst, singlePartitionValueConst)) { /* multiple different values in the partition column, too bad */ singlePartitionValueConst = NULL; break; } else { /* another row with the same partition column value! */ } } } else if (IsA(targetExpression, Const)) { /* single-row INSERT with a constant partition column value */ singlePartitionValueConst = (Const *) targetExpression; } else { /* single-row INSERT with a non-constant partition column value */ singlePartitionValueConst = NULL; } if (singlePartitionValueConst != NULL) { singlePartitionValueConst = copyObject(singlePartitionValueConst); } return singlePartitionValueConst; } /* * DeferErrorIfUnsupportedRouterPlannableSelectQuery checks if given query is router plannable, * SELECT query, setting distributedPlan->planningError if not. * The query is router plannable if it is a modify query, or if it is a select * query issued on a hash partitioned distributed table. Router plannable checks * for select queries can be turned off by setting citus.enable_router_execution * flag to false. */ static DeferredErrorMessage * DeferErrorIfUnsupportedRouterPlannableSelectQuery(Query *query) { List *rangeTableRelationList = NIL; ListCell *rangeTableRelationCell = NULL; if (query->commandType != CMD_SELECT) { return DeferredError(ERRCODE_ASSERT_FAILURE, "Only SELECT query types are supported in this path", NULL, NULL); } if (!EnableRouterExecution) { return DeferredError(ERRCODE_SUCCESSFUL_COMPLETION, "Router planner not enabled.", NULL, NULL); } if (contain_nextval_expression_walker((Node *) query->targetList, NULL)) { /* * We let queries with nextval in the target list fall through to * the logical planner, which knows how to handle those queries. 
*/ return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "Sequences cannot be used in router queries", NULL, NULL); } bool hasPostgresOrCitusLocalTable = false; bool hasDistributedTable = false; ExtractRangeTableRelationWalker((Node *) query, &rangeTableRelationList); foreach(rangeTableRelationCell, rangeTableRelationList) { RangeTblEntry *rte = (RangeTblEntry *) lfirst(rangeTableRelationCell); if (rte->rtekind == RTE_RELATION) { Oid distributedTableId = rte->relid; /* local tables are allowed if there are no distributed tables */ if (!IsCitusTable(distributedTableId)) { hasPostgresOrCitusLocalTable = true; continue; } else if (IsCitusTableType(distributedTableId, CITUS_LOCAL_TABLE)) { hasPostgresOrCitusLocalTable = true; elog(DEBUG4, "Router planner finds a citus local table"); continue; } if (IsCitusTableType(distributedTableId, APPEND_DISTRIBUTED)) { return DeferredError( ERRCODE_FEATURE_NOT_SUPPORTED, "Router planner does not support append-partitioned tables.", NULL, NULL); } if (IsCitusTableType(distributedTableId, DISTRIBUTED_TABLE)) { hasDistributedTable = true; } /* * Currently, we don't support tables with replication factor > 1, * except reference tables with SELECT ... FOR UPDATE queries. It is * also not supported from MX nodes. */ if (query->hasForUpdate) { uint32 tableReplicationFactor = TableShardReplicationFactor( distributedTableId); if (tableReplicationFactor > 1 && IsCitusTableType(distributedTableId, DISTRIBUTED_TABLE)) { return DeferredError( ERRCODE_FEATURE_NOT_SUPPORTED, "SELECT FOR UPDATE with table replication factor > 1 not supported for non-reference tables.", NULL, NULL); } } } } /* local tables are not allowed if there are distributed tables */ if (hasPostgresOrCitusLocalTable && hasDistributedTable) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "Local tables cannot be used in distributed queries.", NULL, NULL); } return ErrorIfQueryHasUnroutableModifyingCTE(query); } /* * Copy a RelationRestrictionContext. Note that several subfields are copied * shallowly, for lack of copyObject support. * * Note that CopyRelationRestrictionContext copies the following fields per relation * context: index, relationId, distributedRelation, rte, relOptInfo->baserestrictinfo * and relOptInfo->joininfo. Also, the function shallowly copies plannerInfo and * prunedShardIntervalList which are read-only. All other parts of the relOptInfo * is also shallowly copied. 
*/ RelationRestrictionContext * CopyRelationRestrictionContext(RelationRestrictionContext *oldContext) { RelationRestrictionContext *newContext = (RelationRestrictionContext *) palloc(sizeof(RelationRestrictionContext)); ListCell *relationRestrictionCell = NULL; newContext->allReferenceTables = oldContext->allReferenceTables; newContext->relationRestrictionList = NIL; foreach(relationRestrictionCell, oldContext->relationRestrictionList) { RelationRestriction *oldRestriction = (RelationRestriction *) lfirst(relationRestrictionCell); RelationRestriction *newRestriction = (RelationRestriction *) palloc0(sizeof(RelationRestriction)); newRestriction->index = oldRestriction->index; newRestriction->relationId = oldRestriction->relationId; newRestriction->distributedRelation = oldRestriction->distributedRelation; newRestriction->rte = copyObject(oldRestriction->rte); /* can't be copied, we copy (flatly) a RelOptInfo, and then decouple baserestrictinfo */ newRestriction->relOptInfo = palloc(sizeof(RelOptInfo)); *newRestriction->relOptInfo = *oldRestriction->relOptInfo; newRestriction->relOptInfo->baserestrictinfo = copyObject(oldRestriction->relOptInfo->baserestrictinfo); newRestriction->relOptInfo->joininfo = copyObject(oldRestriction->relOptInfo->joininfo); /* not copyable, but readonly */ newRestriction->plannerInfo = oldRestriction->plannerInfo; newContext->relationRestrictionList = lappend(newContext->relationRestrictionList, newRestriction); } return newContext; } /* * ErrorIfQueryHasUnroutableModifyingCTE checks if the query contains modifying common table * expressions and errors out if it does. */ static DeferredErrorMessage * ErrorIfQueryHasUnroutableModifyingCTE(Query *queryTree) { Assert(queryTree->commandType == CMD_SELECT); if (!queryTree->hasModifyingCTE) { return NULL; } /* we can't route conflicting replication models */ char replicationModel = 0; CommonTableExpr *cte = NULL; foreach_ptr(cte, queryTree->cteList) { Query *cteQuery = (Query *) cte->ctequery; /* * Here we only check for command type of top level query. Normally there can be * nested CTE, however PostgreSQL dictates that data-modifying statements must * be at top level of CTE. Therefore it is OK to just check for top level. * Similarly, we do not need to check for subqueries. */ if (cteQuery->commandType != CMD_SELECT && cteQuery->commandType != CMD_UPDATE && cteQuery->commandType != CMD_DELETE) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "only SELECT, UPDATE, or DELETE common table expressions " "may be router planned", NULL, NULL); } if (cteQuery->commandType != CMD_SELECT) { Oid distributedTableId = InvalidOid; DeferredErrorMessage *cteError = ModifyPartialQuerySupported(cteQuery, false, &distributedTableId); if (cteError) { return cteError; } CitusTableCacheEntry *modificationTableCacheEntry = GetCitusTableCacheEntry(distributedTableId); if (IsCitusTableTypeCacheEntry(modificationTableCacheEntry, CITUS_TABLE_WITH_NO_DIST_KEY)) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot router plan modification of a non-distributed table", NULL, NULL); } if (replicationModel && modificationTableCacheEntry->replicationModel != replicationModel) { return DeferredError(ERRCODE_FEATURE_NOT_SUPPORTED, "cannot route mixed replication models", NULL, NULL); } replicationModel = modificationTableCacheEntry->replicationModel; } } /* everything OK */ return NULL; } /* * get_all_actual_clauses * * Returns a list containing the bare clauses from 'restrictinfo_list'. 
* * This loses the distinction between regular and pseudoconstant clauses, * so be careful what you use it for. */ static List * get_all_actual_clauses(List *restrictinfo_list) { List *result = NIL; ListCell *l; foreach(l, restrictinfo_list) { RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); Assert(IsA(rinfo, RestrictInfo)); result = lappend(result, rinfo->clause); } return result; } /* * CompareInsertValuesByShardId does what it says in the name. Used for sorting * InsertValues objects by their shard. */ static int CompareInsertValuesByShardId(const void *leftElement, const void *rightElement) { InsertValues *leftValue = *((InsertValues **) leftElement); InsertValues *rightValue = *((InsertValues **) rightElement); int64 leftShardId = leftValue->shardId; int64 rightShardId = rightValue->shardId; Index leftIndex = leftValue->listIndex; Index rightIndex = rightValue->listIndex; if (leftShardId > rightShardId) { return 1; } else if (leftShardId < rightShardId) { return -1; } else { /* shard identifiers are the same, list index is secondary sort key */ if (leftIndex > rightIndex) { return 1; } else if (leftIndex < rightIndex) { return -1; } else { return 0; } } }
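/*
 * A worked example of the ordering CompareInsertValuesByShardId produces
 * (the shard ids below are hypothetical): InsertValues entries with
 * (shardId, listIndex) pairs (102009, 0), (102008, 1), (102009, 2) sort to
 * (102008, 1), (102009, 0), (102009, 2), that is, ascending by shard id with
 * the original row order preserved within each shard. This matches how
 * GroupInsertValuesByShardId uses it:
 *
 *     insertValuesList = SortList(insertValuesList,
 *                                 CompareInsertValuesByShardId);
 *
 * after which adjacent entries with equal shard ids are folded into a single
 * ModifyRoute.
 */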