/*-------------------------------------------------------------------------
 *
 * columnar_customscan.c
 *
 * This file contains the implementation of a postgres custom scan that
 * we use to push down the projections into the table access methods.
 *
 * $Id$
 *
 *-------------------------------------------------------------------------
 */

#include "citus_version.h"

#include "postgres.h"

#include "access/skey.h"
#include "nodes/extensible.h"
#include "nodes/pg_list.h"
#include "nodes/plannodes.h"
#include "optimizer/optimizer.h"
#include "optimizer/pathnode.h"
#include "optimizer/paths.h"
#include "optimizer/restrictinfo.h"
#include "utils/relcache.h"

#include "columnar/columnar_customscan.h"
#include "columnar/columnar_metadata.h"
#include "columnar/columnar_tableam.h"
#include "distributed/listutils.h"

/* planner-side node: a CustomPath, wrapped so planning state can be added */
typedef struct ColumnarScanPath
{
	CustomPath custom_path;

	/* place for local state during planning */
} ColumnarScanPath;

/* plan-side node: a CustomScan, wrapped so execution state can be added */
typedef struct ColumnarScanScan
{
	CustomScan custom_scan;

	/* place for local state during execution */
} ColumnarScanScan;

/* executor-side node: a CustomScanState plus the quals handed to the scan */
typedef struct ColumnarScanState
{
	CustomScanState custom_scanstate;

	/* quals passed to columnar_beginscan_extended; set only when pushdown is on */
	List *qual;
} ColumnarScanState;

/* callback deciding whether RemovePathsByPredicate drops a given path */
typedef bool (*PathPredicate)(Path *path);


static void ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
	RangeTblEntry *rte);
static void RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPredicate);
static bool IsNotIndexPath(Path *path);
static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel,
	RangeTblEntry *rte);
static Cost ColumnarScanCost(Oid relationId, int numberOfColumnsRead);
static Cost ColumnarPerStripeScanCost(Oid relationId, int numberOfColumnsRead);
static uint64 ColumnarTableStripeCount(Oid relationId);
static Plan * ColumnarScanPath_PlanCustomPath(PlannerInfo *root,
	RelOptInfo *rel,
	struct CustomPath *best_path,
	List *tlist,
	List *clauses,
	List *custom_plans);

static Node * ColumnarScan_CreateCustomScanState(CustomScan *cscan);

static void ColumnarScan_BeginCustomScan(CustomScanState *node, EState *estate,
	int eflags);
static TupleTableSlot * ColumnarScan_ExecCustomScan(CustomScanState *node);
static void ColumnarScan_EndCustomScan(CustomScanState *node);
static void ColumnarScan_ReScanCustomScan(CustomScanState *node);
static void ColumnarScan_ExplainCustomScan(CustomScanState *node, List *ancestors,
	ExplainState *es);

/* saved hook value in case of unload */
static set_rel_pathlist_hook_type PreviousSetRelPathlistHook = NULL;

/* backing variables for the GUCs registered in columnar_customscan_init */
static bool EnableColumnarCustomScan = true;
static bool EnableColumnarQualPushdown = true;


/* callbacks used while a ColumnarScan path is turned into a plan */
const struct CustomPathMethods ColumnarScanPathMethods = {
	.CustomName = "ColumnarScan",
	.PlanCustomPath = ColumnarScanPath_PlanCustomPath,
};

/* callbacks used while a ColumnarScan plan is turned into executor state */
const struct CustomScanMethods ColumnarScanScanMethods = {
	.CustomName = "ColumnarScan",
	.CreateCustomScanState = ColumnarScan_CreateCustomScanState,
};

/* callbacks used while a ColumnarScan is executed */
const struct CustomExecMethods ColumnarExecuteMethods = {
	.CustomName = "ColumnarScan",

	.BeginCustomScan = ColumnarScan_BeginCustomScan,
	.ExecCustomScan = ColumnarScan_ExecCustomScan,
	.EndCustomScan = ColumnarScan_EndCustomScan,
	.ReScanCustomScan = ColumnarScan_ReScanCustomScan,

	.ExplainCustomScan = ColumnarScan_ExplainCustomScan,
};


/*
 * columnar_customscan_init installs the hook required to intercept the postgres planner and
 * provide extra paths for columnar tables
 *
 * The previously installed set_rel_pathlist_hook is remembered so that
 * ColumnarSetRelPathlistHook can chain to it. The function also registers the
 * columnar.enable_custom_scan and columnar.enable_qual_pushdown GUCs and the
 * ColumnarScan custom scan methods.
 */
void
columnar_customscan_init(void)
{
	PreviousSetRelPathlistHook = set_rel_pathlist_hook;
	set_rel_pathlist_hook = ColumnarSetRelPathlistHook;

	/* register customscan specific GUC's */
	DefineCustomBoolVariable(
		"columnar.enable_custom_scan",
		gettext_noop("Enables the use of a custom scan to push projections and quals "
					 "into the storage layer."),
		NULL,
		&EnableColumnarCustomScan,
		true,
		PGC_USERSET,
		GUC_NO_SHOW_ALL,
		NULL, NULL, NULL);
	DefineCustomBoolVariable(
		"columnar.enable_qual_pushdown",
		gettext_noop("Enables qual pushdown into columnar. This has no effect unless "
					 "columnar.enable_custom_scan is true."),
		NULL,
		&EnableColumnarQualPushdown,
		true,
		PGC_USERSET,
		GUC_NO_SHOW_ALL,
		NULL, NULL, NULL);

	RegisterCustomScanMethods(&ColumnarScanScanMethods);
}


/*
 * ColumnarSetRelPathlistHook is the set_rel_pathlist_hook installed by
 * columnar_customscan_init.
 *
 * It first calls the previously installed hook, if there was one. After that
 * it only acts on range table entries that are plain relations with a valid
 * relid and without inheritance expansion (rte->inh unset). When such a
 * relation uses the columnar table access method, the hook:
 *  - raises an error for TABLESAMPLE scans,
 *  - clears rel->partial_pathlist,
 *  - and, if columnar.enable_custom_scan is on, removes every non-index path
 *    and adds a ColumnarScan custom path.
 */
static void
ColumnarSetRelPathlistHook(PlannerInfo *root, RelOptInfo *rel, Index rti,
						   RangeTblEntry *rte)
{
	/* call into previous hook if assigned */
	if (PreviousSetRelPathlistHook)
	{
		PreviousSetRelPathlistHook(root, rel, rti, rte);
	}

	if (!OidIsValid(rte->relid) || rte->rtekind != RTE_RELATION || rte->inh)
	{
		/* some calls to the pathlist hook don't have a valid relation set. Do nothing */
		return;
	}

	/*
	 * Here we want to inspect if this relation pathlist hook is accessing a columnar table.
	 * If that is the case we want to insert an extra path that pushes down the projection
	 * into the scan of the table to minimize the data read.
	 */
	Relation relation = RelationIdGetRelation(rte->relid);
	if (!RelationIsValid(relation))
	{
		/* RelationIdGetRelation can return NULL; never dereference it blindly */
		ereport(ERROR, (errmsg("could not open relation with OID %u", rte->relid)));
	}

	if (relation->rd_tableam == GetColumnarTableAmRoutine())
	{
		if (rte->tablesample != NULL)
		{
			ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
							errmsg("sample scans not supported on columnar tables")));
		}

		/* columnar doesn't support parallel paths */
		rel->partial_pathlist = NIL;

		if (EnableColumnarCustomScan)
		{
			Path *customPath = CreateColumnarScanPath(root, rel, rte);

			ereport(DEBUG1, (errmsg("pathlist hook for columnar table am")));

			/*
			 * TODO: Since we don't have a proper costing model for
			 * ColumnarCustomScan, we remove other paths to force postgres
			 * using ColumnarCustomScan. Note that we still keep index paths
			 * since they still might be useful.
			 */
			RemovePathsByPredicate(rel, IsNotIndexPath);
			add_path(rel, customPath);
		}
	}

	RelationClose(relation);
}


/*
 * RemovePathsByPredicate removes the paths that removePathPredicate
 * evaluates to true from pathlist of given rel.
*/ static void RemovePathsByPredicate(RelOptInfo *rel, PathPredicate removePathPredicate) { List *filteredPathList = NIL; Path *path = NULL; foreach_ptr(path, rel->pathlist) { if (!removePathPredicate(path)) { filteredPathList = lappend(filteredPathList, path); } } rel->pathlist = filteredPathList; } /* * IsNotIndexPath returns true if given path is not an IndexPath. */ static bool IsNotIndexPath(Path *path) { return !IsA(path, IndexPath); } static Path * CreateColumnarScanPath(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { ColumnarScanPath *cspath = (ColumnarScanPath *) newNode(sizeof(ColumnarScanPath), T_CustomPath); /* * popuate custom path information */ CustomPath *cpath = &cspath->custom_path; cpath->methods = &ColumnarScanPathMethods; /* * populate generic path information */ Path *path = &cpath->path; path->pathtype = T_CustomScan; path->parent = rel; path->pathtarget = rel->reltarget; /* columnar scans are not parallel-aware, but they are parallel-safe */ path->parallel_safe = rel->consider_parallel; /* * We don't support pushing join clauses into the quals of a seqscan, but * it could still have required parameterization due to LATERAL refs in * its tlist. */ path->param_info = get_baserel_parampathinfo(root, rel, rel->lateral_relids); /* * Add cost estimates for a columnar table scan, row count is the rows estimated by * postgres' planner. */ path->rows = rel->rows; path->startup_cost = 0; int numberOfColumnsRead = bms_num_members(rte->selectedCols); path->total_cost = path->startup_cost + ColumnarScanCost(rte->relid, numberOfColumnsRead); return (Path *) cspath; } /* * ColumnarScanCost calculates the cost of scanning the columnar table. The cost is estimated * by using all stripe metadata to estimate based on the columns to read how many pages * need to be read. 
*/
static Cost
ColumnarScanCost(Oid relationId, int numberOfColumnsRead)
{
	/* total cost = number of stripes * cost of scanning one stripe */
	return ColumnarTableStripeCount(relationId) *
		   ColumnarPerStripeScanCost(relationId, numberOfColumnsRead);
}


/*
 * ColumnarPerStripeScanCost calculates the cost to scan a single stripe
 * of given columnar table based on number of columns that needs to be
 * read during scan operation.
 *
 * The estimate is the summed dataLength of all stripes expressed in BLCKSZ
 * units, scaled by numberOfColumnsRead over the largest columnCount found in
 * any stripe, and then divided by the number of stripes. Returns 0 when no
 * stripe reports a column. Raises an error when the relation cannot be
 * opened.
 */
static Cost
ColumnarPerStripeScanCost(Oid relationId, int numberOfColumnsRead)
{
	Relation relation = RelationIdGetRelation(relationId);
	if (!RelationIsValid(relation))
	{
		/* RelationIdGetRelation can return NULL; never dereference it blindly */
		ereport(ERROR, (errmsg("could not open relation with OID %u", relationId)));
	}

	List *stripeList = StripesForRelfilenode(relation->rd_node);
	RelationClose(relation);

	uint32 maxColumnCount = 0;
	uint64 totalStripeSize = 0;
	StripeMetadata *stripeMetadata = NULL;
	foreach_ptr(stripeMetadata, stripeList)
	{
		totalStripeSize += stripeMetadata->dataLength;
		maxColumnCount = Max(maxColumnCount, stripeMetadata->columnCount);
	}

	/*
	 * When no stripes are in the table we don't have a count in maxColumnCount. To
	 * prevent a division by zero turning into a NaN we keep the ratio on zero.
	 * This will result in a cost of 0 for scanning the table which is a reasonable
	 * cost on an empty table.
	 */
	if (maxColumnCount == 0)
	{
		return 0;
	}

	/* maxColumnCount > 0 implies at least one stripe, so the divisions are safe */
	double columnSelectionRatio = numberOfColumnsRead / (double) maxColumnCount;
	Cost tableScanCost = (double) totalStripeSize / BLCKSZ * columnSelectionRatio;
	Cost perStripeScanCost = tableScanCost / list_length(stripeList);

	return perStripeScanCost;
}


/*
 * ColumnarTableStripeCount returns the number of stripes that columnar
 * table with relationId has by using stripe metadata.
*/ static uint64 ColumnarTableStripeCount(Oid relationId) { Relation relation = RelationIdGetRelation(relationId); List *stripeList = StripesForRelfilenode(relation->rd_node); int stripeCount = list_length(stripeList); RelationClose(relation); return stripeCount; } static Plan * ColumnarScanPath_PlanCustomPath(PlannerInfo *root, RelOptInfo *rel, struct CustomPath *best_path, List *tlist, List *clauses, List *custom_plans) { ColumnarScanScan *plan = (ColumnarScanScan *) newNode(sizeof(ColumnarScanScan), T_CustomScan); CustomScan *cscan = &plan->custom_scan; cscan->methods = &ColumnarScanScanMethods; /* Reduce RestrictInfo list to bare expressions; ignore pseudoconstants */ clauses = extract_actual_clauses(clauses, false); cscan->scan.plan.targetlist = list_copy(tlist); cscan->scan.plan.qual = clauses; cscan->scan.scanrelid = best_path->path.parent->relid; return (Plan *) plan; } static Node * ColumnarScan_CreateCustomScanState(CustomScan *cscan) { ColumnarScanState *columnarScanState = (ColumnarScanState *) newNode( sizeof(ColumnarScanState), T_CustomScanState); CustomScanState *cscanstate = &columnarScanState->custom_scanstate; cscanstate->methods = &ColumnarExecuteMethods; if (EnableColumnarQualPushdown) { columnarScanState->qual = cscan->scan.plan.qual; } return (Node *) cscanstate; } static void ColumnarScan_BeginCustomScan(CustomScanState *cscanstate, EState *estate, int eflags) { /* scan slot is already initialized */ } static Bitmapset * ColumnarAttrNeeded(ScanState *ss) { TupleTableSlot *slot = ss->ss_ScanTupleSlot; int natts = slot->tts_tupleDescriptor->natts; Bitmapset *attr_needed = NULL; Plan *plan = ss->ps.plan; int flags = PVC_RECURSE_AGGREGATES | PVC_RECURSE_WINDOWFUNCS | PVC_RECURSE_PLACEHOLDERS; List *vars = list_concat(pull_var_clause((Node *) plan->targetlist, flags), pull_var_clause((Node *) plan->qual, flags)); ListCell *lc; foreach(lc, vars) { Var *var = lfirst(lc); if (var->varattno < 0) { ereport(ERROR, 
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg( "UPDATE and CTID scans not supported for ColumnarScan"))); } if (var->varattno == 0) { elog(DEBUG1, "Need attribute: all"); /* all attributes are required, we don't need to add more so break*/ attr_needed = bms_add_range(attr_needed, 0, natts - 1); break; } elog(DEBUG1, "Need attribute: %d", var->varattno); attr_needed = bms_add_member(attr_needed, var->varattno - 1); } return attr_needed; } static TupleTableSlot * ColumnarScanNext(ColumnarScanState *columnarScanState) { CustomScanState *node = (CustomScanState *) columnarScanState; /* * get information from the estate and scan state */ TableScanDesc scandesc = node->ss.ss_currentScanDesc; EState *estate = node->ss.ps.state; ScanDirection direction = estate->es_direction; TupleTableSlot *slot = node->ss.ss_ScanTupleSlot; if (scandesc == NULL) { /* the columnar access method does not use the flags, they are specific to heap */ uint32 flags = 0; Bitmapset *attr_needed = ColumnarAttrNeeded(&node->ss); /* * We reach here if the scan is not parallel, or if we're serially * executing a scan that was planned to be parallel. 
*/ scandesc = columnar_beginscan_extended(node->ss.ss_currentRelation, estate->es_snapshot, 0, NULL, NULL, flags, attr_needed, columnarScanState->qual); bms_free(attr_needed); node->ss.ss_currentScanDesc = scandesc; } /* * get the next tuple from the table */ if (table_scan_getnextslot(scandesc, direction, slot)) { return slot; } return NULL; } /* * SeqRecheck -- access method routine to recheck a tuple in EvalPlanQual */ static bool ColumnarScanRecheck(ColumnarScanState *node, TupleTableSlot *slot) { return true; } static TupleTableSlot * ColumnarScan_ExecCustomScan(CustomScanState *node) { return ExecScan(&node->ss, (ExecScanAccessMtd) ColumnarScanNext, (ExecScanRecheckMtd) ColumnarScanRecheck); } static void ColumnarScan_EndCustomScan(CustomScanState *node) { /* * get information from node */ TableScanDesc scanDesc = node->ss.ss_currentScanDesc; /* * Free the exprcontext */ ExecFreeExprContext(&node->ss.ps); /* * clean out the tuple table */ if (node->ss.ps.ps_ResultTupleSlot) { ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); } ExecClearTuple(node->ss.ss_ScanTupleSlot); /* * close heap scan */ if (scanDesc != NULL) { table_endscan(scanDesc); } } static void ColumnarScan_ReScanCustomScan(CustomScanState *node) { TableScanDesc scanDesc = node->ss.ss_currentScanDesc; if (scanDesc != NULL) { table_rescan(node->ss.ss_currentScanDesc, NULL); } } static void ColumnarScan_ExplainCustomScan(CustomScanState *node, List *ancestors, ExplainState *es) { TableScanDesc scanDesc = node->ss.ss_currentScanDesc; if (scanDesc != NULL) { int64 chunkGroupsFiltered = ColumnarScanChunkGroupsFiltered(scanDesc); ExplainPropertyInteger("Columnar Chunk Groups Removed by Filter", NULL, chunkGroupsFiltered, es); } }