From f526eec6a83a63103ba81338adfd61f8ababbb3b Mon Sep 17 00:00:00 2001 From: jeff-davis Date: Thu, 1 Apr 2021 12:27:28 -0700 Subject: [PATCH] Columnar: use clause Vars for chunk group filtering. (#4856) * Columnar: use clause Vars for chunk group filtering. This solves #4780 and also provides a cleaner separation between chunk group filtering and projection pushdown. * Columnar: sort and deduplicate Vars pulled from clauses. * Columnar: cleanup variable names. * Columnar: remove alternate test output. * Columnar: do not recurse when looking for whereClauseVars. Co-authored-by: Jeff Davis (cherry picked from commit 063e67303840cf09248502ae6c40a508824e4ac0) --- src/backend/columnar/cstore_reader.c | 75 +++++++++++++++++-- .../regress/input/am_chunk_filtering.source | 10 +++ .../regress/output/am_chunk_filtering.source | 14 ++++ 3 files changed, 92 insertions(+), 7 deletions(-) diff --git a/src/backend/columnar/cstore_reader.c b/src/backend/columnar/cstore_reader.c index 8caff6017..bc70f57ef 100644 --- a/src/backend/columnar/cstore_reader.c +++ b/src/backend/columnar/cstore_reader.c @@ -29,6 +29,7 @@ #else #include "optimizer/clauses.h" #include "optimizer/predtest.h" +#include "optimizer/var.h" #endif #include "optimizer/restrictinfo.h" #include "storage/fd.h" @@ -62,6 +63,8 @@ struct TableReadState List *projectedColumnList; List *whereClauseList; + List *whereClauseVars; + MemoryContext stripeReadContext; StripeBuffers *stripeBuffers; uint32 readStripeCount; @@ -77,6 +80,7 @@ static StripeBuffers * LoadFilteredStripeBuffers(Relation relation, TupleDesc tupleDescriptor, List *projectedColumnList, List *whereClauseList, + List *whereClauseVars, int64 *chunkGroupsFiltered); static void ReadStripeNextRow(StripeBuffers *stripeBuffers, List *projectedColumnList, uint64 chunkIndex, uint64 chunkRowIndex, @@ -87,10 +91,11 @@ static ColumnBuffers * LoadColumnBuffers(Relation relation, uint32 chunkCount, uint64 stripeOffset, Form_pg_attribute attributeForm); static bool * SelectedChunkMask(StripeSkipList *stripeSkipList, - List *projectedColumnList, List *whereClauseList, + List *whereClauseList, List *whereClauseVars, int64 *chunkGroupsFiltered); static List * BuildRestrictInfoList(List *whereClauseList); static Node * BuildBaseConstraint(Var *variable); +static List * GetClauseVars(List *clauses, int natts); static OpExpr * MakeOpExpression(Var *variable, int16 strategyNumber); static Oid GetOperatorByType(Oid typeId, Oid accessMethodId, int16 strategyNumber); static void UpdateConstraint(Node *baseConstraint, Datum minValue, Datum maxValue); @@ -142,6 +147,7 @@ ColumnarBeginRead(Relation relation, TupleDesc tupleDescriptor, readState->stripeList = stripeList; readState->projectedColumnList = projectedColumnList; readState->whereClauseList = whereClauseList; + readState->whereClauseVars = GetClauseVars(whereClauseList, tupleDescriptor->natts); readState->stripeBuffers = NULL; readState->readStripeCount = 0; readState->stripeReadRowCount = 0; @@ -218,6 +224,8 @@ ColumnarReadNextRow(TableReadState *readState, Datum *columnValues, bool *column projectedColumnList, readState-> whereClauseList, + readState-> + whereClauseVars, &readState-> chunkGroupsFiltered); readState->readStripeCount++; @@ -400,7 +408,8 @@ ColumnarTableRowCount(Relation relation) static StripeBuffers * LoadFilteredStripeBuffers(Relation relation, StripeMetadata *stripeMetadata, TupleDesc tupleDescriptor, List *projectedColumnList, - List *whereClauseList, int64 *chunkGroupsFiltered) + List *whereClauseList, List *whereClauseVars, + int64 *chunkGroupsFiltered) { uint32 columnIndex = 0; uint32 columnCount = tupleDescriptor->natts; @@ -412,8 +421,8 @@ LoadFilteredStripeBuffers(Relation relation, StripeMetadata *stripeMetadata, tupleDescriptor, stripeMetadata->chunkCount); - bool *selectedChunkMask = SelectedChunkMask(stripeSkipList, projectedColumnList, - whereClauseList, chunkGroupsFiltered); + bool *selectedChunkMask = SelectedChunkMask(stripeSkipList, whereClauseList, + whereClauseVars, chunkGroupsFiltered); StripeSkipList *selectedChunkSkipList = SelectedChunkSkipList(stripeSkipList, projectedColumnMask, @@ -551,8 +560,8 @@ LoadColumnBuffers(Relation relation, ColumnChunkSkipNode *chunkSkipNodeArray, * the chunk can be refuted by the given qualifier conditions. */ static bool * -SelectedChunkMask(StripeSkipList *stripeSkipList, List *projectedColumnList, - List *whereClauseList, int64 *chunkGroupsFiltered) +SelectedChunkMask(StripeSkipList *stripeSkipList, List *whereClauseList, + List *whereClauseVars, int64 *chunkGroupsFiltered) { ListCell *columnCell = NULL; uint32 chunkIndex = 0; @@ -561,7 +570,7 @@ SelectedChunkMask(StripeSkipList *stripeSkipList, List *projectedColumnList, bool *selectedChunkMask = palloc0(stripeSkipList->chunkCount * sizeof(bool)); memset(selectedChunkMask, true, stripeSkipList->chunkCount * sizeof(bool)); - foreach(columnCell, projectedColumnList) + foreach(columnCell, whereClauseVars) { Var *column = lfirst(columnCell); uint32 columnIndex = column->varattno - 1; @@ -693,6 +702,58 @@ BuildBaseConstraint(Var *variable) } +/* + * GetClauseVars extracts the Vars from the given clauses for the purpose of + * building constraints that can be refuted by predicate_refuted_by(). It also + * deduplicates and sorts them. + */ +static List * +GetClauseVars(List *whereClauseList, int natts) +{ + /* + * We don't recurse into or include aggregates, window functions, or + * PHVs. We don't expect any PHVs during execution; and Vars found inside + * an aggregate or window function aren't going to be useful in forming + * constraints that can be refuted. + */ + int flags = 0; + List *vars = pull_var_clause((Node *) whereClauseList, flags); + Var **deduplicate = palloc0(sizeof(Var *) * natts); + + ListCell *lc; + foreach(lc, vars) + { + Node *node = lfirst(lc); + Assert(IsA(node, Var)); + + Var *var = (Var *) node; + int idx = var->varattno - 1; + + if (deduplicate[idx] != NULL) + { + /* if they have the same varattno, the rest should be identical */ + Assert(equal(var, deduplicate[idx])); + } + + deduplicate[idx] = var; + } + + List *whereClauseVars = NIL; + for (int i = 0; i < natts; i++) + { + Var *var = deduplicate[i]; + if (var != NULL) + { + whereClauseVars = lappend(whereClauseVars, var); + } + } + + pfree(deduplicate); + + return whereClauseVars; +} + + /* * MakeOpExpression builds an operator expression node. This operator expression * implements the operator clause as defined by the variable and the strategy diff --git a/src/test/regress/input/am_chunk_filtering.source b/src/test/regress/input/am_chunk_filtering.source index 6b0e2e7da..0fa8bc18d 100644 --- a/src/test/regress/input/am_chunk_filtering.source +++ b/src/test/regress/input/am_chunk_filtering.source @@ -101,3 +101,13 @@ EXPLAIN (analyze on, costs off, timing off, summary off) SELECT count(*) FROM multi_column_chunk_filtering WHERE a > 50000 AND b > 50000; DROP TABLE multi_column_chunk_filtering; + +-- +-- https://github.com/citusdata/citus/issues/4780 +-- +create table part_table (id int) partition by range (id); +create table part_1_row partition of part_table for values from (150000) to (160000); +create table part_2_columnar partition of part_table for values from (0) to (150000) using columnar; +insert into part_table select generate_series(1,159999); +select filtered_row_count('select count(*) from part_table where id > 75000'); +drop table part_table; diff --git a/src/test/regress/output/am_chunk_filtering.source b/src/test/regress/output/am_chunk_filtering.source index 321d097eb..cc05fff7b 100644 --- a/src/test/regress/output/am_chunk_filtering.source +++ b/src/test/regress/output/am_chunk_filtering.source @@ -182,3 +182,17 @@ EXPLAIN (analyze on, costs off, timing off, summary off) (5 rows) DROP TABLE multi_column_chunk_filtering; +-- +-- https://github.com/citusdata/citus/issues/4780 +-- +create table part_table (id int) partition by range (id); +create table part_1_row partition of part_table for values from (150000) to (160000); +create table part_2_columnar partition of part_table for values from (0) to (150000) using columnar; +insert into part_table select generate_series(1,159999); +select filtered_row_count('select count(*) from part_table where id > 75000'); + filtered_row_count +--------------------------------------------------------------------- + 5000 +(1 row) + +drop table part_table;