Add citus_find_shard_split_points as a UDF and call it from ScheduleShardSplit

pull/7013/head
Shabnam Khan 2023-06-27 12:58:41 +05:30
parent e9d807c117
commit de57bd1240
4 changed files with 184 additions and 134 deletions

View File

@ -14,10 +14,13 @@
#include "distributed/listutils.h"
#include "distributed/metadata_utility.h"
#include "distributed/background_jobs.h"
#include "distributed/multi_join_order.h"
#include "distributed/citus_ruleutils.h"
PG_FUNCTION_INFO_V1(citus_auto_shard_split_start);
PG_FUNCTION_INFO_V1(citus_find_shard_split_points);
/*
 * Size threshold of the size-based split policy: citus_find_shard_split_points
 * returns NULL when the shard group size is below MaxShardSize * 1024.
 */
uint64 MaxShardSize = 102400;
int64 MaxShardSize = 102400;
/*
 * Frequency threshold used when looking for tenants: only most-common values of
 * the distribution column whose pg_stats frequency is greater than this value
 * are selected.
 */
double TenantFrequency = 0.3;
/*
@ -27,27 +30,19 @@ double TenantFrequency = 0.3;
/* Per-shard information gathered for a shard that is a candidate for an automatic split. */
typedef struct ShardInfoData
{
/* shard size value; reported as "totalSize" in the DEBUG4 log output */
int64 shardSize;
/* lower bound of the shard's hash range (parsed from shardminvalue) */
int64 shardMinValue;
/* upper bound of the shard's hash range (parsed from shardmaxvalue) */
int64 shardMaxValue;
/* id of the shard; first argument of citus_split_shard_by_split_points */
int64 shardId;
/* node id read from pg_dist_node; used as the target node of the split shards */
int32 nodeId;
/* relation name interpolated into the FROM clause of the sampling query */
char *tableName;
/* name of the distribution column */
char *distributionColumn;
/* formatted type name of the distribution column (result of format_type_be) */
char *dataType;
/* relation name with the shard id appended by AppendShardIdToName */
char *shardName;
/* OID of the table the shard belongs to */
Oid tableId;
/* type OID of the distribution column (from ColumnTypeIdForRelationColumnName) */
Oid distributionColumnId;
/* summed shard_size of the shard group (total_sum); passed to citus_find_shard_split_points */
int64 shardGroupSize;
}ShardInfoData;
/* Pointer alias used wherever shard information is passed between functions. */
typedef ShardInfoData *ShardInfo;
/* Errors out if a concurrent automatic shard split or rebalance operation is running. */
void ErrorOnConcurrentOperation(void);
/* Builds the citus_split_shard_by_split_points query for the given split points. */
StringInfo GetShardSplitQuery(ShardInfo shardinfo, List *splitPoints,
StringInfo GetShardSplitQuery(ShardInfo shardinfo, Datum datum,
char *shardSplitMode);
/* Creates a background job that runs the shard split query. */
void ExecuteSplitBackgroundJob(int64 jobId, ShardInfo shardinfo, List *splitPoints,
void ExecuteSplitBackgroundJob(int64 jobId, ShardInfo shardinfo, Datum datum,
char *shardSplitMode);
/* Returns the average hash value of a sample of the shard's rows, or shardMinValue - 1 if there is none. */
int64 ExecuteAverageHashQuery(ShardInfo shardinfo);
/* Finds the split points of a shard (tenant isolation points or the average hash value). */
List * FindShardSplitPoints(ShardInfo shardinfo);
/* Looks up split points and schedules the split; returns 1 if a split was scheduled, 0 otherwise. */
int64 ScheduleShardSplit(ShardInfo shardinfo, char *shardSplitMode , int64 jobId);
List * FindShardSplitPoints(int64 shardId);
int64 ScheduleShardSplit(ShardInfo shardinfo, char *shardSplitMode, int64 jobId);
/*
* Throws an error if an automatic shard split or a rebalance operation is already running concurrently.
@ -79,32 +74,34 @@ ErrorOnConcurrentOperation()
* Builds the SQL query that splits the shard at the given split points.
*/
StringInfo
GetShardSplitQuery(ShardInfo shardinfo, List *splitPoints, char *shardSplitMode)
GetShardSplitQuery(ShardInfo shardinfo, Datum datum, char *shardSplitMode)
{
StringInfo splitQuery = makeStringInfo();
int64 length = list_length(splitPoints);
ArrayType *array = DatumGetArrayTypeP(datum);
Datum *values;
int nelems;
deconstruct_array(array,
INT4OID,
sizeof(int32), true, TYPALIGN_INT,
&values, NULL, &nelems);
appendStringInfo(splitQuery, "SELECT citus_split_shard_by_split_points(%ld, ARRAY[",
shardinfo->shardId);
int32 splitpoint = 0;
uint64 index = 0;
foreach_int(splitpoint, splitPoints)
for (int i = 0; i < nelems; i++)
{
appendStringInfo(splitQuery, "'%d'", splitpoint);
appendStringInfo(splitQuery, "'%d'", values[i]);
if (index < length - 1)
if (i < nelems - 1)
{
appendStringInfoString(splitQuery, ",");
}
index++;
}
/* All shards produced by the split are placed on the same node. */
appendStringInfo(splitQuery, "], ARRAY[");
for (int i = 0; i < length; i++)
for (int i = 0; i < nelems; i++)
{
appendStringInfo(splitQuery, "%d,", shardinfo->nodeId);
}
@ -119,11 +116,12 @@ GetShardSplitQuery(ShardInfo shardinfo, List *splitPoints, char *shardSplitMode)
* Creates a background job for citus_split_shard_by_split_points so that the split runs in the background.
*/
void
ExecuteSplitBackgroundJob(int64 jobId, ShardInfo shardinfo, List *splitPoints,
ExecuteSplitBackgroundJob(int64 jobId, ShardInfo shardinfo, Datum datum,
char *shardSplitMode)
{
StringInfo splitQuery = makeStringInfo();
splitQuery = GetShardSplitQuery(shardinfo, splitPoints, shardSplitMode);
splitQuery = GetShardSplitQuery(shardinfo, datum, shardSplitMode);
/* ereport(LOG, (errmsg(splitQuery->data))); */
int32 nodesInvolved[] = { shardinfo->nodeId };
Oid superUserId = CitusExtensionOwner();
@ -132,58 +130,37 @@ ExecuteSplitBackgroundJob(int64 jobId, ShardInfo shardinfo, List *splitPoints,
}
/*
 * ExecuteAverageHashQuery returns the average hash value of a sample of the
 * rows of the given shard, to be used as the split point of a two-way split.
 *
 * The sample is taken from shardinfo->tableName with TABLESAMPLE SYSTEM, using
 * least(10, 100*10000000000/tableSize) as the sampling percentage, i.e. at most
 * 10 percent of the table and roughly at most 10GB of data. Only rows whose
 * worker_hash() value lies within [shardMinValue, shardMaxValue] are averaged.
 *
 * Returns the average hash value if the sample contained at least one matching
 * row. Otherwise (empty table or empty sample) shardMinValue - 1 is returned,
 * which callers treat as "no split point".
 *
 * Raises an ERROR if the query cannot be executed through SPI.
 */
int64
ExecuteAverageHashQuery(ShardInfo shardinfo)
{
	/* sentinel that lies just outside of the shard's hash range */
	const int64 noSplitPoint = shardinfo->shardMinValue - 1;

	uint64 tableSize = 0;
	bool sizeKnown = DistributedTableSize(shardinfo->tableId, TOTAL_RELATION_SIZE, true,
										  &tableSize);

	/*
	 * The table size is the divisor of the sampling percentage; an empty
	 * table has nothing to average and would make the query fail with a
	 * division by zero.
	 */
	if (!sizeKnown || tableSize == 0)
	{
		return noSplitPoint;
	}

	/* the column name is stored unquoted, so quote it before using it as SQL */
	const char *quotedColumn = quote_identifier(shardinfo->distributionColumn);

	StringInfo avgHashQuery = makeStringInfo();
	appendStringInfo(avgHashQuery, "SELECT avg(h)::int,count(*)"
								   " FROM (SELECT worker_hash(%s) h FROM %s TABLESAMPLE SYSTEM(least(10, 100*10000000000/%llu))"
								   " WHERE worker_hash(%s)>=%lld AND worker_hash(%s)<=%lld) s",
					 quotedColumn, shardinfo->tableName,
					 (unsigned long long) tableSize,
					 quotedColumn, (long long) shardinfo->shardMinValue,
					 quotedColumn, (long long) shardinfo->shardMaxValue);
	ereport(DEBUG4, errmsg("%s", avgHashQuery->data));

	if (SPI_connect() < 0)
	{
		ereport(ERROR, errmsg("could not connect to SPI manager"));
	}

	int spiResult = SPI_exec(avgHashQuery->data, 0);
	SPITupleTable *tupletable = SPI_tuptable;
	if (spiResult < 0 || tupletable == NULL)
	{
		ereport(ERROR, errmsg("could not compute the average hash value of shard %lld",
							  (long long) shardinfo->shardId));
	}

	/* an aggregate query without GROUP BY always yields exactly one row */
	bool isnull = false;
	Datum averageDatum = SPI_getbinval(tupletable->vals[0], tupletable->tupdesc, 1,
									   &isnull);

	/* avg(h)::int is an int4, so it has to be read with DatumGetInt32 */
	int64 result = isnull ? noSplitPoint : (int64) DatumGetInt32(averageDatum);

	SPI_freetuptable(tupletable);
	SPI_finish();

	return result;
}
/*
* Decides how a shard should be split: by isolating tenants or by a two-way split at the average hash value.
* If frequent tenants are found, the split points that isolate them are returned; otherwise the average hash value is returned.
*/
List *
FindShardSplitPoints(ShardInfo shardinfo)
Datum
citus_find_shard_split_points(PG_FUNCTION_ARGS)
{
int64 shardId = PG_GETARG_INT64(0);
int64 shardGroupSize = PG_GETARG_INT64(1);
ereport(DEBUG4, errmsg("%ld", shardGroupSize));
/* Size-based policy: only shards whose group size is at least MaxShardSize * 1024 are considered; for smaller ones NULL is returned. */
if (shardGroupSize < MaxShardSize * 1024)
{
PG_RETURN_NULL();
}
/* Look up all the shard information needed below from the shardId. */
Oid tableId = RelationIdForShard(shardId);
char *distributionColumnName = ColumnToColumnName(tableId,
(Node *) DistPartitionKeyOrError(
tableId));
char *dataType = format_type_be(ColumnTypeIdForRelationColumnName(
tableId,
distributionColumnName));
char *shardName = get_rel_name(tableId);
AppendShardIdToName(&shardName, shardId);
ShardInterval *shardrange = LoadShardInterval(shardId);
int64 shardMinValue = shardrange->minValue;
int64 shardMaxValue = shardrange->maxValue;
char *tableName = generate_qualified_relation_name(tableId);
StringInfo CommonValueQuery = makeStringInfo();
/*
@ -196,14 +173,14 @@ FindShardSplitPoints(ShardInfo shardinfo)
" FROM pg_stats s , unnest(most_common_vals::text::%s[],most_common_freqs) as res(val,freq)"
" WHERE tablename = %s AND attname = %s AND schemaname = %s AND freq > %lf $$)"
" WHERE result <> '' AND shardid = %ld;",
shardinfo->dataType, quote_literal_cstr(shardinfo->tableName),
shardinfo->dataType,
quote_literal_cstr(shardinfo->shardName),
quote_literal_cstr(shardinfo->distributionColumn),
dataType, quote_literal_cstr(tableName),
dataType,
quote_literal_cstr(shardName),
quote_literal_cstr(distributionColumnName),
quote_literal_cstr(get_namespace_name(get_rel_namespace(
shardinfo->tableId))),
tableId))),
TenantFrequency,
shardinfo->shardId);
shardId);
ereport(DEBUG4, errmsg("%s", CommonValueQuery->data));
List *splitPoints = NULL;
@ -218,7 +195,6 @@ FindShardSplitPoints(ShardInfo shardinfo)
MemoryContext spiContext = CurrentMemoryContext;
int64 rowCount = SPI_processed;
int64 average;
int32 hashedValue;
ereport(DEBUG4, errmsg("%ld", rowCount));
@ -230,14 +206,15 @@ FindShardSplitPoints(ShardInfo shardinfo)
* and the resulting is then sorted and returned.
*/
SPITupleTable *tupletable = SPI_tuptable;
CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(shardinfo->tableId);
CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(tableId);
for (int rowIndex = 0; rowIndex < rowCount; rowIndex++)
{
HeapTuple tuple = tupletable->vals[rowIndex];
char *commonValue = SPI_getvalue(tuple, tupletable->tupdesc, 2);
ereport(DEBUG4, errmsg("%s", commonValue));
Datum tenantIdDatum = StringToDatum(commonValue,
shardinfo->distributionColumnId);
ColumnTypeIdForRelationColumnName(tableId,
distributionColumnName));
Datum hashedValueDatum = FunctionCall1Coll(cacheEntry->hashFunction,
cacheEntry->partitionColumn->
varcollid,
@ -248,11 +225,11 @@ FindShardSplitPoints(ShardInfo shardinfo)
/*Switching the memory context to store the unique SplitPoints in a list*/
MemoryContextSwitchTo(originalContext);
if (hashedValue == shardinfo->shardMinValue)
if (hashedValue == shardMinValue)
{
splitPoints = list_append_unique_int(splitPoints, hashedValue);
}
else if (hashedValue == shardinfo->shardMaxValue)
else if (hashedValue == shardMaxValue)
{
splitPoints = list_append_unique_int(splitPoints, hashedValue - 1);
}
@ -268,17 +245,64 @@ FindShardSplitPoints(ShardInfo shardinfo)
}
else
{
average = ExecuteAverageHashQuery(shardinfo);
StringInfo AvgHashQuery = makeStringInfo();
uint64 tableSize = 0;
bool check = DistributedTableSize(tableId, TOTAL_RELATION_SIZE, true,
&tableSize);
/*
* Run a query that computes the average hash value of the shard over a sample of its rows, limited to roughly 10GB.
* If an average hash value exists it becomes the split point; otherwise no split point is added and NULL is returned.
*/
appendStringInfo(AvgHashQuery, "SELECT avg(h)::int,count(*)"
" FROM (SELECT worker_hash(%s) h FROM %s TABLESAMPLE SYSTEM(least(10, 100*10000000000/%lu))"
" WHERE worker_hash(%s)>=%ld AND worker_hash(%s)<=%ld) s",
distributionColumnName, tableName,
tableSize,
distributionColumnName, shardMinValue,
distributionColumnName, shardMaxValue
);
ereport(DEBUG4, errmsg("%s", AvgHashQuery->data));
SPI_connect();
SPI_exec(AvgHashQuery->data, 0);
SPITupleTable *tupletable = SPI_tuptable;
HeapTuple tuple = tupletable->vals[0];
bool isnull;
Datum average = SPI_getbinval(tuple, tupletable->tupdesc, 1, &isnull);
int64 isResultNull = 1;
if (!isnull)
{
isResultNull = 0;
}
SPI_freetuptable(tupletable);
SPI_finish();
if (isResultNull == 0)
{
ereport(DEBUG4, errmsg("%ld", average));
MemoryContextSwitchTo(originalContext);
if (shardinfo->shardMinValue <= average)
{
splitPoints = lappend_int(splitPoints, average);
splitPoints = lappend_int(splitPoints, DatumGetInt32(average));
}
}
SPI_finish();
return splitPoints;
/* Convert the list of split points into a Datum array so that it can be turned into an ArrayType. */
Datum *elements = (Datum *) palloc(sizeof(Datum) * list_length(splitPoints));
int32 splitPoint, index = 0;
foreach_int(splitPoint, splitPoints)
{
elements[index++] = Int32GetDatum(splitPoint);
}
ArrayType *resultArray = construct_array(elements, list_length(splitPoints), INT4OID,
sizeof(int32), true, TYPALIGN_INT);
if (list_length(splitPoints) == 0)
{
PG_RETURN_NULL();
}
PG_RETURN_ARRAYTYPE_P(resultArray);
}
@ -287,19 +311,33 @@ FindShardSplitPoints(ShardInfo shardinfo)
* split and then executes the background job for the shard split.
*/
int64
ScheduleShardSplit(ShardInfo shardinfo, char *shardSplitMode , int64 jobId)
ScheduleShardSplit(ShardInfo shardinfo, char *shardSplitMode, int64 jobId)
{
List *splitPoints = FindShardSplitPoints(shardinfo);
if (list_length(splitPoints) > 0)
/*
 * Ask the citus_find_shard_split_points UDF for the split points of this
 * shard by running it as a query through SPI, passing the shard id and the
 * size of its shard group.
 */
SPI_connect();
StringInfo findSplitPointsQuery = makeStringInfo();
appendStringInfo(findSplitPointsQuery,
"SELECT citus_find_shard_split_points(%ld , %ld)",
shardinfo->shardId,
shardinfo->shardGroupSize);
SPI_exec(findSplitPointsQuery->data, 0);
SPITupleTable *tupletable = SPI_tuptable;
/* only the first column of the first result row is read */
HeapTuple tuple = tupletable->vals[0];
bool isnull;
Datum resultDatum = SPI_getbinval(tuple, tupletable->tupdesc, 1, &isnull);
/* a non-NULL result is the array of split points; NULL means there is nothing to split */
if (!isnull)
{
ereport(DEBUG4, errmsg("%s", GetShardSplitQuery(shardinfo, splitPoints,
ereport(DEBUG4, errmsg("%s", GetShardSplitQuery(shardinfo, resultDatum,
shardSplitMode)->data));
ExecuteSplitBackgroundJob(jobId, shardinfo, splitPoints, shardSplitMode);
ExecuteSplitBackgroundJob(jobId, shardinfo, resultDatum, shardSplitMode);
/* resultDatum is no longer needed once the background job has been created */
SPI_finish();
/* 1 = a split was scheduled; the caller adds this to its count */
return 1;
}
else
{
ereport(LOG, errmsg("No Splitpoints for shard split"));
SPI_finish();
/* 0 = no split was scheduled for this shard */
return 0;
}
}
@ -323,17 +361,16 @@ citus_auto_shard_split_start(PG_FUNCTION_ARGS)
appendStringInfo(
query,
" SELECT cs.shardid,pd.shardminvalue,pd.shardmaxvalue,cs.shard_size,pn.nodeid,ct.distribution_column,ct.table_name,cs.shard_name,(SELECT relname FROM pg_class WHERE oid = ct.table_name)"
" SELECT cs.shardid,pd.shardminvalue,pd.shardmaxvalue,cs.shard_size,pn.nodeid, max_sizes.total_sum"
" FROM pg_catalog.pg_dist_shard pd JOIN pg_catalog.citus_shards cs ON pd.shardid = cs.shardid JOIN pg_catalog.pg_dist_node pn ON cs.nodename = pn.nodename AND cs.nodeport= pn.nodeport"
" JOIN"
" ( select shardid , max_size from (SELECT distinct first_value(shardid) OVER w as shardid, sum(shard_size) OVER (PARTITION BY colocation_id, shardminvalue) as total_sum, max(shard_size) OVER w as max_size"
" ( select shardid , max_size , total_sum from (SELECT distinct first_value(shardid) OVER w as shardid, sum(shard_size) OVER (PARTITION BY colocation_id, shardminvalue) as total_sum, max(shard_size) OVER w as max_size"
" FROM citus_shards cs JOIN pg_dist_shard ps USING(shardid)"
" WINDOW w AS (PARTITION BY colocation_id, shardminvalue ORDER BY shard_size DESC) )as t where total_sum >= %lu )"
" AS max_sizes ON cs.shardid=max_sizes.shardid AND cs.shard_size = max_sizes.max_size JOIN citus_tables ct ON cs.table_name = ct.table_name AND pd.shardminvalue <> pd.shardmaxvalue AND pd.shardminvalue <> ''",
MaxShardSize*1024
" WINDOW w AS (PARTITION BY colocation_id, shardminvalue ORDER BY shard_size DESC))as t)"
" AS max_sizes ON cs.shardid=max_sizes.shardid AND cs.shard_size = max_sizes.max_size AND pd.shardminvalue <> pd.shardmaxvalue AND pd.shardminvalue <> ''"
);
ereport(DEBUG4 ,errmsg("%s", query->data));
ereport(DEBUG4, errmsg("%s", query->data));
Oid shardTransferModeOid = PG_GETARG_OID(0);
Datum enumLabelDatum = DirectFunctionCall1(enum_out, shardTransferModeOid);
char *shardSplitMode = DatumGetCString(enumLabelDatum);
@ -368,37 +405,16 @@ citus_auto_shard_split_start(PG_FUNCTION_ARGS)
Datum nodeIdDatum = SPI_getbinval(tuple, tupletable->tupdesc, 5, &isnull);
shardinfo.nodeId = DatumGetInt32(nodeIdDatum);
char *shardMinVal = SPI_getvalue(tuple, tupletable->tupdesc, 2);
shardinfo.shardMinValue = strtoi64(shardMinVal, NULL, 10);
char *shardGroupSizeValue = SPI_getvalue(tuple, tupletable->tupdesc, 6);
shardinfo.shardGroupSize = strtoi64(shardGroupSizeValue, NULL, 10);
char *shardMaxVal = SPI_getvalue(tuple, tupletable->tupdesc, 3);
shardinfo.shardMaxValue = strtoi64(shardMaxVal, NULL, 10);
shardinfo.distributionColumn = SPI_getvalue(tuple, tupletable->tupdesc, 6);
shardinfo.tableName = SPI_getvalue(tuple, tupletable->tupdesc, 7);
shardinfo.shardName = SPI_getvalue(tuple, tupletable->tupdesc, 9);
AppendShardIdToName(&shardinfo.shardName, shardinfo.shardId);
Datum tableIdDatum = SPI_getbinval(tuple, tupletable->tupdesc, 7, &isnull);
shardinfo.tableId = DatumGetObjectId(tableIdDatum);
shardinfo.distributionColumnId = ColumnTypeIdForRelationColumnName(
shardinfo.tableId,
shardinfo.
distributionColumn);
shardinfo.dataType = format_type_be(shardinfo.distributionColumnId);
count = count + ScheduleShardSplit(&shardinfo, shardSplitMode , jobId);
ereport(DEBUG4, (errmsg(
"Shard ID: %ld,ShardMinValue: %ld, ShardMaxValue: %ld , totalSize: %ld , nodeId: %d",
shardinfo.shardId, shardinfo.shardMinValue,
shardinfo.shardMaxValue,
shardinfo.shardSize, shardinfo.nodeId)));
count = count + ScheduleShardSplit(&shardinfo, shardSplitMode, jobId);
}
SPI_freetuptable(tupletable);
SPI_finish();
if(count==0){
if (count == 0)
{
DirectFunctionCall1(citus_job_cancel, Int64GetDatum(jobId));
}

View File

@ -13,3 +13,20 @@ COMMENT ON FUNCTION pg_catalog.citus_auto_shard_split_start(citus.shard_transfer
IS 'automatically split the necessary shards in the cluster in the background';
GRANT EXECUTE ON FUNCTION pg_catalog.citus_auto_shard_split_start(citus.shard_transfer_mode) TO PUBLIC;
-- Returns the split points of the given shard as an int4 array, or NULL when
-- the shard group is below the size threshold or no split point was found.
-- The C implementation returns a single array value (it is not a set-returning
-- function) and builds the array from int4 elements, so the result type is
-- integer[]. STRICT is required because the C code reads both arguments
-- without checking them for NULL.
CREATE OR REPLACE FUNCTION pg_catalog.citus_find_shard_split_points(
    shard_id bigint,
    shard_group_size bigint
)
RETURNS integer[]
AS 'MODULE_PATHNAME'
LANGUAGE C VOLATILE STRICT;
COMMENT ON FUNCTION pg_catalog.citus_find_shard_split_points(shard_id bigint , shard_group_size bigint)
IS 'creates split points for shards';
GRANT EXECUTE ON FUNCTION pg_catalog.citus_find_shard_split_points(shard_id bigint , shard_group_size bigint) TO PUBLIC;

View File

@ -13,3 +13,20 @@ COMMENT ON FUNCTION pg_catalog.citus_auto_shard_split_start(citus.shard_transfer
IS 'automatically split the necessary shards in the cluster in the background';
GRANT EXECUTE ON FUNCTION pg_catalog.citus_auto_shard_split_start(citus.shard_transfer_mode) TO PUBLIC;
-- Returns the split points of the given shard as an int4 array, or NULL when
-- the shard group is below the size threshold or no split point was found.
-- The C implementation returns a single array value (it is not a set-returning
-- function) and builds the array from int4 elements, so the result type is
-- integer[]. STRICT is required because the C code reads both arguments
-- without checking them for NULL.
CREATE OR REPLACE FUNCTION pg_catalog.citus_find_shard_split_points(
    shard_id bigint,
    shard_group_size bigint
)
RETURNS integer[]
AS 'MODULE_PATHNAME'
LANGUAGE C VOLATILE STRICT;
COMMENT ON FUNCTION pg_catalog.citus_find_shard_split_points(shard_id bigint , shard_group_size bigint)
IS 'creates split points for shards';
GRANT EXECUTE ON FUNCTION pg_catalog.citus_find_shard_split_points(shard_id bigint , shard_group_size bigint) TO PUBLIC;

View File

@ -214,7 +214,7 @@ extern int ShardCount;
extern int ShardReplicationFactor;
extern int NextShardId;
extern int NextPlacementId;
extern uint64 MaxShardSize;
extern int64 MaxShardSize;
extern double TenantFrequency;