Added custom schema, FindShardSplitPoints and ErrorOnConcurrentOperation

pull/7013/head
Shabnam Khan 2023-06-19 13:55:02 +05:30
parent f9a5be59b9
commit 8bac212941
9 changed files with 8065 additions and 1529 deletions

configure (vendored, 3052 lines changed)

File diff suppressed because it is too large

configure~ (executable file, 6164 lines changed)

File diff suppressed because it is too large


@@ -0,0 +1,329 @@
#include "postgres.h"
#include "libpq-fe.h"
#include "executor/spi.h"
#include "distributed/lock_graph.h"
#include "distributed/coordinator_protocol.h"
#include "distributed/metadata_cache.h"
#include "distributed/metadata_utility.h"
#include "distributed/multi_logical_replication.h"
#include "distributed/multi_server_executor.h"
#include "distributed/pg_dist_rebalance_strategy.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/reference_table_utils.h"
#include "distributed/remote_commands.h"
#include "distributed/resource_lock.h"
#include "distributed/tuplestore.h"
#include "distributed/utils/array_type.h"
#include "distributed/worker_protocol.h"
#include "nodes/pg_list.h"
#include "postmaster/postmaster.h"
#include "distributed/distribution_column.h"
#include "utils/builtins.h"
#include "distributed/shard_split.h"
PG_FUNCTION_INFO_V1(citus_auto_shard_split_start);
int MaxShardSize = 104857600;
typedef struct ShardInfoData
{
int64 shardsize;
int64 shardminvalue;
int64 shardmaxvalue;
int64 shardid;
int64 nodeid;
char *tablename;
char *distributionColumn;
char *datatype;
char *shardname;
Oid tableId;
Oid distributionColumnId;
}ShardInfoData;
typedef ShardInfoData *ShardInfo;
/*
 * ErrorOnConcurrentOperation errors out if a rebalance or an automatic shard
 * split is already running as a background job.
 */
void
ErrorOnConcurrentOperation()
{
int64 jobId = 0;
if (HasNonTerminalJobOfType("rebalance", &jobId))
{
ereport(ERROR, (
errmsg("A rebalance is already running as job %ld", jobId),
errdetail("A rebalance was already scheduled as background job"),
errhint("To monitor progress, run: SELECT * FROM "
"citus_rebalance_status();")));
}
if (HasNonTerminalJobOfType("Automatic Shard Split", &jobId))
{
ereport(ERROR, (
errmsg("An automatic shard split is already running as job %ld", jobId),
errdetail("An automatic shard split was already scheduled as background job")));
}
}
/*
 * GetShardSplitQuery builds the citus_split_shard_by_split_points() command
 * for the given shard, listing the computed split points and placing every
 * resulting shard on the shard's current node.
 */
StringInfo
GetShardSplitQuery(ShardInfo shardinfo, List *SplitPoints)
{
StringInfo splitQuery = makeStringInfo();
int64 length = list_length(SplitPoints);
appendStringInfo(splitQuery,"SELECT citus_split_shard_by_split_points(%ld, ARRAY[",shardinfo->shardid);
for (int i =0; i < length-1; i++)
{
appendStringInfo(splitQuery,"'%ld',",DatumGetInt64(list_nth(SplitPoints,i)));
}
appendStringInfo(splitQuery,"'%ld'], ARRAY[",DatumGetInt64(list_nth(SplitPoints,length-1)));
for (int i =0; i < length; i++)
{
appendStringInfo(splitQuery,"%ld,",shardinfo->nodeid);
}
appendStringInfo(splitQuery,"%ld], 'block_writes')",shardinfo->nodeid);
return splitQuery;
}
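/*
 * For illustration only: for a hypothetical shard 102008 living on node 1
 * with split points 1000 and 2000, GetShardSplitQuery above produces a
 * command of the form
 * SELECT citus_split_shard_by_split_points(102008, ARRAY['1000','2000'], ARRAY[1,1,1], 'block_writes')
 */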
/*
 * ExecuteSplitBackgroundJob schedules the shard split command as a background
 * task under the given job, restricted to the node that currently holds the shard.
 */
void
ExecuteSplitBackgroundJob(int64 jobid, ShardInfo shardinfo, List *SplitPoints)
{
StringInfo splitQuery = GetShardSplitQuery(shardinfo, SplitPoints);
ereport(LOG, (errmsg("%s", splitQuery->data)));
int32 nodesInvolved[1];
nodesInvolved[0] = shardinfo->nodeid;
Oid superUserId = CitusExtensionOwner();
ErrorOnConcurrentOperation();
BackgroundTask *task = ScheduleBackgroundTask(jobid, superUserId, splitQuery->data, 0,
NULL, 1, nodesInvolved);
}
/*
 * ExecuteAvgHashQuery samples at most roughly 10GB of the shard's table and
 * returns the average hash value of the distribution column over the rows
 * that fall within the shard's hash range.
 */
int64
ExecuteAvgHashQuery(ShardInfo shardinfo)
{
StringInfo AvgHashQuery = makeStringInfo();
appendStringInfo(AvgHashQuery, "SELECT avg(h)::int,count(*)"
" FROM (SELECT worker_hash(%s) h FROM %s TABLESAMPLE SYSTEM(least(10, 100*10000000000/citus_total_relation_size(%s)))"
" WHERE worker_hash(%s)>=%ld AND worker_hash(%s)<=%ld) s",
shardinfo->distributionColumn, shardinfo->tablename,quote_literal_cstr(shardinfo->tablename),
shardinfo->distributionColumn, shardinfo->shardminvalue,
shardinfo->distributionColumn, shardinfo->shardmaxvalue
);
ereport(LOG, errmsg("%s", AvgHashQuery->data));
SPI_connect();
SPI_exec(AvgHashQuery->data, 0);
SPITupleTable *tupletable = SPI_tuptable;
HeapTuple tuple = tupletable->vals[0];
bool isnull;
Datum average = SPI_getbinval(tuple, tupletable->tupdesc, 1, &isnull);
/* if the sample produced no rows, fall back to a value below the shard's range */
int64 averageHashValue = isnull ? shardinfo->shardminvalue - 1 : DatumGetInt64(average);
SPI_freetuptable(tupletable);
SPI_finish();
return averageHashValue;
}
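/*
 * For illustration only: for a hypothetical table dist_table distributed on
 * column tenant_id, with a shard covering the hash range [-2147483648, -1],
 * ExecuteAvgHashQuery above builds a query of the form
 * SELECT avg(h)::int,count(*)
 * FROM (SELECT worker_hash(tenant_id) h FROM dist_table
 * TABLESAMPLE SYSTEM(least(10, 100*10000000000/citus_total_relation_size('dist_table')))
 * WHERE worker_hash(tenant_id)>=-2147483648 AND worker_hash(tenant_id)<=-1) s
 */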
/*
 * FindShardSplitPoints decides whether a shard should be split to isolate a
 * tenant or split two ways at the average hash value. If a common value
 * (tenant) is found in the shard, split points that isolate it are returned;
 * otherwise a single split point at the average hash value is returned.
 */
List *
FindShardSplitPoints(ShardInfo shardinfo)
{
StringInfo CommonValueQuery = makeStringInfo();
appendStringInfo(CommonValueQuery,
"SELECT shardid , unnest(result::%s[]) from run_command_on_shards(%s,$$SELECT array_agg(val)"
" FROM pg_stats s , unnest(most_common_vals::text::%s[],most_common_freqs) as res(val,freq)"
" WHERE tablename = %s AND attname = %s AND freq > 0.2 $$)"
" WHERE result <> '' AND shardid = %ld;",
shardinfo->datatype, quote_literal_cstr(shardinfo->tablename), shardinfo->datatype,
quote_literal_cstr(shardinfo->shardname),
quote_literal_cstr(shardinfo->distributionColumn), shardinfo->shardid);
ereport(LOG, errmsg("%s", CommonValueQuery->data));
List *SplitPoints = NULL;
MemoryContext originalContext = CurrentMemoryContext;
SPI_connect();
SPI_exec(CommonValueQuery->data, 0);
MemoryContext spiContext = CurrentMemoryContext;
int64 rowCount = SPI_processed;
int64 average,hashedValue;
ereport(LOG, errmsg("%ld", rowCount));
if (rowCount > 0)
{
SPITupleTable *tupletable = SPI_tuptable;
CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(shardinfo->tableId);
for (int rowIndex = 0; rowIndex < rowCount; rowIndex++)
{
HeapTuple tuple = tupletable->vals[rowIndex];
char *commonValue = SPI_getvalue(tuple, tupletable->tupdesc, 2);
ereport(LOG, errmsg("%s", commonValue));
Datum tenantIdDatum = StringToDatum(commonValue, shardinfo->distributionColumnId);
Datum hashedValueDatum = FunctionCall1Coll(cacheEntry->hashFunction,
cacheEntry->partitionColumn->varcollid,
tenantIdDatum);
hashedValue = DatumGetInt32(hashedValueDatum);
ereport(LOG,errmsg("%ld",hashedValue));
MemoryContextSwitchTo(originalContext);
if (hashedValue == shardinfo->shardminvalue)
{
SplitPoints = lappend(SplitPoints, Int64GetDatum(hashedValue));
}
else if (hashedValue == shardinfo->shardmaxvalue)
{
SplitPoints = lappend(SplitPoints, Int64GetDatum(hashedValue - 1));
}
else
{
SplitPoints = lappend(SplitPoints, Int64GetDatum(hashedValue - 1));
SplitPoints = lappend(SplitPoints, Int64GetDatum(hashedValue));
}
MemoryContextSwitchTo(spiContext);
}
SPI_freetuptable(tupletable);
}
else
{
average = ExecuteAvgHashQuery(shardinfo);
ereport(LOG, errmsg("%ld", average));
}
SPI_finish();
if (rowCount > 0)
{
list_sort(SplitPoints, list_int_cmp);
}
else if (shardinfo->shardminvalue <= average)
{
SplitPoints = lappend(SplitPoints, Int64GetDatum(average));
}
return SplitPoints;
}
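/*
 * For illustration only: for a hypothetical shard covering the hash range
 * [0, 1073741823] whose most common value hashes to 5000, FindShardSplitPoints
 * above returns [4999, 5000], so the shard is split three ways and the middle
 * shard contains only that tenant. When no common value is found, a single
 * split point at the sampled average hash value is returned, giving a two-way
 * split.
 */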
/*
 * ScheduleShardSplit computes the split points for the given shard and is
 * meant to schedule the split as a background job; for now it only logs the
 * split command it would run.
 */
void
ScheduleShardSplit(ShardInfo shardinfo)
{
List *SplitPoints = FindShardSplitPoints(shardinfo);
if (list_length(SplitPoints) > 0)
{
/* int64 jobId = CreateBackgroundJob("Automatic Shard Split", "Split using SplitPoints List"); */
ereport(LOG, (errmsg("%s", GetShardSplitQuery(shardinfo, SplitPoints)->data)));
}
else
{
ereport(LOG, (errmsg("No split points found for shard split")));
}
}
Datum
citus_auto_shard_split_start(PG_FUNCTION_ARGS)
{
StringInfo query = makeStringInfo();
/*
 * This query groups shards by colocation id and shardminvalue, keeps the
 * groups whose total shard size exceeds the threshold, and picks the largest
 * shard of each such group. To gather all the required attributes,
 * pg_dist_shard is joined with citus_shards, then with pg_dist_node, then
 * with the grouped size computation, and finally with citus_tables.
 */
appendStringInfo(
query,
" SELECT cs.shardid,pd.shardminvalue,pd.shardmaxvalue,cs.shard_size,pn.nodeid,ct.distribution_column,ct.table_name,cs.shard_name,(SELECT relname FROM pg_class WHERE oid = (ct.table_name::regclass)::oid) "
" FROM pg_catalog.pg_dist_shard pd JOIN pg_catalog.citus_shards cs ON pd.shardid = cs.shardid JOIN pg_catalog.pg_dist_node pn ON cs.nodename = pn.nodename AND cs.nodeport= pn.nodeport"
" JOIN"
" ( select shardid , max_size from (SELECT distinct first_value(shardid) OVER w as shardid, sum(shard_size) OVER (PARTITION BY colocation_id, shardminvalue) as total_sum, max(shard_size) OVER w as max_size"
" FROM citus_shards cs JOIN pg_dist_shard ps USING(shardid)"
" WINDOW w AS (PARTITION BY colocation_id, shardminvalue ORDER BY shard_size DESC) )as t where total_sum >= %ld )"
" AS max_sizes ON cs.shardid=max_sizes.shardid AND cs.shard_size = max_sizes.max_size JOIN citus_tables ct ON cs.table_name = ct.table_name AND pd.shardminvalue <> pd.shardmaxvalue AND pd.shardminvalue <> ''",
0
);
ereport(LOG, errmsg("%s", query->data));
if (SPI_connect() != SPI_OK_CONNECT)
{
elog(ERROR, "SPI_connect to the query failed");
}
if (SPI_exec(query->data, 0) != SPI_OK_SELECT)
{
elog(ERROR, "SPI_exec for the execution failed");
}
SPITupleTable *tupletable = SPI_tuptable;
int rowCount = SPI_processed;
bool isnull;
for (int rowIndex = 0; rowIndex < rowCount; rowIndex++)
{
ShardInfoData shardinfo;
HeapTuple tuple = tupletable->vals[rowIndex];
Datum shardId = SPI_getbinval(tuple, tupletable->tupdesc, 1, &isnull);
shardinfo.shardid = DatumGetInt64(shardId);
Datum shardSize = SPI_getbinval(tuple, tupletable->tupdesc, 4, &isnull);
shardinfo.shardsize = DatumGetInt64(shardSize);
Datum nodeId = SPI_getbinval(tuple, tupletable->tupdesc, 5, &isnull);
shardinfo.nodeid = DatumGetInt64(nodeId);
char *shardMinVal = SPI_getvalue(tuple, tupletable->tupdesc, 2);
shardinfo.shardminvalue = strtoi64(shardMinVal, NULL, 10);
char *shardMaxVal = SPI_getvalue(tuple, tupletable->tupdesc, 3);
shardinfo.shardmaxvalue = strtoi64(shardMaxVal, NULL, 10);
shardinfo.distributionColumn = SPI_getvalue(tuple, tupletable->tupdesc, 6);
shardinfo.tablename = SPI_getvalue(tuple, tupletable->tupdesc, 7);
StringInfo shardnameQuery = makeStringInfo();
appendStringInfo(shardnameQuery,"%s_%ld",SPI_getvalue(tuple,tupletable->tupdesc,9),shardinfo.shardid);
shardinfo.shardname = shardnameQuery->data;
Datum tableIdDatum = SPI_getbinval(tuple, tupletable->tupdesc, 7, &isnull);
shardinfo.tableId = DatumGetObjectId(tableIdDatum);
shardinfo.distributionColumnId = ColumnTypeIdForRelationColumnName(shardinfo.tableId,
shardinfo.
distributionColumn);
shardinfo.datatype = format_type_be(shardinfo.distributionColumnId);
char * shardSplitMode;
// Oid shardTransferModeOid = PG_GETARG_OID(0);
// Datum enumLabelDatum = DirectFunctionCall1(enum_out, shardTransferModeOid);
// char *enumLabel = DatumGetCString(enumLabelDatum);
// ereport(LOG,errmsg("%s",enumLabel));
ScheduleShardSplit(&shardinfo);
ereport(LOG, (errmsg(
"Shard ID: %ld,ShardMinValue: %ld, ShardMaxValue: %ld , totalSize: %ld , nodeId: %ld",
shardinfo.shardid, shardinfo.shardminvalue,
shardinfo.shardmaxvalue,
shardinfo.shardsize, shardinfo.nodeid)));
}
SPI_freetuptable(tupletable);
SPI_finish();
PG_RETURN_VOID();
}
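
Once the CreateBackgroundJob call that is still commented out above is wired up, the scheduled work could be inspected through the existing background-job catalogs. A minimal sketch, assuming the pg_dist_background_job and pg_dist_background_task catalogs and the "Automatic Shard Split" job type used above:

SELECT j.job_id, j.state, t.status, t.command
FROM pg_dist_background_job j
JOIN pg_dist_background_task t USING (job_id)
WHERE j.job_type = 'Automatic Shard Split';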


@@ -2261,6 +2261,16 @@ RegisterCitusConfigVariables(void)
 GUC_STANDARD,
 NULL, NULL, NULL);
 
+DefineCustomIntVariable(
+"citus.max_shard_size",
+gettext_noop("Sets the max size of a Shard"),
+NULL,
+&MaxShardSize,
+104857600, 102400, INT32_MAX,
+PGC_USERSET,
+GUC_STANDARD,
+NULL, NULL, NULL);
+
 DefineCustomIntVariable(
 "citus.shard_replication_factor",
 gettext_noop("Sets the replication factor for shards."),


@@ -1,3 +1,4 @@
 -- citus--11.3-1--12.0-1
+#include "udfs/citus_auto_shard_split_start/12.0-1.sql"
 
 -- bump version to 12.0-1


@@ -0,0 +1,13 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_auto_shard_split_start(
)
RETURNS VOID
AS 'MODULE_PATHNAME'
LANGUAGE C VOLATILE;
COMMENT ON FUNCTION pg_catalog.citus_auto_shard_split_start()
IS 'automatically split the necessary shards in the cluster in the background';
GRANT EXECUTE ON FUNCTION pg_catalog.citus_auto_shard_split_start() TO PUBLIC;


@@ -0,0 +1,14 @@
CREATE OR REPLACE FUNCTION pg_catalog.citus_auto_shard_split_start(
)
RETURNS VOID
AS 'MODULE_PATHNAME'
LANGUAGE C VOLATILE;
COMMENT ON FUNCTION pg_catalog.citus_auto_shard_split_start()
IS 'automatically split the necessary shards in the cluster in the background';
GRANT EXECUTE ON FUNCTION pg_catalog.citus_auto_shard_split_start() TO PUBLIC;
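
A minimal usage sketch, assuming the UDF above and the citus.max_shard_size GUC registered in this commit are installed (the threshold value below is hypothetical, and the C code still passes a hard-coded threshold of 0 to its shard-selection query, so the GUC is not consulted yet):

SET citus.max_shard_size TO 209715200;
SELECT citus_auto_shard_split_start();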


@@ -46,12 +46,12 @@
 /* Define to 1 if you have the `zstd' library (-lzstd). */
 #undef HAVE_LIBZSTD
 
-/* Define to 1 if you have the <memory.h> header file. */
-#undef HAVE_MEMORY_H
-
 /* Define to 1 if you have the <stdint.h> header file. */
 #undef HAVE_STDINT_H
 
+/* Define to 1 if you have the <stdio.h> header file. */
+#undef HAVE_STDIO_H
+
 /* Define to 1 if you have the <stdlib.h> header file. */
 #undef HAVE_STDLIB_H
 
@@ -94,5 +94,7 @@
 /* The size of `void *', as computed by sizeof. */
 #undef SIZEOF_VOID_P
 
-/* Define to 1 if you have the ANSI C header files. */
+/* Define to 1 if all of the C90 standard headers exist (not just the ones
+   required in a freestanding environment). This macro is provided for
+   backward compatibility; new code need not use it. */
 #undef STDC_HEADERS


@@ -214,6 +214,7 @@ extern int ShardCount;
 extern int ShardReplicationFactor;
 extern int NextShardId;
 extern int NextPlacementId;
+extern int MaxShardSize;
 
 extern bool IsCoordinator(void);