/*-------------------------------------------------------------------------
 *
 * metadata_cache.c
 *    Distributed table metadata cache
 *
 * Copyright (c) Citus Data, Inc.
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "libpq-fe.h"
#include "miscadmin.h"
#include "stdint.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/sysattr.h"
#include "access/xact.h"
#include "catalog/index.h"
#include "catalog/indexing.h"
#include "catalog/pg_am.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_enum.h"
#include "catalog/pg_extension.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_type.h"
#include "commands/dbcommands.h"
#include "commands/extension.h"
#include "commands/trigger.h"
#include "common/hashfn.h"
#include "executor/executor.h"
#include "nodes/makefuncs.h"
#include "nodes/memnodes.h"
#include "nodes/pg_list.h"
#include "parser/parse_func.h"
#include "parser/parse_type.h"
#include "storage/lmgr.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/catcache.h"
#include "utils/datum.h"
#include "utils/elog.h"
#include "utils/fmgroids.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
#include "utils/jsonb.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/palloc.h"
#include "utils/rel.h"
#include "utils/relmapper.h"
#include "utils/resowner.h"
#include "utils/syscache.h"
#include "utils/typcache.h"

#include "citus_version.h"
#include "pg_version_compat.h"
#include "pg_version_constants.h"

#include "distributed/backend_data.h"
#include "distributed/citus_depended_object.h"
#include "distributed/citus_ruleutils.h"
#include "distributed/colocation_utils.h"
#include "distributed/connection_management.h"
#include "distributed/foreign_key_relationship.h"
#include "distributed/function_utils.h"
#include "distributed/listutils.h"
#include "distributed/metadata/pg_dist_object.h"
#include "distributed/metadata_cache.h"
#include "distributed/metadata_utility.h"
#include "distributed/multi_executor.h"
#include "distributed/multi_physical_planner.h"
#include "distributed/pg_dist_local_group.h"
#include "distributed/pg_dist_node.h"
#include "distributed/pg_dist_node_metadata.h"
#include "distributed/pg_dist_partition.h"
#include "distributed/pg_dist_placement.h"
#include "distributed/pg_dist_shard.h"
#include "distributed/remote_commands.h"
#include "distributed/shardinterval_utils.h"
#include "distributed/shared_library_init.h"
#include "distributed/utils/array_type.h"
#include "distributed/utils/function.h"
#include "distributed/version_compat.h"
#include "distributed/worker_manager.h"
#include "distributed/worker_protocol.h"

#if PG_VERSION_NUM < PG_VERSION_16
#include "utils/relfilenodemap.h"
#endif

/* user configuration */
int ReadFromSecondaries = USE_SECONDARY_NODES_NEVER;


/*
 * CitusTableCacheEntrySlot is the entry type for DistTableCacheHash.
 * The entry data outlives the slot on invalidation, so it requires
 * indirection.
 */
typedef struct CitusTableCacheEntrySlot
{
    /* lookup key - must be first. A pg_class.oid value. */
    Oid relationId;

    /* Citus table metadata (NULL for local tables) */
    CitusTableCacheEntry *citusTableMetadata;

    /*
     * If isValid is false, we need to recheck whether the relation ID
     * belongs to a Citus table or not.
     */
    bool isValid;
} CitusTableCacheEntrySlot;


/*
 * ShardIdCacheEntry is the entry type for ShardIdCacheHash.
 *
 * This should never be used outside of this file. Use ShardInterval instead.
 */
typedef struct ShardIdCacheEntry
{
    /* hash key, needs to be first */
    uint64 shardId;

    /* pointer to the table entry to which this shard currently belongs */
    CitusTableCacheEntry *tableEntry;

    /* index of the shard interval in the sortedShardIntervalArray of the table entry */
    int shardIndex;
} ShardIdCacheEntry;

/*
 * ExtensionCreatedState tracks whether the citus extension has been created
 * using the CREATE EXTENSION command.
 * UNKNOWN    : MetadataCache is invalid, so the state is unknown.
 * CREATED    : Citus is created.
 * NOTCREATED : Citus is not created.
 */
typedef enum ExtensionCreatedState
{
    UNKNOWN = 0,
    CREATED = 1,
    NOTCREATED = 2,
} ExtensionCreatedState;

/*
 * State which should be cleared upon DROP EXTENSION. When the configuration
 * changes, e.g. because the extension is dropped, these summarily get set to 0.
 */
typedef struct MetadataCacheData
{
    ExtensionCreatedState extensionCreatedState;
    Oid distShardRelationId;
    Oid distPlacementRelationId;
    Oid distBackgroundJobRelationId;
    Oid distBackgroundJobPKeyIndexId;
    Oid distBackgroundJobJobIdSequenceId;
    Oid distBackgroundTaskRelationId;
    Oid distBackgroundTaskPKeyIndexId;
    Oid distBackgroundTaskJobIdTaskIdIndexId;
    Oid distBackgroundTaskStatusTaskIdIndexId;
    Oid distBackgroundTaskTaskIdSequenceId;
    Oid distBackgroundTaskDependRelationId;
    Oid distBackgroundTaskDependTaskIdIndexId;
    Oid distBackgroundTaskDependDependsOnIndexId;
    Oid citusJobStatusScheduledId;
    Oid citusJobStatusRunningId;
    Oid citusJobStatusCancellingId;
    Oid citusJobStatusFinishedId;
    Oid citusJobStatusCancelledId;
    Oid citusJobStatusFailedId;
    Oid citusJobStatusFailingId;
    Oid citusTaskStatusBlockedId;
    Oid citusTaskStatusRunnableId;
    Oid citusTaskStatusRunningId;
    Oid citusTaskStatusDoneId;
    Oid citusTaskStatusErrorId;
    Oid citusTaskStatusUnscheduledId;
    Oid citusTaskStatusCancelledId;
    Oid citusTaskStatusCancellingId;
    Oid distRebalanceStrategyRelationId;
    Oid distNodeRelationId;
    Oid distNodeNodeIdIndexId;
    Oid distLocalGroupRelationId;
    Oid distObjectRelationId;
    Oid distObjectPrimaryKeyIndexId;
    Oid distCleanupRelationId;
    Oid distCleanupPrimaryKeyIndexId;
    Oid distColocationRelationId;
    Oid distColocationConfigurationIndexId;
    Oid distPartitionRelationId;
    Oid distTenantSchemaRelationId;
    Oid distPartitionLogicalRelidIndexId;
    Oid distPartitionColocationidIndexId;
    Oid distShardLogicalRelidIndexId;
    Oid distShardShardidIndexId;
    Oid distPlacementShardidIndexId;
    Oid distPlacementPlacementidIndexId;
    Oid distColocationidIndexId;
    Oid distPlacementGroupidIndexId;
    Oid distTransactionRelationId;
    Oid distTransactionGroupIndexId;
    Oid distTenantSchemaPrimaryKeyIndexId;
    Oid distTenantSchemaUniqueColocationIdIndexId;
    Oid citusCatalogNamespaceId;
    Oid copyFormatTypeId;
    Oid readIntermediateResultFuncId;
    Oid readIntermediateResultArrayFuncId;
    Oid extraDataContainerFuncId;
    Oid workerHashFunctionId;
    Oid anyValueFunctionId;
    Oid textSendAsJsonbFunctionId;
    Oid textoutFunctionId;
    Oid extensionOwner;
    Oid binaryCopyFormatId;
    Oid textCopyFormatId;
    Oid primaryNodeRoleId;
    Oid secondaryNodeRoleId;
    Oid pgTableIsVisibleFuncId;
    Oid citusTableIsVisibleFuncId;
    Oid distAuthinfoRelationId;
    Oid distAuthinfoIndexId;
    Oid distPoolinfoRelationId;
    Oid distPoolinfoIndexId;
    Oid relationIsAKnownShardFuncId;
    Oid jsonbExtractPathFuncId;
    Oid jsonbExtractPathTextFuncId;
    Oid CitusDependentObjectFuncId;
    Oid distClockLogicalSequenceId;
    bool databaseNameValid;
    char databaseName[NAMEDATALEN];
} MetadataCacheData;


static MetadataCacheData MetadataCache;

/* Citus extension version variables */
bool EnableVersionChecks = true; /* version checks are enabled */

static bool citusVersionKnownCompatible = false;

/* Variable to determine if we are in the process of creating citus */
static int CreateCitusTransactionLevel = 0;

/* Hash table for information about each partition */
static HTAB *DistTableCacheHash = NULL;
static List *DistTableCacheExpired = NIL;

/* Hash table for information about each shard */
static HTAB *ShardIdCacheHash = NULL;

static MemoryContext MetadataCacheMemoryContext = NULL;

/* Hash table for information about each object */
static HTAB *DistObjectCacheHash = NULL;

/* Hash table for information about worker nodes */
static HTAB *WorkerNodeHash = NULL;
static WorkerNode **WorkerNodeArray = NULL;
static int WorkerNodeCount = 0;
static bool workerNodeHashValid = false;

/* default value is -1, for the coordinator it's 0 and for worker nodes > 0 */
static int32 LocalGroupId = -1;

/* default value is -1, increases with every node starting from 1 */
static int32 LocalNodeId = -1;

/* built first time through in InitializeDistCache */
static ScanKeyData DistPartitionScanKey[1];
static ScanKeyData DistShardScanKey[1];
static ScanKeyData DistObjectScanKey[3];


/* local function forward declarations */
static HeapTuple PgDistPartitionTupleViaCatalog(Oid relationId);
static ShardIdCacheEntry * LookupShardIdCacheEntry(int64 shardId, bool missingOk);
static CitusTableCacheEntry * BuildCitusTableCacheEntry(Oid relationId);
static void BuildCachedShardList(CitusTableCacheEntry *cacheEntry);
static void PrepareWorkerNodeCache(void);
static bool CheckInstalledVersion(int elevel);
static char * AvailableExtensionVersion(void);
static char * InstalledExtensionVersion(void);
static bool CitusHasBeenLoadedInternal(void);
static void InitializeCaches(void);
static void InitializeDistCache(void);
static void InitializeDistObjectCache(void);
static void InitializeWorkerNodeCache(void);
static void RegisterForeignKeyGraphCacheCallbacks(void);
static void RegisterWorkerNodeCacheCallbacks(void);
static void RegisterLocalGroupIdCacheCallbacks(void);
static void RegisterAuthinfoCacheCallbacks(void);
static void RegisterCitusTableCacheEntryReleaseCallbacks(void);
static void ResetCitusTableCacheEntry(CitusTableCacheEntry *cacheEntry);
static void RemoveStaleShardIdCacheEntries(CitusTableCacheEntry *tableEntry);
static void CreateDistTableCache(void);
static void CreateShardIdCache(void);
static void CreateDistObjectCache(void);
static void InvalidateForeignRelationGraphCacheCallback(Datum argument, Oid relationId);
static void InvalidateNodeRelationCacheCallback(Datum argument, Oid relationId);
static void InvalidateLocalGroupIdRelationCacheCallback(Datum argument, Oid relationId);
static void InvalidateConnParamsCacheCallback(Datum argument, Oid relationId);
static void CitusTableCacheEntryReleaseCallback(ResourceReleasePhase phase, bool isCommit,
                                                bool isTopLevel, void *arg);
static HeapTuple LookupDistPartitionTuple(Relation pgDistPartition, Oid relationId);
static void GetPartitionTypeInputInfo(char *partitionKeyString, char partitionMethod,
                                      Oid *columnTypeId, int32 *columnTypeMod,
                                      Oid *intervalTypeId, int32 *intervalTypeMod);
static void CachedNamespaceLookup(const char *nspname, Oid *cachedOid);
static void CachedRelationLookup(const char *relationName, Oid *cachedOid);
static void CachedRelationLookupExtended(const char *relationName, Oid *cachedOid,
                                         bool missing_ok);
static void CachedRelationNamespaceLookup(const char *relationName, Oid relnamespace,
                                          Oid *cachedOid);
static void CachedRelationNamespaceLookupExtended(const char *relationName,
                                                  Oid relnamespace, Oid *cachedOid,
                                                  bool missing_ok);
static ShardPlacement * ResolveGroupShardPlacement(
    GroupShardPlacement *groupShardPlacement, CitusTableCacheEntry *tableEntry,
    int shardIndex);
static Oid LookupEnumValueId(Oid typeId, char *valueName);
static void InvalidateCitusTableCacheEntrySlot(CitusTableCacheEntrySlot *cacheSlot);
static void InvalidateDistTableCache(void);
static void InvalidateDistObjectCache(void);
static bool InitializeTableCacheEntry(int64 shardId, bool missingOk);
static bool IsCitusTableTypeInternal(char partitionMethod, char replicationModel,
                                     uint32 colocationId, CitusTableType tableType);
static bool RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry,
                                            bool missingOk);

static Oid DistAuthinfoRelationId(void);
static Oid DistAuthinfoIndexId(void);
static Oid DistPoolinfoRelationId(void);
static Oid DistPoolinfoIndexId(void);

/* exports for SQL callable functions */
PG_FUNCTION_INFO_V1(citus_dist_partition_cache_invalidate);
PG_FUNCTION_INFO_V1(master_dist_partition_cache_invalidate);
PG_FUNCTION_INFO_V1(citus_dist_shard_cache_invalidate);
PG_FUNCTION_INFO_V1(master_dist_shard_cache_invalidate);
PG_FUNCTION_INFO_V1(citus_dist_placement_cache_invalidate);
PG_FUNCTION_INFO_V1(master_dist_placement_cache_invalidate);
PG_FUNCTION_INFO_V1(citus_dist_node_cache_invalidate);
PG_FUNCTION_INFO_V1(master_dist_node_cache_invalidate);
PG_FUNCTION_INFO_V1(citus_dist_local_group_cache_invalidate);
PG_FUNCTION_INFO_V1(master_dist_local_group_cache_invalidate);
PG_FUNCTION_INFO_V1(citus_conninfo_cache_invalidate);
PG_FUNCTION_INFO_V1(master_dist_authinfo_cache_invalidate);
PG_FUNCTION_INFO_V1(citus_dist_object_cache_invalidate);
PG_FUNCTION_INFO_V1(master_dist_object_cache_invalidate);
PG_FUNCTION_INFO_V1(role_exists);
PG_FUNCTION_INFO_V1(authinfo_valid);
PG_FUNCTION_INFO_V1(poolinfo_valid);

/*
 * EnsureModificationsCanRun checks if the current node is in recovery mode or
 * citus.use_secondary_nodes is 'always'. If either is true, the function errors out.
 */
void
EnsureModificationsCanRun(void)
{
    if (RecoveryInProgress() && !WritableStandbyCoordinator)
    {
        ereport(ERROR, (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
                        errmsg("writing to worker nodes is not currently allowed"),
                        errdetail("the database is read-only")));
    }

    if (ReadFromSecondaries == USE_SECONDARY_NODES_ALWAYS)
    {
        ereport(ERROR, (errmsg("writing to worker nodes is not currently allowed"),
                        errdetail("citus.use_secondary_nodes is set to 'always'")));
    }
}
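

/*
 * Illustrative sketch (not part of the upstream file; the guard macro is
 * hypothetical and keeps this out of the build): how a modification path
 * might use the guard above before touching worker nodes.
 */
#ifdef METADATA_CACHE_USAGE_EXAMPLES
static void
ExampleModificationGuard(Oid relationId)
{
    /* errors out on a read-only standby or when reading from secondaries */
    EnsureModificationsCanRun();

    /* additionally rejects replicated-table writes issued from workers */
    EnsureModificationsCanRunOnRelation(relationId);
}
#endif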


/*
 * EnsureModificationsCanRunOnRelation first calls into EnsureModificationsCanRun() and
 * then does one additional check. The additional check is to give a proper error
 * message if any relation that is modified is replicated, as replicated tables use
 * 2PC and 2PC cannot happen when recovery is in progress.
 */
void
EnsureModificationsCanRunOnRelation(Oid relationId)
{
    EnsureModificationsCanRun();

    if (!OidIsValid(relationId) || !IsCitusTable(relationId))
    {
        /* we are not interested in PG tables */
        return;
    }

    bool modifiedTableReplicated =
        IsCitusTableType(relationId, REFERENCE_TABLE) ||
        !SingleReplicatedTable(relationId);

    if (!IsCoordinator() && !AllowModificationsFromWorkersToReplicatedTables &&
        modifiedTableReplicated)
    {
        ereport(ERROR, (errmsg("modifications via the worker nodes are not "
                               "allowed for replicated tables such as reference "
                               "tables or hash distributed tables with replication "
                               "factor greater than 1."),
                        errhint("All modifications to replicated tables should "
                                "happen via the coordinator unless "
                                "citus.allow_modifications_from_workers_to_replicated_tables "
                                "= true."),
                        errdetail("Allowing modifications from the worker nodes "
                                  "requires extra locking which might decrease "
                                  "the throughput.")));
    }

    /*
     * Even if the user allows writes from a standby, we should not allow them
     * for replicated tables as they require 2PC, and 2PC needs to write a log
     * record on the coordinator.
     */
    if (!(RecoveryInProgress() && WritableStandbyCoordinator))
    {
        return;
    }

    if (modifiedTableReplicated)
    {
        ereport(ERROR, (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
                        errmsg("writing to worker nodes is not currently "
                               "allowed for replicated tables such as reference "
                               "tables or hash distributed tables with replication "
                               "factor greater than 1."),
                        errhint("All modifications to replicated tables "
                                "happen via 2PC, and 2PC requires the "
                                "database to be in a writable state."),
                        errdetail("the database is read-only")));
    }
}


/*
 * IsCitusTableType returns true if the table with the given relationId is a
 * Citus table that matches the given table type. If a cache entry already
 * exists, prefer using IsCitusTableTypeCacheEntry to avoid an extra lookup.
 */
bool
IsCitusTableType(Oid relationId, CitusTableType tableType)
{
    CitusTableCacheEntry *tableEntry = LookupCitusTableCacheEntry(relationId);

    /* we are not interested in postgres tables */
    if (tableEntry == NULL)
    {
        return false;
    }
    return IsCitusTableTypeCacheEntry(tableEntry, tableType);
}
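

/*
 * Illustrative sketch (not part of the upstream file; the guard macro is
 * hypothetical): IsCitusTableType answers membership questions about a
 * relation, e.g. to branch on reference tables versus other tables.
 */
#ifdef METADATA_CACHE_USAGE_EXAMPLES
static bool
ExampleIsReferenceTable(Oid relationId)
{
    /* returns false both for local tables and for non-matching Citus tables */
    return IsCitusTableType(relationId, REFERENCE_TABLE);
}
#endif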


/*
 * GetCitusTableType is a helper function that returns the CitusTableType
 * for the given relationId.
 * Note that a single table can qualify as more than one CitusTableType;
 * for example, a hash distributed table is both HASH_DISTRIBUTED and
 * DISTRIBUTED_TABLE. This function returns the base type for a given table.
 *
 * If the table is not a Citus table, ANY_CITUS_TABLE_TYPE is returned.
 */
CitusTableType
GetCitusTableType(CitusTableCacheEntry *tableEntry)
{
    /* we do not expect local tables here */
    Assert(tableEntry != NULL);

    if (IsCitusTableTypeCacheEntry(tableEntry, HASH_DISTRIBUTED))
    {
        return HASH_DISTRIBUTED;
    }
    else if (IsCitusTableTypeCacheEntry(tableEntry, SINGLE_SHARD_DISTRIBUTED))
    {
        return SINGLE_SHARD_DISTRIBUTED;
    }
    else if (IsCitusTableTypeCacheEntry(tableEntry, REFERENCE_TABLE))
    {
        return REFERENCE_TABLE;
    }
    else if (IsCitusTableTypeCacheEntry(tableEntry, CITUS_LOCAL_TABLE))
    {
        return CITUS_LOCAL_TABLE;
    }
    else if (IsCitusTableTypeCacheEntry(tableEntry, APPEND_DISTRIBUTED))
    {
        return APPEND_DISTRIBUTED;
    }
    else if (IsCitusTableTypeCacheEntry(tableEntry, RANGE_DISTRIBUTED))
    {
        return RANGE_DISTRIBUTED;
    }
    else
    {
        return ANY_CITUS_TABLE_TYPE;
    }
}


/*
 * IsCitusTableTypeCacheEntry returns true if the given table cache entry
 * belongs to a citus table that matches the given table type.
 */
bool
IsCitusTableTypeCacheEntry(CitusTableCacheEntry *tableEntry, CitusTableType tableType)
{
    return IsCitusTableTypeInternal(tableEntry->partitionMethod,
                                    tableEntry->replicationModel,
                                    tableEntry->colocationId, tableType);
}


/*
 * HasDistributionKey returns true if the given Citus table has a distribution key.
 */
bool
HasDistributionKey(Oid relationId)
{
    CitusTableCacheEntry *tableEntry = LookupCitusTableCacheEntry(relationId);
    if (tableEntry == NULL)
    {
        ereport(ERROR, (errmsg("relation with oid %u is not a Citus table",
                               relationId)));
    }

    return HasDistributionKeyCacheEntry(tableEntry);
}


/*
 * HasDistributionKeyCacheEntry returns true if the given cache entry identifies a
 * Citus table that has a distribution key.
 */
bool
HasDistributionKeyCacheEntry(CitusTableCacheEntry *tableEntry)
{
    return tableEntry->partitionMethod != DISTRIBUTE_BY_NONE;
}


/*
 * IsCitusTableTypeInternal returns true if the given table entry belongs to
 * the given table type group. For the definition of table types, see
 * CitusTableType.
 */
static bool
IsCitusTableTypeInternal(char partitionMethod, char replicationModel,
                         uint32 colocationId, CitusTableType tableType)
{
    switch (tableType)
    {
        case HASH_DISTRIBUTED:
        {
            return partitionMethod == DISTRIBUTE_BY_HASH;
        }

        case APPEND_DISTRIBUTED:
        {
            return partitionMethod == DISTRIBUTE_BY_APPEND;
        }

        case RANGE_DISTRIBUTED:
        {
            return partitionMethod == DISTRIBUTE_BY_RANGE;
        }

        case SINGLE_SHARD_DISTRIBUTED:
        {
            return partitionMethod == DISTRIBUTE_BY_NONE &&
                   replicationModel != REPLICATION_MODEL_2PC &&
                   colocationId != INVALID_COLOCATION_ID;
        }

        case DISTRIBUTED_TABLE:
        {
            return partitionMethod == DISTRIBUTE_BY_HASH ||
                   partitionMethod == DISTRIBUTE_BY_RANGE ||
                   partitionMethod == DISTRIBUTE_BY_APPEND ||
                   (partitionMethod == DISTRIBUTE_BY_NONE &&
                    replicationModel != REPLICATION_MODEL_2PC &&
                    colocationId != INVALID_COLOCATION_ID);
        }

        case STRICTLY_PARTITIONED_DISTRIBUTED_TABLE:
        {
            return partitionMethod == DISTRIBUTE_BY_HASH ||
                   partitionMethod == DISTRIBUTE_BY_RANGE;
        }

        case REFERENCE_TABLE:
        {
            return partitionMethod == DISTRIBUTE_BY_NONE &&
                   replicationModel == REPLICATION_MODEL_2PC;
        }

        case CITUS_LOCAL_TABLE:
        {
            return partitionMethod == DISTRIBUTE_BY_NONE &&
                   replicationModel != REPLICATION_MODEL_2PC &&
                   colocationId == INVALID_COLOCATION_ID;
        }

        case ANY_CITUS_TABLE_TYPE:
        {
            return true;
        }

        default:
        {
            ereport(ERROR, (errmsg("Unknown table type %d", tableType)));
        }
    }
    return false;
}
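

/*
 * Illustrative summary (not part of the upstream file): the three
 * DISTRIBUTE_BY_NONE variants above are distinguished solely by the
 * replication model and the colocation ID:
 *
 *   replicationModel == REPLICATION_MODEL_2PC            -> REFERENCE_TABLE
 *   not 2PC, colocationId != INVALID_COLOCATION_ID       -> SINGLE_SHARD_DISTRIBUTED
 *   not 2PC, colocationId == INVALID_COLOCATION_ID       -> CITUS_LOCAL_TABLE
 */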


/*
 * GetTableTypeName returns the string representation of the table type.
 */
char *
GetTableTypeName(Oid tableId)
{
    if (!IsCitusTable(tableId))
    {
        return "regular table";
    }

    CitusTableCacheEntry *tableCacheEntry = GetCitusTableCacheEntry(tableId);
    if (IsCitusTableTypeCacheEntry(tableCacheEntry, HASH_DISTRIBUTED))
    {
        return "distributed table";
    }
    else if (IsCitusTableTypeCacheEntry(tableCacheEntry, REFERENCE_TABLE))
    {
        return "reference table";
    }
    else if (IsCitusTableTypeCacheEntry(tableCacheEntry, CITUS_LOCAL_TABLE))
    {
        return "citus local table";
    }
    else
    {
        return "unknown table";
    }
}


/*
 * IsCitusTable returns whether relationId is a distributed relation or
 * not.
 */
bool
IsCitusTable(Oid relationId)
{
    return LookupCitusTableCacheEntry(relationId) != NULL;
}


/*
 * IsCitusTableRangeVar returns whether the table named in the given
 * rangeVar is a Citus table.
 */
bool
IsCitusTableRangeVar(RangeVar *rangeVar, LOCKMODE lockMode, bool missingOK)
{
    Oid relationId = RangeVarGetRelid(rangeVar, lockMode, missingOK);
    return IsCitusTable(relationId);
}


/*
 * IsCitusTableViaCatalog returns whether the given relation is a
 * distributed table or not.
 *
 * It does so by searching pg_dist_partition, explicitly bypassing caches,
 * because this function is designed to be used in cases where accessing
 * metadata tables is not safe.
 *
 * NB: Currently this still hardcodes pg_dist_partition logicalrelid column
 * offset and the corresponding index. If we ever come close to changing
 * that, we'll have to work a bit harder.
 */
bool
IsCitusTableViaCatalog(Oid relationId)
{
    HeapTuple partitionTuple = PgDistPartitionTupleViaCatalog(relationId);

    bool heapTupleIsValid = HeapTupleIsValid(partitionTuple);

    if (heapTupleIsValid)
    {
        heap_freetuple(partitionTuple);
    }
    return heapTupleIsValid;
}
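

/*
 * Illustrative sketch (not part of the upstream file; the guard macro is
 * hypothetical): the catalog-based check is meant for paths where the
 * metadata cache may not be safe to touch, e.g. while handling
 * invalidations, whereas IsCitusTable() goes through the cache.
 */
#ifdef METADATA_CACHE_USAGE_EXAMPLES
static bool
ExampleIsCitusTableBypassingCache(Oid relationId)
{
    /* scans pg_dist_partition directly instead of consulting the cache */
    return IsCitusTableViaCatalog(relationId);
}
#endif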


/*
 * PartitionMethodViaCatalog gets a relationId and returns the partition
 * method column from pg_dist_partition via reading from catalog.
 */
char
PartitionMethodViaCatalog(Oid relationId)
{
    HeapTuple partitionTuple = PgDistPartitionTupleViaCatalog(relationId);
    if (!HeapTupleIsValid(partitionTuple))
    {
        return DISTRIBUTE_BY_INVALID;
    }

    Datum datumArray[Natts_pg_dist_partition];
    bool isNullArray[Natts_pg_dist_partition];

    Relation pgDistPartition = table_open(DistPartitionRelationId(), AccessShareLock);

    TupleDesc tupleDescriptor = RelationGetDescr(pgDistPartition);
    heap_deform_tuple(partitionTuple, tupleDescriptor, datumArray, isNullArray);

    if (isNullArray[Anum_pg_dist_partition_partmethod - 1])
    {
        /* partition method cannot be NULL, still let's make sure */
        heap_freetuple(partitionTuple);
        table_close(pgDistPartition, NoLock);
        return DISTRIBUTE_BY_INVALID;
    }

    Datum partitionMethodDatum = datumArray[Anum_pg_dist_partition_partmethod - 1];
    char partitionMethodChar = DatumGetChar(partitionMethodDatum);

    heap_freetuple(partitionTuple);
    table_close(pgDistPartition, NoLock);

    return partitionMethodChar;
}


/*
 * PartitionColumnViaCatalog gets a relationId and returns the partition
 * key column from pg_dist_partition via reading from catalog.
 */
Var *
PartitionColumnViaCatalog(Oid relationId)
{
    HeapTuple partitionTuple = PgDistPartitionTupleViaCatalog(relationId);
    if (!HeapTupleIsValid(partitionTuple))
    {
        return NULL;
    }

    Datum datumArray[Natts_pg_dist_partition];
    bool isNullArray[Natts_pg_dist_partition];

    Relation pgDistPartition = table_open(DistPartitionRelationId(), AccessShareLock);

    TupleDesc tupleDescriptor = RelationGetDescr(pgDistPartition);
    heap_deform_tuple(partitionTuple, tupleDescriptor, datumArray, isNullArray);

    if (isNullArray[Anum_pg_dist_partition_partkey - 1])
    {
        /* partition key cannot be NULL, still let's make sure */
        heap_freetuple(partitionTuple);
        table_close(pgDistPartition, NoLock);
        return NULL;
    }

    Datum partitionKeyDatum = datumArray[Anum_pg_dist_partition_partkey - 1];
    char *partitionKeyString = TextDatumGetCString(partitionKeyDatum);

    /* convert the string to a Node and ensure it is a Var */
    Node *partitionNode = stringToNode(partitionKeyString);
    Assert(IsA(partitionNode, Var));

    Var *partitionColumn = (Var *) partitionNode;

    heap_freetuple(partitionTuple);
    table_close(pgDistPartition, NoLock);

    return partitionColumn;
}


/*
 * ColocationIdViaCatalog gets a relationId and returns the colocation
 * id column from pg_dist_partition via reading from catalog.
 */
uint32
ColocationIdViaCatalog(Oid relationId)
{
    HeapTuple partitionTuple = PgDistPartitionTupleViaCatalog(relationId);
    if (!HeapTupleIsValid(partitionTuple))
    {
        return INVALID_COLOCATION_ID;
    }

    Datum datumArray[Natts_pg_dist_partition];
    bool isNullArray[Natts_pg_dist_partition];

    Relation pgDistPartition = table_open(DistPartitionRelationId(), AccessShareLock);

    TupleDesc tupleDescriptor = RelationGetDescr(pgDistPartition);
    heap_deform_tuple(partitionTuple, tupleDescriptor, datumArray, isNullArray);

    if (isNullArray[Anum_pg_dist_partition_colocationid - 1])
    {
        /* colocation id cannot be NULL, still let's make sure */
        heap_freetuple(partitionTuple);
        table_close(pgDistPartition, NoLock);
        return INVALID_COLOCATION_ID;
    }

    Datum colocationIdDatum = datumArray[Anum_pg_dist_partition_colocationid - 1];
    uint32 colocationId = DatumGetUInt32(colocationIdDatum);

    heap_freetuple(partitionTuple);
    table_close(pgDistPartition, NoLock);

    return colocationId;
}


/*
 * PgDistPartitionTupleViaCatalog is a helper function that searches
 * pg_dist_partition for the given relationId. The caller is responsible
 * for ensuring that the returned heap tuple is valid before accessing
 * its fields.
 */
static HeapTuple
PgDistPartitionTupleViaCatalog(Oid relationId)
{
    const int scanKeyCount = 1;
    ScanKeyData scanKey[1];
    bool indexOK = true;

    Relation pgDistPartition = table_open(DistPartitionRelationId(), AccessShareLock);

    ScanKeyInit(&scanKey[0], Anum_pg_dist_partition_logicalrelid,
                BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(relationId));

    SysScanDesc scanDescriptor = systable_beginscan(pgDistPartition,
                                                    DistPartitionLogicalRelidIndexId(),
                                                    indexOK, NULL, scanKeyCount, scanKey);

    HeapTuple partitionTuple = systable_getnext(scanDescriptor);

    if (HeapTupleIsValid(partitionTuple))
    {
        /* callers should have the tuple in their memory contexts */
        partitionTuple = heap_copytuple(partitionTuple);
    }

    systable_endscan(scanDescriptor);
    table_close(pgDistPartition, AccessShareLock);

    return partitionTuple;
}
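

/*
 * Illustrative caller pattern (not part of the upstream file; the guard
 * macro is hypothetical): the tuple returned above is a copy, so callers
 * must check validity and free it, as the *ViaCatalog helpers above do.
 */
#ifdef METADATA_CACHE_USAGE_EXAMPLES
static bool
ExamplePgDistPartitionTupleExists(Oid relationId)
{
    HeapTuple partitionTuple = PgDistPartitionTupleViaCatalog(relationId);
    if (!HeapTupleIsValid(partitionTuple))
    {
        return false;
    }

    heap_freetuple(partitionTuple);
    return true;
}
#endif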


/*
 * IsReferenceTableByDistParams returns true if given partitionMethod and
 * replicationModel would identify a reference table.
 */
bool
IsReferenceTableByDistParams(char partitionMethod, char replicationModel)
{
    return partitionMethod == DISTRIBUTE_BY_NONE &&
           replicationModel == REPLICATION_MODEL_2PC;
}


/*
 * IsCitusLocalTableByDistParams returns true if given partitionMethod,
 * replicationModel and colocationId would identify a citus local table.
 */
bool
IsCitusLocalTableByDistParams(char partitionMethod, char replicationModel,
                              uint32 colocationId)
{
    return partitionMethod == DISTRIBUTE_BY_NONE &&
           replicationModel != REPLICATION_MODEL_2PC &&
           colocationId == INVALID_COLOCATION_ID;
}


/*
 * IsSingleShardTableByDistParams returns true if given partitionMethod,
 * replicationModel and colocationId would identify a single-shard distributed
 * table that has a null shard key.
 */
bool
IsSingleShardTableByDistParams(char partitionMethod, char replicationModel,
                               uint32 colocationId)
{
    return partitionMethod == DISTRIBUTE_BY_NONE &&
           replicationModel != REPLICATION_MODEL_2PC &&
           colocationId != INVALID_COLOCATION_ID;
}


/*
 * CitusTableList returns a list that includes all the valid distributed table
 * cache entries.
 */
List *
CitusTableList(void)
{
    List *distributedTableList = NIL;

    Assert(CitusHasBeenLoaded() && CheckCitusVersion(WARNING));

    /* first, we need to iterate over pg_dist_partition */
    List *citusTableIdList = CitusTableTypeIdList(ANY_CITUS_TABLE_TYPE);

    Oid relationId = InvalidOid;
    foreach_oid(relationId, citusTableIdList)
    {
        CitusTableCacheEntry *cacheEntry = GetCitusTableCacheEntry(relationId);

        distributedTableList = lappend(distributedTableList, cacheEntry);
    }

    return distributedTableList;
}


/*
 * LoadShardInterval returns the cached metadata for a shard.
 *
 * The return value is a copy of the cached ShardInterval struct and may
 * therefore be modified and/or freed.
 */
ShardInterval *
LoadShardInterval(uint64 shardId)
{
    bool missingOk = false;
    ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk);
    CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry;
    int shardIndex = shardIdEntry->shardIndex;

    /* the offset better be in a valid range */
    Assert(shardIndex < tableEntry->shardIntervalArrayLength);

    ShardInterval *sourceShardInterval =
        tableEntry->sortedShardIntervalArray[shardIndex];

    /* copy value to return */
    ShardInterval *shardInterval = CopyShardInterval(sourceShardInterval);

    return shardInterval;
}
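

/*
 * Illustrative sketch (not part of the upstream file; the guard macro is
 * hypothetical): since LoadShardInterval returns a palloc'd copy, callers
 * may read, modify, and free it without affecting the cache.
 */
#ifdef METADATA_CACHE_USAGE_EXAMPLES
static Oid
ExampleRelationOfShard(uint64 shardId)
{
    ShardInterval *shardInterval = LoadShardInterval(shardId);
    Oid relationId = shardInterval->relationId;

    /* the copy lives in the caller's memory context, so freeing is safe */
    pfree(shardInterval);
    return relationId;
}
#endif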


/*
 * ShardExists returns whether a shard with the given shardId exists. It
 * performs the lookup with missingOk set, so it returns false rather than
 * erroring out when the shard is not found.
 */
bool
ShardExists(uint64 shardId)
{
    bool missingOk = true;
    ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk);

    if (!shardIdEntry)
    {
        return false;
    }

    return true;
}


/*
 * RelationIdForShard returns the relationId of the given shardId.
 */
Oid
RelationIdForShard(uint64 shardId)
{
    bool missingOk = false;
    ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk);
    CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry;
    return tableEntry->relationId;
}


/*
 * ReferenceTableShardId returns true if the given shardId belongs to
 * a reference table.
 */
bool
ReferenceTableShardId(uint64 shardId)
{
    bool missingOk = false;
    ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk);
    CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry;
    return IsCitusTableTypeCacheEntry(tableEntry, REFERENCE_TABLE);
}


/*
 * DistributedTableShardId returns true if the given shardId belongs to
 * a distributed table.
 */
bool
DistributedTableShardId(uint64 shardId)
{
    if (shardId == INVALID_SHARD_ID)
    {
        return false;
    }

    bool missingOk = false;
    ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk);
    CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry;
    return IsCitusTableTypeCacheEntry(tableEntry, DISTRIBUTED_TABLE);
}


/*
 * LoadGroupShardPlacement returns the cached shard placement metadata.
 *
 * The return value is a copy of the cached GroupShardPlacement struct and may
 * therefore be modified and/or freed.
 */
GroupShardPlacement *
LoadGroupShardPlacement(uint64 shardId, uint64 placementId)
{
    bool missingOk = false;
    ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk);
    CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry;
    int shardIndex = shardIdEntry->shardIndex;

    /* the offset better be in a valid range */
    Assert(shardIndex < tableEntry->shardIntervalArrayLength);

    GroupShardPlacement *placementArray =
        tableEntry->arrayOfPlacementArrays[shardIndex];
    int numberOfPlacements =
        tableEntry->arrayOfPlacementArrayLengths[shardIndex];

    for (int i = 0; i < numberOfPlacements; i++)
    {
        if (placementArray[i].placementId == placementId)
        {
            GroupShardPlacement *shardPlacement = CitusMakeNode(GroupShardPlacement);

            *shardPlacement = placementArray[i];

            return shardPlacement;
        }
    }

    ereport(ERROR, (errmsg("could not find valid entry for shard placement "
                           UINT64_FORMAT, placementId)));
}


/*
 * LoadShardPlacement returns a shard placement for the primary node.
 */
ShardPlacement *
LoadShardPlacement(uint64 shardId, uint64 placementId)
{
    bool missingOk = false;
    ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk);
    CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry;
    int shardIndex = shardIdEntry->shardIndex;
    GroupShardPlacement *groupPlacement = LoadGroupShardPlacement(shardId, placementId);
    ShardPlacement *nodePlacement = ResolveGroupShardPlacement(groupPlacement,
                                                               tableEntry, shardIndex);

    return nodePlacement;
}


/*
 * ShardPlacementOnGroupIncludingOrphanedPlacements returns the shard placement
 * for the given shard on the given group, or returns NULL if no placement for
 * the shard exists on the group.
 *
 * NOTE: This can return inactive or orphaned placements.
 */
ShardPlacement *
ShardPlacementOnGroupIncludingOrphanedPlacements(int32 groupId, uint64 shardId)
{
    ShardPlacement *placementOnNode = NULL;

    bool missingOk = false;
    ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk);
    CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry;
    int shardIndex = shardIdEntry->shardIndex;
    GroupShardPlacement *placementArray =
        tableEntry->arrayOfPlacementArrays[shardIndex];
    int numberOfPlacements =
        tableEntry->arrayOfPlacementArrayLengths[shardIndex];

    for (int placementIndex = 0; placementIndex < numberOfPlacements; placementIndex++)
    {
        GroupShardPlacement *placement = &placementArray[placementIndex];
        if (placement->groupId == groupId)
        {
            placementOnNode = ResolveGroupShardPlacement(placement, tableEntry,
                                                         shardIndex);
            break;
        }
    }

    return placementOnNode;
}


/*
 * ActiveShardPlacementOnGroup returns the active shard placement for the
 * given shard on the given group, or returns NULL if no active placement for
 * the shard exists on the group.
 */
ShardPlacement *
ActiveShardPlacementOnGroup(int32 groupId, uint64 shardId)
{
    ShardPlacement *placement =
        ShardPlacementOnGroupIncludingOrphanedPlacements(groupId, shardId);
    if (placement == NULL)
    {
        return NULL;
    }
    return placement;
}


/*
 * ResolveGroupShardPlacement takes a GroupShardPlacement and adds additional data to it,
 * such as the node we should consider it to be on.
 */
static ShardPlacement *
ResolveGroupShardPlacement(GroupShardPlacement *groupShardPlacement,
                           CitusTableCacheEntry *tableEntry,
                           int shardIndex)
{
    ShardInterval *shardInterval = tableEntry->sortedShardIntervalArray[shardIndex];

    ShardPlacement *shardPlacement = CitusMakeNode(ShardPlacement);
    int32 groupId = groupShardPlacement->groupId;
    WorkerNode *workerNode = LookupNodeForGroup(groupId);

    /* copy everything into shardPlacement but preserve the header */
    CitusNode header = shardPlacement->type;
    GroupShardPlacement *shardPlacementAsGroupPlacement =
        (GroupShardPlacement *) shardPlacement;
    *shardPlacementAsGroupPlacement = *groupShardPlacement;
    shardPlacement->type = header;

    SetPlacementNodeMetadata(shardPlacement, workerNode);

    /* fill in remaining fields */
    Assert(tableEntry->partitionMethod != 0);
    shardPlacement->partitionMethod = tableEntry->partitionMethod;
    shardPlacement->colocationGroupId = tableEntry->colocationId;
    if (tableEntry->partitionMethod == DISTRIBUTE_BY_HASH)
    {
        Assert(shardInterval->minValueExists);
        Assert(shardInterval->valueTypeId == INT4OID);

        /*
         * Use the lower boundary of the interval's range to identify
         * it for colocation purposes. That remains meaningful even if
         * a concurrent session splits a shard.
         */
        shardPlacement->representativeValue = DatumGetInt32(shardInterval->minValue);
    }
    else
    {
        shardPlacement->representativeValue = 0;
    }

    return shardPlacement;
}


/*
 * HasAnyNodes returns whether there are any nodes in pg_dist_node.
 */
bool
HasAnyNodes(void)
{
    PrepareWorkerNodeCache();

    return WorkerNodeCount > 0;
}


/*
 * LookupNodeByNodeId returns a worker node by nodeId, or NULL if the node
 * cannot be found.
 */
WorkerNode *
LookupNodeByNodeId(uint32 nodeId)
{
    PrepareWorkerNodeCache();

    for (int workerNodeIndex = 0; workerNodeIndex < WorkerNodeCount; workerNodeIndex++)
    {
        WorkerNode *workerNode = WorkerNodeArray[workerNodeIndex];
        if (workerNode->nodeId == nodeId)
        {
            WorkerNode *workerNodeCopy = palloc0(sizeof(WorkerNode));
            *workerNodeCopy = *workerNode;

            return workerNodeCopy;
        }
    }

    return NULL;
}


/*
 * LookupNodeByNodeIdOrError returns a worker node by nodeId, or errors out if the
 * node cannot be found.
 */
WorkerNode *
LookupNodeByNodeIdOrError(uint32 nodeId)
{
    WorkerNode *node = LookupNodeByNodeId(nodeId);
    if (node == NULL)
    {
        ereport(ERROR, (errmsg("node %d could not be found", nodeId)));
    }
    return node;
}
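

/*
 * Illustrative sketch (not part of the upstream file; the guard macro is
 * hypothetical): resolving a node ID to a connectable host/port pair via
 * the WorkerNode fields declared in worker_manager.h.
 */
#ifdef METADATA_CACHE_USAGE_EXAMPLES
static void
ExampleLogNodeAddress(uint32 nodeId)
{
    WorkerNode *node = LookupNodeByNodeIdOrError(nodeId);

    ereport(DEBUG1, (errmsg("node %u is at %s:%d", nodeId,
                            node->workerName, (int) node->workerPort)));
}
#endif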


/*
 * LookupNodeForGroup searches the WorkerNodeHash for a worker which is a member of the
 * given group and also readable (a primary if we're reading from primaries, a secondary
 * if we're reading from secondaries). If such a node does not exist it emits an
 * appropriate error message.
 */
WorkerNode *
LookupNodeForGroup(int32 groupId)
{
    bool foundAnyNodes = false;

    PrepareWorkerNodeCache();

    for (int workerNodeIndex = 0; workerNodeIndex < WorkerNodeCount; workerNodeIndex++)
    {
        WorkerNode *workerNode = WorkerNodeArray[workerNodeIndex];
        int32 workerNodeGroupId = workerNode->groupId;
        if (workerNodeGroupId != groupId)
        {
            continue;
        }

        foundAnyNodes = true;

        if (NodeIsReadable(workerNode))
        {
            return workerNode;
        }
    }

    if (!foundAnyNodes)
    {
        ereport(ERROR, (errmsg("there is a shard placement in node group %d but "
                               "there are no nodes in that group", groupId)));
    }

    switch (ReadFromSecondaries)
    {
        case USE_SECONDARY_NODES_NEVER:
        {
            ereport(ERROR, (errmsg("node group %d does not have a primary node",
                                   groupId)));
            break;
        }

        case USE_SECONDARY_NODES_ALWAYS:
        {
            ereport(ERROR, (errmsg("node group %d does not have a secondary node",
                                   groupId)));
            break;
        }

        default:
        {
            ereport(FATAL, (errmsg("unrecognized value for use_secondary_nodes")));
        }
    }
}


/*
 * ShardPlacementList returns the list of placements for the given shard from
 * the cache.
 *
 * The returned list is deep copied from the cache and thus can be modified
 * and pfree()d freely.
 */
List *
ShardPlacementList(uint64 shardId)
{
    List *placementList = NIL;

    bool missingOk = false;
    ShardIdCacheEntry *shardIdEntry = LookupShardIdCacheEntry(shardId, missingOk);
    CitusTableCacheEntry *tableEntry = shardIdEntry->tableEntry;
    int shardIndex = shardIdEntry->shardIndex;

    /* the offset better be in a valid range */
    Assert(shardIndex < tableEntry->shardIntervalArrayLength);

    GroupShardPlacement *placementArray =
        tableEntry->arrayOfPlacementArrays[shardIndex];
    int numberOfPlacements =
        tableEntry->arrayOfPlacementArrayLengths[shardIndex];

    for (int i = 0; i < numberOfPlacements; i++)
    {
        GroupShardPlacement *groupShardPlacement = &placementArray[i];
        ShardPlacement *shardPlacement = ResolveGroupShardPlacement(groupShardPlacement,
                                                                    tableEntry,
                                                                    shardIndex);

        placementList = lappend(placementList, shardPlacement);
    }

    /* if no shard placements are found, warn the user */
    if (numberOfPlacements == 0)
    {
        ereport(WARNING, (errmsg("could not find any shard placements for shardId "
                                 UINT64_FORMAT, shardId)));
    }

    return placementList;
}
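

/*
 * Illustrative sketch (not part of the upstream file; the guard macro is
 * hypothetical): counting the placements of a shard. Because the list
 * elements are deep copies, the list can be freed with the usual helpers.
 */
#ifdef METADATA_CACHE_USAGE_EXAMPLES
static int
ExampleShardPlacementCount(uint64 shardId)
{
    List *placementList = ShardPlacementList(shardId);
    int placementCount = list_length(placementList);

    list_free_deep(placementList);
    return placementCount;
}
#endif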


/*
 * InitializeTableCacheEntry initializes a shard in the cache. A possible reason
 * for not finding an entry in the cache is that the distributed table's cache
 * entry hasn't been accessed yet. Thus look up the distributed table, and
 * build the cache entry. Afterwards we know that the shard has to be in the
 * cache if it exists. If the shard does *not* exist and missingOk is false,
 * this function errors out (because LookupShardRelationFromCatalog errors out).
 *
 * If missingOk is true and the shard cannot be found, the function returns false.
 */
static bool
InitializeTableCacheEntry(int64 shardId, bool missingOk)
{
    Oid relationId = LookupShardRelationFromCatalog(shardId, missingOk);

    if (!OidIsValid(relationId))
    {
        Assert(missingOk);
        return false;
    }

    /* trigger building the cache for the shard id */
    GetCitusTableCacheEntry(relationId); /* lgtm[cpp/return-value-ignored] */

    return true;
}


/*
 * RefreshTableCacheEntryIfInvalid checks if the cache entry is still valid and
 * refreshes it in the cache when it's not. It returns true if it refreshed the
 * entry in the cache and false if it didn't.
 */
static bool
RefreshTableCacheEntryIfInvalid(ShardIdCacheEntry *shardEntry, bool missingOk)
{
    /*
     * We might have some concurrent metadata changes. In order to get the changes,
     * we first need to accept the cache invalidation messages.
     */
    AcceptInvalidationMessages();
    if (shardEntry->tableEntry->isValid)
    {
        return false;
    }
    Oid oldRelationId = shardEntry->tableEntry->relationId;
    Oid currentRelationId = LookupShardRelationFromCatalog(shardEntry->shardId,
                                                           missingOk);

    /*
     * The relation OID to which the shard belongs could have changed,
     * most notably when the extension is dropped and a shard ID is
     * reused. Reload the cache entries for both old and new relation
     * ID and then look up the shard entry again.
     */
    LookupCitusTableCacheEntry(oldRelationId);
    LookupCitusTableCacheEntry(currentRelationId);
    return true;
}


/*
 * LookupShardIdCacheEntry returns the cache entry belonging to a shard.
 * It errors out if that shard is unknown and missingOk is false; otherwise,
 * it returns a NULL cache entry.
 */
static ShardIdCacheEntry *
LookupShardIdCacheEntry(int64 shardId, bool missingOk)
{
    bool foundInCache = false;
    bool recheck = false;

    Assert(CitusHasBeenLoaded() && CheckCitusVersion(WARNING));

    InitializeCaches();

    ShardIdCacheEntry *shardEntry =
        hash_search(ShardIdCacheHash, &shardId, HASH_FIND, &foundInCache);

    if (!foundInCache)
    {
        if (!InitializeTableCacheEntry(shardId, missingOk))
        {
            return NULL;
        }

        recheck = true;
    }
    else
    {
        recheck = RefreshTableCacheEntryIfInvalid(shardEntry, missingOk);
    }

    /*
     * If we (re-)loaded the table cache, re-search the shard cache - the
     * shard index might have changed. If we still can't find the entry, it
     * can't exist.
     */
    if (recheck)
    {
        shardEntry = hash_search(ShardIdCacheHash, &shardId, HASH_FIND, &foundInCache);

        if (!foundInCache)
        {
            int eflag = (missingOk) ? DEBUG1 : ERROR;
            ereport(eflag, (errmsg("could not find valid entry for shard "
                                   UINT64_FORMAT, shardId)));
        }
    }

    return shardEntry;
}


/*
 * GetCitusTableCacheEntry looks up a pg_dist_partition entry for a
 * relation.
 *
 * Errors out if no relation matching the criteria could be found.
 */
CitusTableCacheEntry *
GetCitusTableCacheEntry(Oid distributedRelationId)
{
    CitusTableCacheEntry *cacheEntry =
        LookupCitusTableCacheEntry(distributedRelationId);

    if (cacheEntry)
    {
        return cacheEntry;
    }
    else
    {
        char *relationName = get_rel_name(distributedRelationId);

        if (relationName == NULL)
        {
            ereport(ERROR, (errmsg("relation with OID %u does not exist",
                                   distributedRelationId)));
        }
        else
        {
            ereport(ERROR, (errmsg("relation %s is not distributed", relationName)));
        }
    }
}


/*
 * LookupCitusTableCacheEntry returns the distributed table metadata for the
 * passed relationId. For efficiency it caches lookups. This function returns
 * NULL if the relation isn't a distributed table.
 */
CitusTableCacheEntry *
LookupCitusTableCacheEntry(Oid relationId)
{
    bool foundInCache = false;
    void *hashKey = (void *) &relationId;

    /*
     * Can't be a distributed relation if the extension hasn't been loaded
     * yet. As we can't do lookups in nonexistent tables, directly return NULL
     * here.
     */
    if (!CitusHasBeenLoaded())
    {
        return NULL;
    }

    InitializeCaches();

    /*
     * If the version is not known to be compatible, perform a thorough check,
     * unless such checks are disabled.
     */
    if (!citusVersionKnownCompatible && EnableVersionChecks)
    {
        bool isCitusTable = IsCitusTableViaCatalog(relationId);
        int reportLevel = DEBUG1;

        /*
         * If there's a version mismatch, and we're dealing with a distributed
         * table, we have to error out as we can't return a valid entry. We
         * want to check compatibility in the non-distributed case as well, so
         * future lookups can use the cache if compatible.
         */
        if (isCitusTable)
        {
            reportLevel = ERROR;
        }

        if (!CheckCitusVersion(reportLevel))
        {
            /* incompatible, can't access cache, so return before doing so */
            return NULL;
        }
    }

    /*
     * We might have some concurrent metadata changes. In order to get the changes,
     * we first need to accept the cache invalidation messages.
     */
    AcceptInvalidationMessages();
    CitusTableCacheEntrySlot *cacheSlot =
        hash_search(DistTableCacheHash, hashKey, HASH_ENTER, &foundInCache);

    /* return valid matches */
    if (foundInCache)
    {
        if (cacheSlot->isValid)
        {
            return cacheSlot->citusTableMetadata;
        }
        else
        {
            /*
             * An invalidation was received or we encountered an OOM while building
             * the cache entry. We need to rebuild it.
             */

            if (cacheSlot->citusTableMetadata)
            {
                /*
                 * The CitusTableCacheEntry might still be in use. We therefore do
                 * not reset it until the end of the transaction.
                 */
                MemoryContext oldContext =
                    MemoryContextSwitchTo(MetadataCacheMemoryContext);

                DistTableCacheExpired = lappend(DistTableCacheExpired,
                                                cacheSlot->citusTableMetadata);

                MemoryContextSwitchTo(oldContext);
            }
        }
    }

    /* zero out the entry, but not the key part */
    memset(((char *) cacheSlot) + sizeof(Oid), 0,
           sizeof(CitusTableCacheEntrySlot) - sizeof(Oid));

    /*
     * We disable interrupts while creating the cache entry because loading
     * shard metadata can take a while, and if statement_timeout is too low,
     * this will get canceled on each call and we won't be able to run any
     * queries on the table.
     */
    HOLD_INTERRUPTS();

    cacheSlot->citusTableMetadata = BuildCitusTableCacheEntry(relationId);

    /*
     * Mark it as valid only after building the full entry, such that any
     * error that happened during the build would trigger a rebuild.
     */
    cacheSlot->isValid = true;

    RESUME_INTERRUPTS();

    return cacheSlot->citusTableMetadata;
}
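

/*
 * Illustrative sketch (not part of the upstream file; the guard macro is
 * hypothetical): LookupCitusTableCacheEntry returns NULL for plain Postgres
 * tables, whereas GetCitusTableCacheEntry errors out, so the lookup variant
 * is the one to use for "is this distributed?" style checks.
 */
#ifdef METADATA_CACHE_USAGE_EXAMPLES
static char
ExamplePartitionMethodOrInvalid(Oid relationId)
{
    CitusTableCacheEntry *cacheEntry = LookupCitusTableCacheEntry(relationId);
    if (cacheEntry == NULL)
    {
        return DISTRIBUTE_BY_INVALID;
    }

    return cacheEntry->partitionMethod;
}
#endif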


/*
 * LookupDistObjectCacheEntry returns the pg_dist_object metadata for the
 * object identified by the passed (classid, objid, objsubid). For efficiency
 * it caches lookups.
 */
DistObjectCacheEntry *
LookupDistObjectCacheEntry(Oid classid, Oid objid, int32 objsubid)
{
    bool foundInCache = false;
    DistObjectCacheEntryKey hashKey;
    ScanKeyData pgDistObjectKey[3];

    memset(&hashKey, 0, sizeof(DistObjectCacheEntryKey));
    hashKey.classid = classid;
    hashKey.objid = objid;
    hashKey.objsubid = objsubid;

    /*
     * Can't be a distributed object if the extension hasn't been loaded
     * yet. As we can't do lookups in nonexistent tables, directly return NULL
     * here.
     */
    if (!CitusHasBeenLoaded())
    {
        return NULL;
    }

    InitializeCaches();

    DistObjectCacheEntry *cacheEntry = hash_search(DistObjectCacheHash, &hashKey,
                                                   HASH_ENTER, &foundInCache);

    /* return valid matches */
    if (foundInCache)
    {
        /*
         * We might have some concurrent metadata changes. In order to get the changes,
         * we first need to accept the cache invalidation messages.
         */
        AcceptInvalidationMessages();

        if (cacheEntry->isValid)
        {
            return cacheEntry;
        }

        /*
         * This is where we'd free the old entry's out-of-band data if it had any.
         * Right now we don't have anything to free.
         */
    }

    /* zero out the whole entry, then rebuild the key part */
    memset(((char *) cacheEntry), 0, sizeof(DistObjectCacheEntry));
    cacheEntry->key.classid = classid;
    cacheEntry->key.objid = objid;
    cacheEntry->key.objsubid = objsubid;

    Relation pgDistObjectRel = table_open(DistObjectRelationId(), AccessShareLock);
    TupleDesc pgDistObjectTupleDesc = RelationGetDescr(pgDistObjectRel);

    ScanKeyInit(&pgDistObjectKey[0], Anum_pg_dist_object_classid,
                BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(classid));
    ScanKeyInit(&pgDistObjectKey[1], Anum_pg_dist_object_objid,
                BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(objid));
    ScanKeyInit(&pgDistObjectKey[2], Anum_pg_dist_object_objsubid,
                BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(objsubid));

    SysScanDesc pgDistObjectScan = systable_beginscan(pgDistObjectRel,
                                                      DistObjectPrimaryKeyIndexId(),
                                                      true, NULL, 3, pgDistObjectKey);
    HeapTuple pgDistObjectTup = systable_getnext(pgDistObjectScan);

    if (HeapTupleIsValid(pgDistObjectTup))
    {
        Datum datumArray[Natts_pg_dist_object];
        bool isNullArray[Natts_pg_dist_object];

        heap_deform_tuple(pgDistObjectTup, pgDistObjectTupleDesc, datumArray,
                          isNullArray);

        cacheEntry->isValid = true;
        cacheEntry->isDistributed = true;

        cacheEntry->distributionArgIndex =
            DatumGetInt32(datumArray[Anum_pg_dist_object_distribution_argument_index -
                                     1]);
        cacheEntry->colocationId =
            DatumGetInt32(datumArray[Anum_pg_dist_object_colocationid - 1]);

        cacheEntry->forceDelegation =
            DatumGetBool(datumArray[Anum_pg_dist_object_force_delegation - 1]);
    }
    else
    {
        cacheEntry->isValid = true;
        cacheEntry->isDistributed = false;
    }

    systable_endscan(pgDistObjectScan);
    relation_close(pgDistObjectRel, AccessShareLock);

    return cacheEntry;
}
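

/*
 * Illustrative sketch (not part of the upstream file; the guard macro is
 * hypothetical): checking whether a type is distributed by probing the
 * pg_dist_object cache with its (classid, objid, objsubid) address.
 */
#ifdef METADATA_CACHE_USAGE_EXAMPLES
static bool
ExampleIsDistributedType(Oid typeOid)
{
    DistObjectCacheEntry *entry =
        LookupDistObjectCacheEntry(TypeRelationId, typeOid, 0);

    return entry != NULL && entry->isDistributed;
}
#endif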


/*
 * BuildCitusTableCacheEntry is a helper routine for
 * LookupCitusTableCacheEntry() for building the cache contents.
 * This function returns NULL if the relation isn't a distributed table.
 */
static CitusTableCacheEntry *
BuildCitusTableCacheEntry(Oid relationId)
{
	Relation pgDistPartition = table_open(DistPartitionRelationId(), AccessShareLock);
	HeapTuple distPartitionTuple =
		LookupDistPartitionTuple(pgDistPartition, relationId);

	if (distPartitionTuple == NULL)
	{
		/* not a distributed table, done */
		table_close(pgDistPartition, NoLock);
		return NULL;
	}

	MemoryContext oldContext = NULL;
	Datum datumArray[Natts_pg_dist_partition];
	bool isNullArray[Natts_pg_dist_partition];

	TupleDesc tupleDescriptor = RelationGetDescr(pgDistPartition);
	heap_deform_tuple(distPartitionTuple, tupleDescriptor, datumArray, isNullArray);

	CitusTableCacheEntry *cacheEntry =
		MemoryContextAllocZero(MetadataCacheMemoryContext, sizeof(CitusTableCacheEntry));

	cacheEntry->relationId = relationId;

	cacheEntry->partitionMethod = datumArray[Anum_pg_dist_partition_partmethod - 1];
	Datum partitionKeyDatum = datumArray[Anum_pg_dist_partition_partkey - 1];
	bool partitionKeyIsNull = isNullArray[Anum_pg_dist_partition_partkey - 1];

	/* note that for reference tables partitionKeyIsNull is true */
	if (!partitionKeyIsNull)
	{
		oldContext = MemoryContextSwitchTo(MetadataCacheMemoryContext);

		/* get the string representation of the partition column Var */
		cacheEntry->partitionKeyString = TextDatumGetCString(partitionKeyDatum);

		/* convert the string to a Node and ensure it is a Var */
		Node *partitionNode = stringToNode(cacheEntry->partitionKeyString);
		Assert(IsA(partitionNode, Var));

		cacheEntry->partitionColumn = (Var *) partitionNode;

		MemoryContextSwitchTo(oldContext);
	}
	else
	{
		cacheEntry->partitionKeyString = NULL;
	}

	cacheEntry->colocationId = datumArray[Anum_pg_dist_partition_colocationid - 1];
	if (isNullArray[Anum_pg_dist_partition_colocationid - 1])
	{
		cacheEntry->colocationId = INVALID_COLOCATION_ID;
	}

	Datum replicationModelDatum = datumArray[Anum_pg_dist_partition_repmodel - 1];
	if (isNullArray[Anum_pg_dist_partition_repmodel - 1])
	{
		/*
		 * repmodel is NOT NULL, but before ALTER EXTENSION citus UPGRADE the
		 * column doesn't exist.
		 */
		cacheEntry->replicationModel = 'c';
	}
	else
	{
		cacheEntry->replicationModel = DatumGetChar(replicationModelDatum);
	}

	if (isNullArray[Anum_pg_dist_partition_autoconverted - 1])
	{
		/*
		 * We don't expect this to happen, but set it to false (the default value)
		 * to not break if anything goes wrong.
		 */
		cacheEntry->autoConverted = false;
	}
	else
	{
		cacheEntry->autoConverted = DatumGetBool(
			datumArray[Anum_pg_dist_partition_autoconverted - 1]);
	}

	heap_freetuple(distPartitionTuple);

	BuildCachedShardList(cacheEntry);

	/* we only need hash functions for hash distributed tables */
	if (cacheEntry->partitionMethod == DISTRIBUTE_BY_HASH)
	{
		Var *partitionColumn = cacheEntry->partitionColumn;

		TypeCacheEntry *typeEntry = lookup_type_cache(partitionColumn->vartype,
													  TYPECACHE_HASH_PROC_FINFO);

		FmgrInfo *hashFunction = MemoryContextAllocZero(MetadataCacheMemoryContext,
														sizeof(FmgrInfo));

		fmgr_info_copy(hashFunction, &(typeEntry->hash_proc_finfo),
					   MetadataCacheMemoryContext);

		cacheEntry->hashFunction = hashFunction;

		/* check the shard distribution for hash partitioned tables */
		cacheEntry->hasUniformHashDistribution =
			HasUniformHashDistribution(cacheEntry->sortedShardIntervalArray,
									   cacheEntry->shardIntervalArrayLength);
	}
	else
	{
		cacheEntry->hashFunction = NULL;
	}

	oldContext = MemoryContextSwitchTo(MetadataCacheMemoryContext);

	cacheEntry->referencedRelationsViaForeignKey = ReferencedRelationIdList(
		cacheEntry->relationId);
	cacheEntry->referencingRelationsViaForeignKey = ReferencingRelationIdList(
		cacheEntry->relationId);

	MemoryContextSwitchTo(oldContext);

	table_close(pgDistPartition, NoLock);

	cacheEntry->isValid = true;

	return cacheEntry;
}
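
/*
 * A minimal usage sketch for the entry built above (illustrative only,
 * assuming the GetCitusTableCacheEntry() accessor declared in
 * metadata_cache.h):
 *
 *     CitusTableCacheEntry *entry = GetCitusTableCacheEntry(relationId);
 *     if (entry->partitionMethod == DISTRIBUTE_BY_HASH)
 *     {
 *         Var *distributionColumn = entry->partitionColumn;
 *         FmgrInfo *hashFunction = entry->hashFunction;
 *         // hash a value with hashFunction and route it to a shard ...
 *     }
 */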


/*
 * BuildCachedShardList() is a helper routine for BuildCitusTableCacheEntry()
 * for building up the list of shards in a distributed relation.
 */
static void
BuildCachedShardList(CitusTableCacheEntry *cacheEntry)
{
	ShardInterval **shardIntervalArray = NULL;
	ShardInterval **sortedShardIntervalArray = NULL;
	FmgrInfo *shardIntervalCompareFunction = NULL;
	FmgrInfo *shardColumnCompareFunction = NULL;
	Oid columnTypeId = InvalidOid;
	int32 columnTypeMod = -1;
	Oid intervalTypeId = InvalidOid;
	int32 intervalTypeMod = -1;

	GetPartitionTypeInputInfo(cacheEntry->partitionKeyString,
							  cacheEntry->partitionMethod,
							  &columnTypeId,
							  &columnTypeMod,
							  &intervalTypeId,
							  &intervalTypeMod);

	List *distShardTupleList = LookupDistShardTuples(cacheEntry->relationId);
	int shardIntervalArrayLength = list_length(distShardTupleList);
	if (shardIntervalArrayLength > 0)
	{
		Relation distShardRelation = table_open(DistShardRelationId(), AccessShareLock);
		TupleDesc distShardTupleDesc = RelationGetDescr(distShardRelation);
		int arrayIndex = 0;

		shardIntervalArray = MemoryContextAllocZero(MetadataCacheMemoryContext,
													shardIntervalArrayLength *
													sizeof(ShardInterval *));

		cacheEntry->arrayOfPlacementArrays =
			MemoryContextAllocZero(MetadataCacheMemoryContext,
								   shardIntervalArrayLength *
								   sizeof(GroupShardPlacement *));
		cacheEntry->arrayOfPlacementArrayLengths =
			MemoryContextAllocZero(MetadataCacheMemoryContext,
								   shardIntervalArrayLength *
								   sizeof(int));

		HeapTuple shardTuple = NULL;
		foreach_ptr(shardTuple, distShardTupleList)
		{
			ShardInterval *shardInterval = TupleToShardInterval(shardTuple,
																distShardTupleDesc,
																intervalTypeId,
																intervalTypeMod);
			MemoryContext oldContext = MemoryContextSwitchTo(MetadataCacheMemoryContext);

			shardIntervalArray[arrayIndex] = CopyShardInterval(shardInterval);

			MemoryContextSwitchTo(oldContext);

			heap_freetuple(shardTuple);

			arrayIndex++;
		}

		table_close(distShardRelation, AccessShareLock);
	}

	/* look up value comparison function */
	if (columnTypeId != InvalidOid)
	{
		/* allocate the comparison function in the cache context */
		MemoryContext oldContext = MemoryContextSwitchTo(MetadataCacheMemoryContext);

		shardColumnCompareFunction = GetFunctionInfo(columnTypeId, BTREE_AM_OID,
													 BTORDER_PROC);
		MemoryContextSwitchTo(oldContext);
	}
	else
	{
		shardColumnCompareFunction = NULL;
	}

	/* look up interval comparison function */
	if (intervalTypeId != InvalidOid)
	{
		/* allocate the comparison function in the cache context */
		MemoryContext oldContext = MemoryContextSwitchTo(MetadataCacheMemoryContext);

		shardIntervalCompareFunction = GetFunctionInfo(intervalTypeId, BTREE_AM_OID,
													   BTORDER_PROC);
		MemoryContextSwitchTo(oldContext);
	}
	else
	{
		shardIntervalCompareFunction = NULL;
	}

	/* reference tables have a single shard which is not initialized */
	if (cacheEntry->partitionMethod == DISTRIBUTE_BY_NONE)
	{
		cacheEntry->hasUninitializedShardInterval = true;
		cacheEntry->hasOverlappingShardInterval = true;

		/*
		 * Note that during the create_reference_table() call,
		 * the reference table does not have any shards yet.
		 */
		if (shardIntervalArrayLength > 1)
		{
			char *relationName = get_rel_name(cacheEntry->relationId);

			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("reference table \"%s\" has more than 1 shard",
								   relationName)));
		}

		/* since there are zero or one shards, the array is already sorted */
		sortedShardIntervalArray = shardIntervalArray;
	}
	else
	{
		/* sort the interval array */
		sortedShardIntervalArray = SortShardIntervalArray(shardIntervalArray,
														  shardIntervalArrayLength,
														  cacheEntry->partitionColumn->
														  varcollid,
														  shardIntervalCompareFunction);

		/* check if there exist any shard intervals with no min/max values */
		cacheEntry->hasUninitializedShardInterval =
			HasUninitializedShardInterval(sortedShardIntervalArray,
										  shardIntervalArrayLength);

		if (!cacheEntry->hasUninitializedShardInterval)
		{
			cacheEntry->hasOverlappingShardInterval =
				HasOverlappingShardInterval(sortedShardIntervalArray,
											shardIntervalArrayLength,
											cacheEntry->partitionColumn->varcollid,
											shardIntervalCompareFunction);
		}
		else
		{
			cacheEntry->hasOverlappingShardInterval = true;
		}

		ErrorIfInconsistentShardIntervals(cacheEntry);
	}

	cacheEntry->sortedShardIntervalArray = sortedShardIntervalArray;
	cacheEntry->shardIntervalArrayLength = 0;

	/* maintain shardId->(table,ShardInterval) cache */
	for (int shardIndex = 0; shardIndex < shardIntervalArrayLength; shardIndex++)
	{
		ShardInterval *shardInterval = sortedShardIntervalArray[shardIndex];
		int64 shardId = shardInterval->shardId;
		int placementOffset = 0;

		/*
		 * Enable quick lookups of this shard ID by adding it to ShardIdCacheHash
		 * or overwriting the previous values.
		 */
		ShardIdCacheEntry *shardIdCacheEntry =
			hash_search(ShardIdCacheHash, &shardId, HASH_ENTER, NULL);

		shardIdCacheEntry->tableEntry = cacheEntry;
		shardIdCacheEntry->shardIndex = shardIndex;

		/*
		 * We should increment this only after we are sure this hasn't already
		 * been assigned to any other relations. ResetCitusTableCacheEntry()
		 * depends on this.
		 */
		cacheEntry->shardIntervalArrayLength++;

		/* build list of shard placements */
		List *placementList = BuildShardPlacementList(shardId);
		int numberOfPlacements = list_length(placementList);

		/* and copy that list into the cache entry */
		MemoryContext oldContext = MemoryContextSwitchTo(MetadataCacheMemoryContext);
		GroupShardPlacement *placementArray = palloc0(numberOfPlacements *
													  sizeof(GroupShardPlacement));
		GroupShardPlacement *srcPlacement = NULL;
		foreach_ptr(srcPlacement, placementList)
		{
			placementArray[placementOffset] = *srcPlacement;
			placementOffset++;
		}
		MemoryContextSwitchTo(oldContext);

		cacheEntry->arrayOfPlacementArrays[shardIndex] = placementArray;
		cacheEntry->arrayOfPlacementArrayLengths[shardIndex] = numberOfPlacements;

		/* store the shard index in the ShardInterval */
		shardInterval->shardIndex = shardIndex;
	}

	cacheEntry->shardColumnCompareFunction = shardColumnCompareFunction;
	cacheEntry->shardIntervalCompareFunction = shardIntervalCompareFunction;
}
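
/*
 * A minimal sketch of how the arrays built above are consumed (illustrative
 * only, assuming the FindShardIntervalIndex() helper from
 * shardinterval_utils.c):
 *
 *     int shardIndex = FindShardIntervalIndex(hashedValue, cacheEntry);
 *     ShardInterval *shard = cacheEntry->sortedShardIntervalArray[shardIndex];
 *     GroupShardPlacement *placements =
 *         cacheEntry->arrayOfPlacementArrays[shardIndex];
 *     int placementCount = cacheEntry->arrayOfPlacementArrayLengths[shardIndex];
 */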


/*
 * ErrorIfInconsistentShardIntervals checks if shard intervals are consistent with
 * our expectations.
 */
void
ErrorIfInconsistentShardIntervals(CitusTableCacheEntry *cacheEntry)
{
	/*
	 * If the table is hash-partitioned and has shards, there should never be
	 * any uninitialized shards. Historically we've not prevented that for range
	 * partitioned tables, but it might be a good idea to start doing so.
	 */
	if (cacheEntry->partitionMethod == DISTRIBUTE_BY_HASH &&
		cacheEntry->hasUninitializedShardInterval)
	{
		ereport(ERROR, (errmsg("hash partitioned table has uninitialized shards")));
	}
	if (cacheEntry->partitionMethod == DISTRIBUTE_BY_HASH &&
		cacheEntry->hasOverlappingShardInterval)
	{
		ereport(ERROR, (errmsg("hash partitioned table has overlapping shards")));
	}
}


/*
 * HasUniformHashDistribution determines whether the given list of sorted shards
 * has a uniform hash distribution, as produced by master_create_worker_shards for
 * hash partitioned tables.
 */
bool
HasUniformHashDistribution(ShardInterval **shardIntervalArray,
						   int shardIntervalArrayLength)
{
	/* if there are no shards, there is no uniform distribution */
	if (shardIntervalArrayLength == 0)
	{
		return false;
	}

	/* calculate the hash token increment */
	uint64 hashTokenIncrement = HASH_TOKEN_COUNT / shardIntervalArrayLength;

	for (int shardIndex = 0; shardIndex < shardIntervalArrayLength; shardIndex++)
	{
		ShardInterval *shardInterval = shardIntervalArray[shardIndex];
		int32 shardMinHashToken = PG_INT32_MIN + (shardIndex * hashTokenIncrement);
		int32 shardMaxHashToken = shardMinHashToken + (hashTokenIncrement - 1);

		if (shardIndex == (shardIntervalArrayLength - 1))
		{
			shardMaxHashToken = PG_INT32_MAX;
		}

		if (DatumGetInt32(shardInterval->minValue) != shardMinHashToken ||
			DatumGetInt32(shardInterval->maxValue) != shardMaxHashToken)
		{
			return false;
		}
	}

	return true;
}
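
/*
 * Worked example for the token math above: with 32 shards,
 * hashTokenIncrement = HASH_TOKEN_COUNT / 32 = 2^32 / 32 = 134217728, so a
 * uniform distribution expects
 *
 *     shard 0:  [-2147483648, -2013265921]
 *     shard 1:  [-2013265920, -1879048193]
 *     ...
 *     shard 31: [ 2013265920,  2147483647]
 *
 * with the last shard's max clamped to PG_INT32_MAX to absorb any rounding
 * remainder when the shard count does not divide 2^32 evenly.
 */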


/*
 * HasUninitializedShardInterval returns true if any of the elements of the
 * sortedShardIntervalArray lacks min/max values. Callers of the function must
 * ensure that the input shard interval array is sorted on shardminvalue and
 * that uninitialized shard intervals are at the end of the array.
 */
bool
HasUninitializedShardInterval(ShardInterval **sortedShardIntervalArray, int shardCount)
{
	bool hasUninitializedShardInterval = false;

	if (shardCount == 0)
	{
		return hasUninitializedShardInterval;
	}

	Assert(sortedShardIntervalArray != NULL);

	/*
	 * Since the shard interval array is sorted, and uninitialized ones are
	 * stored at the end of the array, checking the last element is enough.
	 */
	ShardInterval *lastShardInterval = sortedShardIntervalArray[shardCount - 1];
	if (!lastShardInterval->minValueExists || !lastShardInterval->maxValueExists)
	{
		hasUninitializedShardInterval = true;
	}

	return hasUninitializedShardInterval;
}


/*
 * HasOverlappingShardInterval determines whether the given list of sorted
 * shards has overlapping ranges.
 */
bool
HasOverlappingShardInterval(ShardInterval **shardIntervalArray,
							int shardIntervalArrayLength,
							Oid shardIntervalCollation,
							FmgrInfo *shardIntervalSortCompareFunction)
{
	Datum comparisonDatum = 0;
	int comparisonResult = 0;

	/* zero/a single shard can't overlap */
	if (shardIntervalArrayLength < 2)
	{
		return false;
	}

	ShardInterval *lastShardInterval = shardIntervalArray[0];
	for (int shardIndex = 1; shardIndex < shardIntervalArrayLength; shardIndex++)
	{
		ShardInterval *curShardInterval = shardIntervalArray[shardIndex];

		/* only called if !hasUninitializedShardInterval */
		Assert(lastShardInterval->minValueExists && lastShardInterval->maxValueExists);
		Assert(curShardInterval->minValueExists && curShardInterval->maxValueExists);

		comparisonDatum = FunctionCall2Coll(shardIntervalSortCompareFunction,
											shardIntervalCollation,
											lastShardInterval->maxValue,
											curShardInterval->minValue);
		comparisonResult = DatumGetInt32(comparisonDatum);

		if (comparisonResult >= 0)
		{
			return true;
		}

		lastShardInterval = curShardInterval;
	}

	return false;
}
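
/*
 * Worked example for the comparison above (illustrative): for sorted integer
 * ranges [0, 10] and [10, 20], compare(lastMax = 10, curMin = 10) returns 0,
 * so the pair counts as overlapping; [0, 9] followed by [10, 20] yields a
 * negative result and passes.
 */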


/*
 * CitusHasBeenLoaded returns true if the citus extension has been created
 * in the current database and the extension script has been executed. Otherwise,
 * it returns false. The result is cached as this is called very frequently.
 */
bool
CitusHasBeenLoaded(void)
{
	/*
	 * We do not use Citus hooks during CREATE/ALTER EXTENSION citus
	 * since the objects used by the C code might not be there yet.
	 */
	if (creating_extension)
	{
		Oid citusExtensionOid = get_extension_oid("citus", true);

		if (CurrentExtensionObject == citusExtensionOid)
		{
			return false;
		}
	}

	/*
	 * If extensionCreatedState is UNKNOWN, query pg_extension for Citus
	 * and cache the result. Otherwise return the value extensionCreatedState
	 * indicates.
	 */
	if (MetadataCache.extensionCreatedState == UNKNOWN)
	{
		bool extensionCreated = CitusHasBeenLoadedInternal();

		if (extensionCreated)
		{
			/*
			 * Loaded Citus for the first time in this session, or first time after
			 * CREATE/ALTER EXTENSION citus. Do some initialization.
			 */

			/*
			 * Make sure the maintenance daemon is running if it was not already.
			 */
			StartupCitusBackend();

			/*
			 * This needs to be initialized so we can receive foreign relation graph
			 * invalidation messages in InvalidateForeignRelationGraphCacheCallback().
			 * See the comments of InvalidateForeignKeyGraph for more context.
			 */
			DistColocationRelationId();

			MetadataCache.extensionCreatedState = CREATED;
		}
		else
		{
			MetadataCache.extensionCreatedState = NOTCREATED;
		}
	}

	return (MetadataCache.extensionCreatedState == CREATED) ? true : false;
}
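
/*
 * A minimal sketch of the usual guard pattern around the check above
 * (illustrative only; the hook name below is an assumption, not code from
 * this file):
 *
 *     static PlannedStmt *
 *     some_planner_hook(Query *parse, const char *query_string,
 *                       int cursorOptions, ParamListInfo boundParams)
 *     {
 *         if (!CitusHasBeenLoaded())
 *         {
 *             return standard_planner(parse, query_string, cursorOptions,
 *                                     boundParams);
 *         }
 *         // distributed planning ...
 *     }
 */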


/*
 * CitusHasBeenLoadedInternal returns true if the citus extension has been created
 * in the current database and the extension script has been executed. Otherwise,
 * it returns false.
 */
static bool
CitusHasBeenLoadedInternal(void)
{
	if (IsBinaryUpgrade)
	{
		/* never use Citus logic during pg_upgrade */
		return false;
	}

	Oid citusExtensionOid = get_extension_oid("citus", true);
	if (citusExtensionOid == InvalidOid)
	{
		/* Citus extension does not exist yet */
		return false;
	}

	/* citus extension exists and has been created */
	return true;
}


/*
 * GetCitusCreationLevel returns the nesting level of the transaction that
 * created the Citus extension.
 */
int
GetCitusCreationLevel(void)
{
	return CreateCitusTransactionLevel;
}


/*
 * SetCreateCitusTransactionLevel sets CreateCitusTransactionLevel to the given
 * value, which represents the nesting level of the transaction that created
 * the Citus extension.
 */
void
SetCreateCitusTransactionLevel(int val)
{
	CreateCitusTransactionLevel = val;
}


/*
 * CheckCitusVersion checks whether there is a version mismatch between the
 * available version and the loaded version or between the installed version
 * and the loaded version. Returns true if compatible, false otherwise.
 *
 * As a side effect, this function also sets the citusVersionKnownCompatible
 * global variable to true, which reduces the cost of subsequent version checks.
 */
bool
CheckCitusVersion(int elevel)
{
	if (citusVersionKnownCompatible ||
		!CitusHasBeenLoaded() ||
		!EnableVersionChecks)
	{
		return true;
	}

	if (CheckAvailableVersion(elevel) && CheckInstalledVersion(elevel))
	{
		citusVersionKnownCompatible = true;
		return true;
	}
	else
	{
		return false;
	}
}
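
/*
 * Typical call pattern for the check above (illustrative; the trigger
 * functions later in this file use exactly this guard):
 *
 *     Datum
 *     some_citus_udf(PG_FUNCTION_ARGS)
 *     {
 *         CheckCitusVersion(ERROR);    // errors out on a version mismatch
 *         ...
 *     }
 */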


/*
 * CheckAvailableVersion compares CITUS_EXTENSIONVERSION and the currently
 * available version from the citus.control file. If they are not compatible,
 * this function logs an error with the specified elevel and returns false,
 * otherwise it returns true.
 */
bool
CheckAvailableVersion(int elevel)
{
	if (!EnableVersionChecks)
	{
		return true;
	}

	char *availableVersion = AvailableExtensionVersion();

	if (!MajorVersionsCompatible(availableVersion, CITUS_EXTENSIONVERSION))
	{
		ereport(elevel, (errmsg("loaded Citus library version differs from latest "
								"available extension version"),
						 errdetail("Loaded library requires %s, but the latest control "
								   "file specifies %s.", CITUS_MAJORVERSION,
								   availableVersion),
						 errhint("Restart the database to load the latest Citus "
								 "library.")));
		return false;
	}

	return true;
}


/*
 * CheckInstalledVersion compares CITUS_EXTENSIONVERSION and the
 * extension's current version from the pg_extension catalog table. If they
 * are not compatible, this function logs a message at the specified elevel
 * and returns false; otherwise it returns true.
 */
static bool
CheckInstalledVersion(int elevel)
{
	Assert(CitusHasBeenLoaded());
	Assert(EnableVersionChecks);

	char *installedVersion = InstalledExtensionVersion();

	if (!MajorVersionsCompatible(installedVersion, CITUS_EXTENSIONVERSION))
	{
		ereport(elevel, (errmsg("loaded Citus library version differs from installed "
								"extension version"),
						 errdetail("Loaded library requires %s, but the installed "
								   "extension version is %s.", CITUS_MAJORVERSION,
								   installedVersion),
						 errhint("Run ALTER EXTENSION citus UPDATE and try again.")));
		return false;
	}

	return true;
}


/*
 * InstalledAndAvailableVersionsSame compares the extension's available version
 * and its current version from the pg_extension catalog table. If they are not
 * the same, it returns false; otherwise it returns true.
 */
bool
InstalledAndAvailableVersionsSame()
{
	char *installedVersion = InstalledExtensionVersion();
	char *availableVersion = AvailableExtensionVersion();

	if (strncmp(installedVersion, availableVersion, NAMEDATALEN) == 0)
	{
		return true;
	}

	return false;
}


/*
 * MajorVersionsCompatible checks whether both versions are compatible. They
 * are if major and minor version numbers match; the schema version is
 * ignored. Returns true if compatible, false otherwise.
 */
bool
MajorVersionsCompatible(char *leftVersion, char *rightVersion)
{
	const char schemaVersionSeparator = '-';

	char *leftSeperatorPosition = strchr(leftVersion, schemaVersionSeparator);
	char *rightSeperatorPosition = strchr(rightVersion, schemaVersionSeparator);
	int leftComparisionLimit = 0;
	int rightComparisionLimit = 0;

	if (leftSeperatorPosition != NULL)
	{
		leftComparisionLimit = leftSeperatorPosition - leftVersion;
	}
	else
	{
		leftComparisionLimit = strlen(leftVersion);
	}

	if (rightSeperatorPosition != NULL)
	{
		rightComparisionLimit = rightSeperatorPosition - rightVersion;
	}
	else
	{
		rightComparisionLimit = strlen(rightVersion);
	}

	/* we can error out early if hyphens are not in the same position */
	if (leftComparisionLimit != rightComparisionLimit)
	{
		return false;
	}

	return strncmp(leftVersion, rightVersion, leftComparisionLimit) == 0;
}
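
/*
 * Worked examples for the rule above (illustrative): "11.2-1" and "11.2-2"
 * compare equal up to the hyphen and are compatible; "11.2-1" and "11.3-1"
 * differ within the major.minor prefix and are not; "11.10-1" and "11.1-1"
 * have hyphens at different offsets (5 vs 4) and are rejected without a
 * string comparison.
 */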


/*
 * AvailableExtensionVersion returns the Citus version from the citus.control
 * file. It also saves the result, so consecutive calls to
 * AvailableExtensionVersion will not read the citus.control file again.
 */
static char *
AvailableExtensionVersion(void)
{
	LOCAL_FCINFO(fcinfo, 0);
	FmgrInfo flinfo;

	bool goForward = true;
	bool doCopy = false;
	char *availableExtensionVersion;

	InitializeCaches();

	EState *estate = CreateExecutorState();
	ReturnSetInfo *extensionsResultSet = makeNode(ReturnSetInfo);
	extensionsResultSet->econtext = GetPerTupleExprContext(estate);
	extensionsResultSet->allowedModes = SFRM_Materialize;

	fmgr_info(F_PG_AVAILABLE_EXTENSIONS, &flinfo);
	InitFunctionCallInfoData(*fcinfo, &flinfo, 0, InvalidOid, NULL,
							 (Node *) extensionsResultSet);

	/* pg_available_extensions returns a result set containing all available extensions */
	(*pg_available_extensions)(fcinfo);

	TupleTableSlot *tupleTableSlot = MakeSingleTupleTableSlot(
		extensionsResultSet->setDesc,
		&TTSOpsMinimalTuple);
	bool hasTuple = tuplestore_gettupleslot(extensionsResultSet->setResult, goForward,
											doCopy,
											tupleTableSlot);
	while (hasTuple)
	{
		bool isNull = false;

		Datum extensionNameDatum = slot_getattr(tupleTableSlot, 1, &isNull);
		char *extensionName = NameStr(*DatumGetName(extensionNameDatum));
		if (strcmp(extensionName, "citus") == 0)
		{
			Datum availableVersion = slot_getattr(tupleTableSlot, 2, &isNull);

			/* we will cache the result of citus version to prevent catalog access */
			MemoryContext oldMemoryContext = MemoryContextSwitchTo(
				MetadataCacheMemoryContext);

			availableExtensionVersion = text_to_cstring(DatumGetTextPP(availableVersion));

			MemoryContextSwitchTo(oldMemoryContext);

			ExecClearTuple(tupleTableSlot);
			ExecDropSingleTupleTableSlot(tupleTableSlot);

			return availableExtensionVersion;
		}

		ExecClearTuple(tupleTableSlot);
		hasTuple = tuplestore_gettupleslot(extensionsResultSet->setResult, goForward,
										   doCopy, tupleTableSlot);
	}

	ExecDropSingleTupleTableSlot(tupleTableSlot);

	ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					errmsg("citus extension is not found")));

	return NULL; /* keep compiler happy */
}
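
/*
 * The in-process scan above is equivalent to this catalog query
 * (illustrative):
 *
 *     SELECT default_version
 *     FROM pg_available_extensions
 *     WHERE name = 'citus';
 */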


/*
 * InstalledExtensionVersion returns the Citus version in the PostgreSQL
 * pg_extension table.
 */
static char *
InstalledExtensionVersion(void)
{
	ScanKeyData entry[1];
	char *installedExtensionVersion = NULL;

	InitializeCaches();

	Relation relation = table_open(ExtensionRelationId, AccessShareLock);

	ScanKeyInit(&entry[0], Anum_pg_extension_extname, BTEqualStrategyNumber, F_NAMEEQ,
				CStringGetDatum("citus"));

	SysScanDesc scandesc = systable_beginscan(relation, ExtensionNameIndexId, true,
											  NULL, 1, entry);

	HeapTuple extensionTuple = systable_getnext(scandesc);

	/* We assume that there can be at most one matching tuple */
	if (HeapTupleIsValid(extensionTuple))
	{
		int extensionIndex = Anum_pg_extension_extversion;
		TupleDesc tupleDescriptor = RelationGetDescr(relation);
		bool isNull = false;

		Datum installedVersion = heap_getattr(extensionTuple, extensionIndex,
											  tupleDescriptor, &isNull);

		if (isNull)
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("citus extension version is null")));
		}

		/* we will cache the result of citus version to prevent catalog access */
		MemoryContext oldMemoryContext = MemoryContextSwitchTo(
			MetadataCacheMemoryContext);

		installedExtensionVersion = text_to_cstring(DatumGetTextPP(installedVersion));

		MemoryContextSwitchTo(oldMemoryContext);
	}
	else
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("citus extension is not loaded")));
	}

	systable_endscan(scandesc);

	table_close(relation, AccessShareLock);

	return installedExtensionVersion;
}
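
/*
 * Equivalent catalog query for the scan above (illustrative):
 *
 *     SELECT extversion FROM pg_extension WHERE extname = 'citus';
 */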


/* return oid of pg_dist_shard relation */
Oid
DistShardRelationId(void)
{
	CachedRelationLookup("pg_dist_shard",
						 &MetadataCache.distShardRelationId);

	return MetadataCache.distShardRelationId;
}


/* return oid of pg_dist_placement relation */
Oid
DistPlacementRelationId(void)
{
	CachedRelationLookup("pg_dist_placement",
						 &MetadataCache.distPlacementRelationId);

	return MetadataCache.distPlacementRelationId;
}


/* return oid of pg_dist_node relation */
Oid
DistNodeRelationId(void)
{
	CachedRelationLookup("pg_dist_node",
						 &MetadataCache.distNodeRelationId);

	return MetadataCache.distNodeRelationId;
}


/* return oid of pg_dist_node's primary key index */
Oid
DistNodeNodeIdIndexId(void)
{
	CachedRelationLookup("pg_dist_node_pkey",
						 &MetadataCache.distNodeNodeIdIndexId);

	return MetadataCache.distNodeNodeIdIndexId;
}


/* return oid of pg_dist_local_group relation */
Oid
DistLocalGroupIdRelationId(void)
{
	CachedRelationLookup("pg_dist_local_group",
						 &MetadataCache.distLocalGroupRelationId);

	return MetadataCache.distLocalGroupRelationId;
}


Oid
DistBackgroundJobRelationId(void)
{
	CachedRelationLookup("pg_dist_background_job",
						 &MetadataCache.distBackgroundJobRelationId);

	return MetadataCache.distBackgroundJobRelationId;
}


Oid
DistBackgroundJobPKeyIndexId(void)
{
	CachedRelationLookup("pg_dist_background_job_pkey",
						 &MetadataCache.distBackgroundJobPKeyIndexId);

	return MetadataCache.distBackgroundJobPKeyIndexId;
}


Oid
DistBackgroundJobJobIdSequenceId(void)
{
	CachedRelationLookup("pg_dist_background_job_job_id_seq",
						 &MetadataCache.distBackgroundJobJobIdSequenceId);

	return MetadataCache.distBackgroundJobJobIdSequenceId;
}


Oid
DistBackgroundTaskRelationId(void)
{
	CachedRelationLookup("pg_dist_background_task",
						 &MetadataCache.distBackgroundTaskRelationId);

	return MetadataCache.distBackgroundTaskRelationId;
}


Oid
DistBackgroundTaskPKeyIndexId(void)
{
	CachedRelationLookup("pg_dist_background_task_pkey",
						 &MetadataCache.distBackgroundTaskPKeyIndexId);

	return MetadataCache.distBackgroundTaskPKeyIndexId;
}


Oid
DistBackgroundTaskJobIdTaskIdIndexId(void)
{
	CachedRelationLookup("pg_dist_background_task_job_id_task_id",
						 &MetadataCache.distBackgroundTaskJobIdTaskIdIndexId);

	return MetadataCache.distBackgroundTaskJobIdTaskIdIndexId;
}


Oid
DistBackgroundTaskStatusTaskIdIndexId(void)
{
	CachedRelationLookup("pg_dist_background_task_status_task_id_index",
						 &MetadataCache.distBackgroundTaskStatusTaskIdIndexId);

	return MetadataCache.distBackgroundTaskStatusTaskIdIndexId;
}


Oid
DistBackgroundTaskTaskIdSequenceId(void)
{
	CachedRelationLookup("pg_dist_background_task_task_id_seq",
						 &MetadataCache.distBackgroundTaskTaskIdSequenceId);

	return MetadataCache.distBackgroundTaskTaskIdSequenceId;
}


Oid
DistClockLogicalSequenceId(void)
{
	CachedRelationLookup("pg_dist_clock_logical_seq",
						 &MetadataCache.distClockLogicalSequenceId);

	return MetadataCache.distClockLogicalSequenceId;
}


Oid
DistBackgroundTaskDependRelationId(void)
{
	CachedRelationLookup("pg_dist_background_task_depend",
						 &MetadataCache.distBackgroundTaskDependRelationId);

	return MetadataCache.distBackgroundTaskDependRelationId;
}


Oid
DistBackgroundTaskDependTaskIdIndexId(void)
{
	CachedRelationLookup("pg_dist_background_task_depend_task_id",
						 &MetadataCache.distBackgroundTaskDependTaskIdIndexId);

	return MetadataCache.distBackgroundTaskDependTaskIdIndexId;
}


Oid
DistBackgroundTaskDependDependsOnIndexId(void)
{
	CachedRelationLookup("pg_dist_background_task_depend_depends_on",
						 &MetadataCache.distBackgroundTaskDependDependsOnIndexId);

	return MetadataCache.distBackgroundTaskDependDependsOnIndexId;
}


/* return oid of pg_dist_rebalance_strategy relation */
Oid
DistRebalanceStrategyRelationId(void)
{
	CachedRelationLookup("pg_dist_rebalance_strategy",
						 &MetadataCache.distRebalanceStrategyRelationId);

	return MetadataCache.distRebalanceStrategyRelationId;
}


/* return the oid of citus namespace */
Oid
CitusCatalogNamespaceId(void)
{
	CachedNamespaceLookup("citus", &MetadataCache.citusCatalogNamespaceId);
	return MetadataCache.citusCatalogNamespaceId;
}


/* return oid of pg_dist_object relation */
Oid
DistObjectRelationId(void)
{
	/*
	 * In older versions pg_dist_object was living in the `citus` namespace; with
	 * Citus 11 it has been moved to pg_catalog.
	 *
	 * During upgrades it could therefore be that we simply need to look in the old
	 * catalog. Since we expect to find it most of the time in the pg_catalog schema
	 * from now on, we will start there.
	 *
	 * Even after the table has been moved, the oids stay the same, so we don't have
	 * to invalidate the cache after a move.
	 *
	 * Note: during testing we also up/downgrade the extension, and sometimes interact
	 * with the database when the schema and the binary are not in sync. Hence we
	 * always allow the catalog to be missing on our first lookup. The error message
	 * might therefore become misleading, as it will complain about
	 * citus.pg_dist_object not being found when called too early.
	 */
	CachedRelationLookupExtended("pg_dist_object",
								 &MetadataCache.distObjectRelationId,
								 true);
	if (!OidIsValid(MetadataCache.distObjectRelationId))
	{
		/*
		 * We can only ever reach here while we are creating/altering our extension
		 * before the table is moved to pg_catalog.
		 */
		CachedRelationNamespaceLookupExtended("pg_dist_object",
											  CitusCatalogNamespaceId(),
											  &MetadataCache.distObjectRelationId,
											  false);
	}

	return MetadataCache.distObjectRelationId;
}


/* return oid of pg_dist_object_pkey */
Oid
DistObjectPrimaryKeyIndexId(void)
{
	/*
	 * In older versions pg_dist_object was living in the `citus` namespace; with
	 * Citus 11 it has been moved to pg_catalog.
	 *
	 * During upgrades it could therefore be that we simply need to look in the old
	 * catalog. Since we expect to find it most of the time in the pg_catalog schema
	 * from now on, we will start there.
	 *
	 * Even after the table has been moved, the oids stay the same, so we don't have
	 * to invalidate the cache after a move.
	 *
	 * Note: during testing we also up/downgrade the extension, and sometimes interact
	 * with the database when the schema and the binary are not in sync. Hence we
	 * always allow the catalog to be missing on our first lookup. The error message
	 * might therefore become misleading, as it will complain about
	 * citus.pg_dist_object not being found when called too early.
	 */
	CachedRelationLookupExtended("pg_dist_object_pkey",
								 &MetadataCache.distObjectPrimaryKeyIndexId,
								 true);

	if (!OidIsValid(MetadataCache.distObjectPrimaryKeyIndexId))
	{
		/*
		 * We can only ever reach here while we are creating/altering our extension
		 * before the table is moved to pg_catalog.
		 */
		CachedRelationNamespaceLookupExtended("pg_dist_object_pkey",
											  CitusCatalogNamespaceId(),
											  &MetadataCache.distObjectPrimaryKeyIndexId,
											  false);
	}

	return MetadataCache.distObjectPrimaryKeyIndexId;
}


/* return oid of pg_dist_cleanup relation */
Oid
DistCleanupRelationId(void)
{
	CachedRelationLookup("pg_dist_cleanup",
						 &MetadataCache.distCleanupRelationId);

	return MetadataCache.distCleanupRelationId;
}


/* return oid of pg_dist_cleanup primary key index */
Oid
DistCleanupPrimaryKeyIndexId(void)
{
	CachedRelationLookup("pg_dist_cleanup_pkey",
						 &MetadataCache.distCleanupPrimaryKeyIndexId);

	return MetadataCache.distCleanupPrimaryKeyIndexId;
}


/* return oid of pg_dist_colocation relation */
Oid
DistColocationRelationId(void)
{
	CachedRelationLookup("pg_dist_colocation",
						 &MetadataCache.distColocationRelationId);

	return MetadataCache.distColocationRelationId;
}


/* return oid of pg_dist_colocation_configuration_index index */
Oid
DistColocationConfigurationIndexId(void)
{
	CachedRelationLookup("pg_dist_colocation_configuration_index",
						 &MetadataCache.distColocationConfigurationIndexId);

	return MetadataCache.distColocationConfigurationIndexId;
}


/* return oid of pg_dist_schema relation */
Oid
DistTenantSchemaRelationId(void)
{
	CachedRelationLookup("pg_dist_schema",
						 &MetadataCache.distTenantSchemaRelationId);

	return MetadataCache.distTenantSchemaRelationId;
}


/* return oid of pg_dist_schema_pkey index */
Oid
DistTenantSchemaPrimaryKeyIndexId(void)
{
	CachedRelationLookup("pg_dist_schema_pkey",
						 &MetadataCache.distTenantSchemaPrimaryKeyIndexId);

	return MetadataCache.distTenantSchemaPrimaryKeyIndexId;
}


/* return oid of pg_dist_schema_unique_colocationid_index index */
Oid
DistTenantSchemaUniqueColocationIdIndexId(void)
{
	CachedRelationLookup("pg_dist_schema_unique_colocationid_index",
						 &MetadataCache.distTenantSchemaUniqueColocationIdIndexId);

	return MetadataCache.distTenantSchemaUniqueColocationIdIndexId;
}


/* return oid of pg_dist_partition relation */
Oid
DistPartitionRelationId(void)
{
	CachedRelationLookup("pg_dist_partition",
						 &MetadataCache.distPartitionRelationId);

	return MetadataCache.distPartitionRelationId;
}


/* return oid of pg_dist_partition_logical_relid_index index */
Oid
DistPartitionLogicalRelidIndexId(void)
{
	CachedRelationLookup("pg_dist_partition_logical_relid_index",
						 &MetadataCache.distPartitionLogicalRelidIndexId);

	return MetadataCache.distPartitionLogicalRelidIndexId;
}


/* return oid of pg_dist_partition_colocationid_index index */
Oid
DistPartitionColocationidIndexId(void)
{
	CachedRelationLookup("pg_dist_partition_colocationid_index",
						 &MetadataCache.distPartitionColocationidIndexId);

	return MetadataCache.distPartitionColocationidIndexId;
}


/* return oid of pg_dist_shard_logical_relid_index index */
Oid
DistShardLogicalRelidIndexId(void)
{
	CachedRelationLookup("pg_dist_shard_logical_relid_index",
						 &MetadataCache.distShardLogicalRelidIndexId);

	return MetadataCache.distShardLogicalRelidIndexId;
}


/* return oid of pg_dist_shard_shardid_index index */
Oid
DistShardShardidIndexId(void)
{
	CachedRelationLookup("pg_dist_shard_shardid_index",
						 &MetadataCache.distShardShardidIndexId);

	return MetadataCache.distShardShardidIndexId;
}


/* return oid of pg_dist_placement_shardid_index */
Oid
DistPlacementShardidIndexId(void)
{
	CachedRelationLookup("pg_dist_placement_shardid_index",
						 &MetadataCache.distPlacementShardidIndexId);

	return MetadataCache.distPlacementShardidIndexId;
}


/* return oid of pg_dist_placement_placementid_index */
Oid
DistPlacementPlacementidIndexId(void)
{
	CachedRelationLookup("pg_dist_placement_placementid_index",
						 &MetadataCache.distPlacementPlacementidIndexId);

	return MetadataCache.distPlacementPlacementidIndexId;
}


/* return oid of pg_dist_colocation_pkey */
Oid
DistColocationIndexId(void)
{
	CachedRelationLookup("pg_dist_colocation_pkey",
						 &MetadataCache.distColocationidIndexId);

	return MetadataCache.distColocationidIndexId;
}


/* return oid of pg_dist_transaction relation */
Oid
DistTransactionRelationId(void)
{
	CachedRelationLookup("pg_dist_transaction",
						 &MetadataCache.distTransactionRelationId);

	return MetadataCache.distTransactionRelationId;
}


/* return oid of pg_dist_transaction_group_index */
Oid
DistTransactionGroupIndexId(void)
{
	CachedRelationLookup("pg_dist_transaction_group_index",
						 &MetadataCache.distTransactionGroupIndexId);

	return MetadataCache.distTransactionGroupIndexId;
}


/* return oid of pg_dist_placement_groupid_index */
Oid
DistPlacementGroupidIndexId(void)
{
	CachedRelationLookup("pg_dist_placement_groupid_index",
						 &MetadataCache.distPlacementGroupidIndexId);

	return MetadataCache.distPlacementGroupidIndexId;
}


/* return oid of pg_dist_authinfo relation */
static Oid
DistAuthinfoRelationId(void)
{
	CachedRelationLookup("pg_dist_authinfo",
						 &MetadataCache.distAuthinfoRelationId);

	return MetadataCache.distAuthinfoRelationId;
}


/* return oid of pg_dist_authinfo identification index */
static Oid
DistAuthinfoIndexId(void)
{
	CachedRelationLookup("pg_dist_authinfo_identification_index",
						 &MetadataCache.distAuthinfoIndexId);

	return MetadataCache.distAuthinfoIndexId;
}


/* return oid of pg_dist_poolinfo relation */
static Oid
DistPoolinfoRelationId(void)
{
	CachedRelationLookup("pg_dist_poolinfo",
						 &MetadataCache.distPoolinfoRelationId);

	return MetadataCache.distPoolinfoRelationId;
}


/* return oid of pg_dist_poolinfo primary key index */
static Oid
DistPoolinfoIndexId(void)
{
	CachedRelationLookup("pg_dist_poolinfo_pkey",
						 &MetadataCache.distPoolinfoIndexId);

	return MetadataCache.distPoolinfoIndexId;
}


/* return oid of the read_intermediate_result(text,citus_copy_format) function */
Oid
CitusReadIntermediateResultFuncId(void)
{
	if (MetadataCache.readIntermediateResultFuncId == InvalidOid)
	{
		List *functionNameList = list_make2(makeString("pg_catalog"),
											makeString("read_intermediate_result"));
		Oid copyFormatTypeOid = CitusCopyFormatTypeId();
		Oid paramOids[2] = { TEXTOID, copyFormatTypeOid };
		bool missingOK = false;

		MetadataCache.readIntermediateResultFuncId =
			LookupFuncName(functionNameList, 2, paramOids, missingOK);
	}

	return MetadataCache.readIntermediateResultFuncId;
}


/* return oid of the read_intermediate_results(text[],citus_copy_format) function */
Oid
CitusReadIntermediateResultArrayFuncId(void)
{
	if (MetadataCache.readIntermediateResultArrayFuncId == InvalidOid)
	{
		List *functionNameList = list_make2(makeString("pg_catalog"),
											makeString("read_intermediate_results"));
		Oid copyFormatTypeOid = CitusCopyFormatTypeId();
		Oid paramOids[2] = { TEXTARRAYOID, copyFormatTypeOid };
		bool missingOK = false;

		MetadataCache.readIntermediateResultArrayFuncId =
			LookupFuncName(functionNameList, 2, paramOids, missingOK);
	}

	return MetadataCache.readIntermediateResultArrayFuncId;
}


/* return oid of the citus.copy_format enum type */
Oid
CitusCopyFormatTypeId(void)
{
	if (MetadataCache.copyFormatTypeId == InvalidOid)
	{
		char *typeName = "citus_copy_format";
		MetadataCache.copyFormatTypeId = GetSysCacheOid2(TYPENAMENSP,
														 Anum_pg_type_oid,
														 PointerGetDatum(typeName),
														 PG_CATALOG_NAMESPACE);
	}

	return MetadataCache.copyFormatTypeId;
}


/* return oid of the 'binary' citus_copy_format enum value */
Oid
BinaryCopyFormatId(void)
{
	if (MetadataCache.binaryCopyFormatId == InvalidOid)
	{
		Oid copyFormatTypeId = CitusCopyFormatTypeId();
		MetadataCache.binaryCopyFormatId = LookupEnumValueId(copyFormatTypeId, "binary");
	}

	return MetadataCache.binaryCopyFormatId;
}


/* return oid of the 'text' citus_copy_format enum value */
Oid
TextCopyFormatId(void)
{
	if (MetadataCache.textCopyFormatId == InvalidOid)
	{
		Oid copyFormatTypeId = CitusCopyFormatTypeId();
		MetadataCache.textCopyFormatId = LookupEnumValueId(copyFormatTypeId, "text");
	}

	return MetadataCache.textCopyFormatId;
}


/* return oid of the citus_extradata_container(internal) function */
Oid
CitusExtraDataContainerFuncId(void)
{
	List *nameList = NIL;
	Oid paramOids[1] = { INTERNALOID };

	if (MetadataCache.extraDataContainerFuncId == InvalidOid)
	{
		nameList = list_make2(makeString("pg_catalog"),
							  makeString("citus_extradata_container"));
		MetadataCache.extraDataContainerFuncId =
			LookupFuncName(nameList, 1, paramOids, false);
	}

	return MetadataCache.extraDataContainerFuncId;
}


/* return oid of the any_value aggregate function */
Oid
CitusAnyValueFunctionId(void)
{
	if (MetadataCache.anyValueFunctionId == InvalidOid)
	{
		const int argCount = 1;
		MetadataCache.anyValueFunctionId =
			FunctionOid("pg_catalog", "any_value", argCount);
	}

	return MetadataCache.anyValueFunctionId;
}


/* return oid of the citus_text_send_as_jsonb(text) function */
Oid
CitusTextSendAsJsonbFunctionId(void)
{
	if (MetadataCache.textSendAsJsonbFunctionId == InvalidOid)
	{
		List *nameList = list_make2(makeString("pg_catalog"),
									makeString("citus_text_send_as_jsonb"));
		Oid paramOids[1] = { TEXTOID };

		MetadataCache.textSendAsJsonbFunctionId =
			LookupFuncName(nameList, 1, paramOids, false);
	}

	return MetadataCache.textSendAsJsonbFunctionId;
}


/* return oid of the textout(text) function */
Oid
TextOutFunctionId(void)
{
	if (MetadataCache.textoutFunctionId == InvalidOid)
	{
		List *nameList = list_make2(makeString("pg_catalog"),
									makeString("textout"));
		Oid paramOids[1] = { TEXTOID };

		MetadataCache.textoutFunctionId =
			LookupFuncName(nameList, 1, paramOids, false);
	}

	return MetadataCache.textoutFunctionId;
}


/*
 * RelationIsAKnownShardFuncId returns oid of the relation_is_a_known_shard function.
 */
Oid
RelationIsAKnownShardFuncId(void)
{
	if (MetadataCache.relationIsAKnownShardFuncId == InvalidOid)
	{
		const int argCount = 1;

		MetadataCache.relationIsAKnownShardFuncId =
			FunctionOid("pg_catalog", "relation_is_a_known_shard", argCount);
	}

	return MetadataCache.relationIsAKnownShardFuncId;
}


/*
 * JsonbExtractPathFuncId returns oid of the jsonb_extract_path function.
 */
Oid
JsonbExtractPathFuncId(void)
{
	if (MetadataCache.jsonbExtractPathFuncId == InvalidOid)
	{
		const int argCount = 2;

		MetadataCache.jsonbExtractPathFuncId =
			FunctionOid("pg_catalog", "jsonb_extract_path", argCount);
	}

	return MetadataCache.jsonbExtractPathFuncId;
}


/*
 * JsonbExtractPathTextFuncId returns oid of the jsonb_extract_path_text function.
 */
Oid
JsonbExtractPathTextFuncId(void)
{
	if (MetadataCache.jsonbExtractPathTextFuncId == InvalidOid)
	{
		const int argCount = 2;

		MetadataCache.jsonbExtractPathTextFuncId =
			FunctionOid("pg_catalog", "jsonb_extract_path_text", argCount);
	}

	return MetadataCache.jsonbExtractPathTextFuncId;
}


/*
 * CitusDependentObjectFuncId returns oid of the is_citus_depended_object function.
 */
Oid
CitusDependentObjectFuncId(void)
{
	if (!HideCitusDependentObjects)
	{
		ereport(ERROR, (errmsg(
							"is_citus_depended_object can only be used while running the regression tests")));
	}

	if (MetadataCache.CitusDependentObjectFuncId == InvalidOid)
	{
		const int argCount = 2;

		MetadataCache.CitusDependentObjectFuncId =
			FunctionOid("pg_catalog", "is_citus_depended_object", argCount);
	}

	return MetadataCache.CitusDependentObjectFuncId;
}


/*
 * CurrentDatabaseName gets the name of the current database and caches
 * the result.
 *
 * Given that the database name cannot be changed when there is at least
 * one session connected to it, we do not need to implement any invalidation
 * mechanism.
 */
const char *
CurrentDatabaseName(void)
{
	if (!MetadataCache.databaseNameValid)
	{
		char *databaseName = get_database_name(MyDatabaseId);
		if (databaseName == NULL)
		{
			ereport(ERROR, (errmsg("database that is connected to does not exist")));
		}

		strlcpy(MetadataCache.databaseName, databaseName, NAMEDATALEN);
		MetadataCache.databaseNameValid = true;
	}

	return MetadataCache.databaseName;
}


/*
 * CitusExtensionOwner() returns the owner of the 'citus' extension. That user
 * is, amongst others, used to perform actions a normal user might not be
 * allowed to perform.
 */
extern Oid
CitusExtensionOwner(void)
{
	ScanKeyData entry[1];
	Form_pg_extension extensionForm = NULL;

	if (MetadataCache.extensionOwner != InvalidOid)
	{
		return MetadataCache.extensionOwner;
	}

	Relation relation = table_open(ExtensionRelationId, AccessShareLock);

	ScanKeyInit(&entry[0],
				Anum_pg_extension_extname,
				BTEqualStrategyNumber, F_NAMEEQ,
				CStringGetDatum("citus"));

	SysScanDesc scandesc = systable_beginscan(relation, ExtensionNameIndexId, true,
											  NULL, 1, entry);

	HeapTuple extensionTuple = systable_getnext(scandesc);

	/* We assume that there can be at most one matching tuple */
	if (HeapTupleIsValid(extensionTuple))
	{
		extensionForm = (Form_pg_extension) GETSTRUCT(extensionTuple);

		/*
		 * For some operations Citus requires superuser permissions; we use
		 * the extension owner for that. The extension owner is guaranteed to
		 * be a superuser (otherwise C functions can't be created), but it'd
		 * be possible to change the owner. So check that this is still a
		 * superuser.
		 */
		if (!superuser_arg(extensionForm->extowner))
		{
			ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							errmsg("citus extension needs to be owned by superuser")));
		}
		MetadataCache.extensionOwner = extensionForm->extowner;
		Assert(OidIsValid(MetadataCache.extensionOwner));
	}
	else
	{
		ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						errmsg("citus extension not loaded")));
	}

	systable_endscan(scandesc);

	table_close(relation, AccessShareLock);

	return MetadataCache.extensionOwner;
}


/*
 * CitusExtensionOwnerName returns the name of the owner of the extension.
 */
char *
CitusExtensionOwnerName(void)
{
	Oid superUserId = CitusExtensionOwner();

	return GetUserNameFromId(superUserId, false);
}


/* return the username of the currently active role */
char *
CurrentUserName(void)
{
	Oid userId = GetUserId();

	return GetUserNameFromId(userId, false);
}


/*
 * LookupTypeOid returns the Oid of the "{schemaNameSting}.{typeNameString}" type, or
 * InvalidOid if it does not exist.
 */
Oid
LookupTypeOid(char *schemaNameSting, char *typeNameString)
{
	String *schemaName = makeString(schemaNameSting);
	String *typeName = makeString(typeNameString);
	List *qualifiedName = list_make2(schemaName, typeName);
	TypeName *enumTypeName = makeTypeNameFromNameList(qualifiedName);

	/* like typenameTypeId, but returns InvalidOid instead of raising an error */
	Type tup = LookupTypeName(NULL, enumTypeName, NULL, false);
	if (tup == NULL)
	{
		return InvalidOid;
	}

	Oid nodeRoleTypId = ((Form_pg_type) GETSTRUCT(tup))->oid;
	ReleaseSysCache(tup);

	return nodeRoleTypId;
}


/*
 * LookupStringEnumValueId returns the Oid of the value in "pg_catalog.{enumName}"
 * which matches the provided valueName, or InvalidOid if the enum doesn't exist yet.
 */
static Oid
LookupStringEnumValueId(char *enumName, char *valueName)
{
	Oid enumTypeId = LookupTypeOid("pg_catalog", enumName);

	if (enumTypeId == InvalidOid)
	{
		return InvalidOid;
	}
	else
	{
		Oid valueId = LookupEnumValueId(enumTypeId, valueName);
		return valueId;
	}
}


/*
 * LookupEnumValueId looks up the OID of an enum value.
 */
static Oid
LookupEnumValueId(Oid typeId, char *valueName)
{
	Datum typeIdDatum = ObjectIdGetDatum(typeId);
	Datum valueDatum = CStringGetDatum(valueName);
	Datum valueIdDatum = DirectFunctionCall2(enum_in, valueDatum, typeIdDatum);
	Oid valueId = DatumGetObjectId(valueIdDatum);

	return valueId;
}
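
/*
 * A minimal usage sketch for the two lookups above (illustrative only):
 *
 *     Oid nodeRoleTypeId = LookupTypeOid("pg_catalog", "noderole");
 *     Oid primaryRoleId = LookupEnumValueId(nodeRoleTypeId, "primary");
 *
 * which is exactly the pair of steps LookupStringEnumValueId() combines, as
 * used by PrimaryNodeRoleId() below.
 */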


/* return the Oid of the 'primary' nodeRole enum value */
Oid
PrimaryNodeRoleId(void)
{
	if (!MetadataCache.primaryNodeRoleId)
	{
		MetadataCache.primaryNodeRoleId = LookupStringEnumValueId("noderole", "primary");
	}

	return MetadataCache.primaryNodeRoleId;
}


/* return the Oid of the 'secondary' nodeRole enum value */
Oid
SecondaryNodeRoleId(void)
{
	if (!MetadataCache.secondaryNodeRoleId)
	{
		MetadataCache.secondaryNodeRoleId = LookupStringEnumValueId("noderole",
																	"secondary");
	}

	return MetadataCache.secondaryNodeRoleId;
}


Oid
CitusJobStatusScheduledId(void)
{
	if (!MetadataCache.citusJobStatusScheduledId)
	{
		MetadataCache.citusJobStatusScheduledId =
			LookupStringEnumValueId("citus_job_status", "scheduled");
	}

	return MetadataCache.citusJobStatusScheduledId;
}


Oid
CitusJobStatusRunningId(void)
{
	if (!MetadataCache.citusJobStatusRunningId)
	{
		MetadataCache.citusJobStatusRunningId =
			LookupStringEnumValueId("citus_job_status", "running");
	}

	return MetadataCache.citusJobStatusRunningId;
}


Oid
CitusJobStatusCancellingId(void)
{
	if (!MetadataCache.citusJobStatusCancellingId)
	{
		MetadataCache.citusJobStatusCancellingId =
			LookupStringEnumValueId("citus_job_status", "cancelling");
	}

	return MetadataCache.citusJobStatusCancellingId;
}


Oid
CitusJobStatusFinishedId(void)
{
	if (!MetadataCache.citusJobStatusFinishedId)
	{
		MetadataCache.citusJobStatusFinishedId =
			LookupStringEnumValueId("citus_job_status", "finished");
	}

	return MetadataCache.citusJobStatusFinishedId;
}


Oid
CitusJobStatusCancelledId(void)
{
	if (!MetadataCache.citusJobStatusCancelledId)
	{
		MetadataCache.citusJobStatusCancelledId =
			LookupStringEnumValueId("citus_job_status", "cancelled");
	}

	return MetadataCache.citusJobStatusCancelledId;
}


Oid
CitusJobStatusFailedId(void)
{
	if (!MetadataCache.citusJobStatusFailedId)
	{
		MetadataCache.citusJobStatusFailedId =
			LookupStringEnumValueId("citus_job_status", "failed");
	}

	return MetadataCache.citusJobStatusFailedId;
}


Oid
CitusJobStatusFailingId(void)
{
	if (!MetadataCache.citusJobStatusFailingId)
	{
		MetadataCache.citusJobStatusFailingId =
			LookupStringEnumValueId("citus_job_status", "failing");
	}

	return MetadataCache.citusJobStatusFailingId;
}


Oid
CitusTaskStatusBlockedId(void)
{
	if (!MetadataCache.citusTaskStatusBlockedId)
	{
		MetadataCache.citusTaskStatusBlockedId =
			LookupStringEnumValueId("citus_task_status", "blocked");
	}

	return MetadataCache.citusTaskStatusBlockedId;
}


Oid
CitusTaskStatusCancelledId(void)
{
	if (!MetadataCache.citusTaskStatusCancelledId)
	{
		MetadataCache.citusTaskStatusCancelledId =
			LookupStringEnumValueId("citus_task_status", "cancelled");
	}

	return MetadataCache.citusTaskStatusCancelledId;
}


Oid
CitusTaskStatusCancellingId(void)
{
	if (!MetadataCache.citusTaskStatusCancellingId)
	{
		MetadataCache.citusTaskStatusCancellingId =
			LookupStringEnumValueId("citus_task_status", "cancelling");
	}

	return MetadataCache.citusTaskStatusCancellingId;
}


Oid
CitusTaskStatusRunnableId(void)
{
	if (!MetadataCache.citusTaskStatusRunnableId)
	{
		MetadataCache.citusTaskStatusRunnableId =
			LookupStringEnumValueId("citus_task_status", "runnable");
	}

	return MetadataCache.citusTaskStatusRunnableId;
}


Oid
CitusTaskStatusRunningId(void)
{
	if (!MetadataCache.citusTaskStatusRunningId)
	{
		MetadataCache.citusTaskStatusRunningId =
			LookupStringEnumValueId("citus_task_status", "running");
	}

	return MetadataCache.citusTaskStatusRunningId;
}


Oid
CitusTaskStatusDoneId(void)
{
	if (!MetadataCache.citusTaskStatusDoneId)
	{
		MetadataCache.citusTaskStatusDoneId =
			LookupStringEnumValueId("citus_task_status", "done");
	}

	return MetadataCache.citusTaskStatusDoneId;
}


Oid
CitusTaskStatusErrorId(void)
{
	if (!MetadataCache.citusTaskStatusErrorId)
	{
		MetadataCache.citusTaskStatusErrorId =
			LookupStringEnumValueId("citus_task_status", "error");
	}

	return MetadataCache.citusTaskStatusErrorId;
}


Oid
CitusTaskStatusUnscheduledId(void)
{
	if (!MetadataCache.citusTaskStatusUnscheduledId)
	{
		MetadataCache.citusTaskStatusUnscheduledId =
			LookupStringEnumValueId("citus_task_status", "unscheduled");
	}

	return MetadataCache.citusTaskStatusUnscheduledId;
}


/*
 * citus_dist_partition_cache_invalidate is a trigger function that performs
 * relcache invalidations when the contents of pg_dist_partition are changed
 * on the SQL level.
 *
 * NB: We decided there is little point in checking permissions here, there
 * are much easier ways to waste CPU than causing cache invalidations.
 */
Datum
citus_dist_partition_cache_invalidate(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	TriggerData *triggerData = (TriggerData *) fcinfo->context;
	Oid oldLogicalRelationId = InvalidOid;
	Oid newLogicalRelationId = InvalidOid;

	if (!CALLED_AS_TRIGGER(fcinfo))
	{
		ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
						errmsg("must be called as trigger")));
	}

	if (RelationGetRelid(triggerData->tg_relation) != DistPartitionRelationId())
	{
		ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
						errmsg("triggered on incorrect relation")));
	}

	HeapTuple newTuple = triggerData->tg_newtuple;
	HeapTuple oldTuple = triggerData->tg_trigtuple;

	/* collect logicalrelid for OLD and NEW tuple */
	if (oldTuple != NULL)
	{
		Form_pg_dist_partition distPart = (Form_pg_dist_partition) GETSTRUCT(oldTuple);

		oldLogicalRelationId = distPart->logicalrelid;
	}

	if (newTuple != NULL)
	{
		Form_pg_dist_partition distPart = (Form_pg_dist_partition) GETSTRUCT(newTuple);

		newLogicalRelationId = distPart->logicalrelid;
	}

	/*
	 * Invalidate relcache for the relevant relation(s). In theory
	 * logicalrelid should never change, but it doesn't hurt to be
	 * paranoid.
	 */
	if (oldLogicalRelationId != InvalidOid &&
		oldLogicalRelationId != newLogicalRelationId)
	{
		CitusInvalidateRelcacheByRelid(oldLogicalRelationId);
	}

	if (newLogicalRelationId != InvalidOid)
	{
		CitusInvalidateRelcacheByRelid(newLogicalRelationId);
	}

	PG_RETURN_DATUM(PointerGetDatum(NULL));
}
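
/*
 * Wiring sketch for the function above (illustrative only; the authoritative
 * DDL lives in the extension's SQL scripts, and the trigger name here is an
 * assumption):
 *
 *     CREATE TRIGGER dist_partition_cache_invalidate
 *     AFTER INSERT OR UPDATE OR DELETE ON pg_catalog.pg_dist_partition
 *     FOR EACH ROW EXECUTE FUNCTION
 *     pg_catalog.citus_dist_partition_cache_invalidate();
 */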
|
|
|
|
|
|
/*
|
|
* master_dist_partition_cache_invalidate is a wrapper function for old UDF name.
|
|
*/
|
|
Datum
|
|
master_dist_partition_cache_invalidate(PG_FUNCTION_ARGS)
|
|
{
|
|
return citus_dist_partition_cache_invalidate(fcinfo);
|
|
}


/*
 * citus_dist_shard_cache_invalidate is a trigger function that performs
 * relcache invalidations when the contents of pg_dist_shard are changed
 * on the SQL level.
 *
 * NB: We decided there is little point in checking permissions here, there
 * are much easier ways to waste CPU than causing cache invalidations.
 */
Datum
citus_dist_shard_cache_invalidate(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	TriggerData *triggerData = (TriggerData *) fcinfo->context;
	Oid oldLogicalRelationId = InvalidOid;
	Oid newLogicalRelationId = InvalidOid;

	if (!CALLED_AS_TRIGGER(fcinfo))
	{
		ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
						errmsg("must be called as trigger")));
	}

	if (RelationGetRelid(triggerData->tg_relation) != DistShardRelationId())
	{
		ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
						errmsg("triggered on incorrect relation")));
	}

	HeapTuple newTuple = triggerData->tg_newtuple;
	HeapTuple oldTuple = triggerData->tg_trigtuple;

	/* collect logicalrelid for OLD and NEW tuple */
	if (oldTuple != NULL)
	{
		Form_pg_dist_shard distShard = (Form_pg_dist_shard) GETSTRUCT(oldTuple);

		oldLogicalRelationId = distShard->logicalrelid;
	}

	if (newTuple != NULL)
	{
		Form_pg_dist_shard distShard = (Form_pg_dist_shard) GETSTRUCT(newTuple);

		newLogicalRelationId = distShard->logicalrelid;
	}

	/*
	 * Invalidate relcache for the relevant relation(s). In theory
	 * logicalrelid should never change, but it doesn't hurt to be
	 * paranoid.
	 */
	if (oldLogicalRelationId != InvalidOid &&
		oldLogicalRelationId != newLogicalRelationId)
	{
		CitusInvalidateRelcacheByRelid(oldLogicalRelationId);
	}

	if (newLogicalRelationId != InvalidOid)
	{
		CitusInvalidateRelcacheByRelid(newLogicalRelationId);
	}

	PG_RETURN_DATUM(PointerGetDatum(NULL));
}


/*
 * master_dist_shard_cache_invalidate is a wrapper function for old UDF name.
 */
Datum
master_dist_shard_cache_invalidate(PG_FUNCTION_ARGS)
{
	return citus_dist_shard_cache_invalidate(fcinfo);
}


/*
 * citus_dist_placement_cache_invalidate is a trigger function that performs
 * relcache invalidations when the contents of pg_dist_placement are
 * changed on the SQL level.
 *
 * NB: We decided there is little point in checking permissions here, there
 * are much easier ways to waste CPU than causing cache invalidations.
 */
Datum
citus_dist_placement_cache_invalidate(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	TriggerData *triggerData = (TriggerData *) fcinfo->context;
	Oid oldShardId = InvalidOid;
	Oid newShardId = InvalidOid;

	if (!CALLED_AS_TRIGGER(fcinfo))
	{
		ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
						errmsg("must be called as trigger")));
	}

	/*
	 * Before 7.0-2 this trigger was on pg_dist_shard_placement; ignore the
	 * trigger in that scenario.
	 */
	Oid pgDistShardPlacementId = get_relname_relid("pg_dist_shard_placement",
												   PG_CATALOG_NAMESPACE);
	if (RelationGetRelid(triggerData->tg_relation) == pgDistShardPlacementId)
	{
		PG_RETURN_DATUM(PointerGetDatum(NULL));
	}

	if (RelationGetRelid(triggerData->tg_relation) != DistPlacementRelationId())
	{
		ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
						errmsg("triggered on incorrect relation")));
	}

	HeapTuple newTuple = triggerData->tg_newtuple;
	HeapTuple oldTuple = triggerData->tg_trigtuple;

	/* collect shardid for OLD and NEW tuple */
	if (oldTuple != NULL)
	{
		Form_pg_dist_placement distPlacement =
			(Form_pg_dist_placement) GETSTRUCT(oldTuple);

		oldShardId = distPlacement->shardid;
	}

	if (newTuple != NULL)
	{
		Form_pg_dist_placement distPlacement =
			(Form_pg_dist_placement) GETSTRUCT(newTuple);

		newShardId = distPlacement->shardid;
	}

	/*
	 * Invalidate relcache for the relevant relation(s). In theory shardId
	 * should never change, but it doesn't hurt to be paranoid.
	 */
	if (oldShardId != InvalidOid &&
		oldShardId != newShardId)
	{
		CitusInvalidateRelcacheByShardId(oldShardId);
	}

	if (newShardId != InvalidOid)
	{
		CitusInvalidateRelcacheByShardId(newShardId);
	}

	PG_RETURN_DATUM(PointerGetDatum(NULL));
}


/*
 * master_dist_placement_cache_invalidate is a wrapper function for old UDF name.
 */
Datum
master_dist_placement_cache_invalidate(PG_FUNCTION_ARGS)
{
	return citus_dist_placement_cache_invalidate(fcinfo);
}


/*
 * citus_dist_node_cache_invalidate is a trigger function that performs
 * relcache invalidations when the contents of pg_dist_node are changed
 * on the SQL level.
 *
 * NB: We decided there is little point in checking permissions here, there
 * are much easier ways to waste CPU than causing cache invalidations.
 */
Datum
citus_dist_node_cache_invalidate(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	if (!CALLED_AS_TRIGGER(fcinfo))
	{
		ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
						errmsg("must be called as trigger")));
	}

	CitusInvalidateRelcacheByRelid(DistNodeRelationId());

	PG_RETURN_DATUM(PointerGetDatum(NULL));
}


/*
 * master_dist_node_cache_invalidate is a wrapper function for old UDF name.
 */
Datum
master_dist_node_cache_invalidate(PG_FUNCTION_ARGS)
{
	return citus_dist_node_cache_invalidate(fcinfo);
}


/*
 * citus_conninfo_cache_invalidate is a trigger function that performs
 * relcache invalidations when the contents of pg_dist_authinfo are changed
 * on the SQL level.
 *
 * NB: We decided there is little point in checking permissions here, there
 * are much easier ways to waste CPU than causing cache invalidations.
 */
Datum
citus_conninfo_cache_invalidate(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	if (!CALLED_AS_TRIGGER(fcinfo))
	{
		ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
						errmsg("must be called as trigger")));
	}

	CitusInvalidateRelcacheByRelid(DistAuthinfoRelationId());

	PG_RETURN_DATUM(PointerGetDatum(NULL));
}


/*
 * master_dist_authinfo_cache_invalidate is a wrapper function for old UDF name.
 */
Datum
master_dist_authinfo_cache_invalidate(PG_FUNCTION_ARGS)
{
	return citus_conninfo_cache_invalidate(fcinfo);
}


/*
 * citus_dist_local_group_cache_invalidate is a trigger function that performs
 * relcache invalidations when the contents of pg_dist_local_group are changed
 * on the SQL level.
 *
 * NB: We decided there is little point in checking permissions here, there
 * are much easier ways to waste CPU than causing cache invalidations.
 */
Datum
citus_dist_local_group_cache_invalidate(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	if (!CALLED_AS_TRIGGER(fcinfo))
	{
		ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
						errmsg("must be called as trigger")));
	}

	CitusInvalidateRelcacheByRelid(DistLocalGroupIdRelationId());

	PG_RETURN_DATUM(PointerGetDatum(NULL));
}


/*
 * master_dist_local_group_cache_invalidate is a wrapper function for old UDF name.
 */
Datum
master_dist_local_group_cache_invalidate(PG_FUNCTION_ARGS)
{
	return citus_dist_local_group_cache_invalidate(fcinfo);
}


/*
 * citus_dist_object_cache_invalidate is a trigger function that performs relcache
 * invalidation when the contents of pg_dist_object are changed on the SQL
 * level.
 *
 * NB: We decided there is little point in checking permissions here, there
 * are much easier ways to waste CPU than causing cache invalidations.
 */
Datum
citus_dist_object_cache_invalidate(PG_FUNCTION_ARGS)
{
	CheckCitusVersion(ERROR);

	if (!CALLED_AS_TRIGGER(fcinfo))
	{
		ereport(ERROR, (errcode(ERRCODE_E_R_I_E_TRIGGER_PROTOCOL_VIOLATED),
						errmsg("must be called as trigger")));
	}

	CitusInvalidateRelcacheByRelid(DistObjectRelationId());

	PG_RETURN_DATUM(PointerGetDatum(NULL));
}


/*
 * master_dist_object_cache_invalidate is a wrapper function for old UDF name.
 */
Datum
master_dist_object_cache_invalidate(PG_FUNCTION_ARGS)
{
	return citus_dist_object_cache_invalidate(fcinfo);
}


/*
 * InitializeCaches() registers invalidation handlers for metadata_cache.c's
 * caches.
 */
static void
InitializeCaches(void)
{
	static bool performedInitialization = false;

	if (!performedInitialization)
	{
		MetadataCacheMemoryContext = NULL;

		/*
		 * If the dist table cache or shard cache allocation and
		 * initialization fails due to an exception (caused by OOM or any
		 * other reason), we reset the flag and delete the metadata cache
		 * memory context to reclaim partially allocated memory.
		 *
		 * The command will still fail since we re-throw the exception.
		 */
		PG_TRY();
		{
			/* set first, to avoid recursion dangers */
			performedInitialization = true;

			/* make sure we've initialized CacheMemoryContext */
			if (CacheMemoryContext == NULL)
			{
				CreateCacheMemoryContext();
			}

			MetadataCacheMemoryContext = AllocSetContextCreate(
				CacheMemoryContext,
				"MetadataCacheMemoryContext",
				ALLOCSET_DEFAULT_SIZES);

			InitializeDistCache();
			RegisterForeignKeyGraphCacheCallbacks();
			RegisterWorkerNodeCacheCallbacks();
			RegisterLocalGroupIdCacheCallbacks();
			RegisterAuthinfoCacheCallbacks();
			RegisterCitusTableCacheEntryReleaseCallbacks();
		}
		PG_CATCH();
		{
			performedInitialization = false;

			if (MetadataCacheMemoryContext != NULL)
			{
				MemoryContextDelete(MetadataCacheMemoryContext);
			}

			MetadataCacheMemoryContext = NULL;
			DistTableCacheHash = NULL;
			DistTableCacheExpired = NIL;
			ShardIdCacheHash = NULL;

			PG_RE_THROW();
		}
		PG_END_TRY();
	}
}


/* initialize the infrastructure for the metadata cache */
static void
InitializeDistCache(void)
{
	/* build initial scan keys, copied for every relation scan */
	memset(&DistPartitionScanKey, 0, sizeof(DistPartitionScanKey));

	fmgr_info_cxt(F_OIDEQ,
				  &DistPartitionScanKey[0].sk_func,
				  MetadataCacheMemoryContext);
	DistPartitionScanKey[0].sk_strategy = BTEqualStrategyNumber;
	DistPartitionScanKey[0].sk_subtype = InvalidOid;
	DistPartitionScanKey[0].sk_collation = InvalidOid;
	DistPartitionScanKey[0].sk_attno = Anum_pg_dist_partition_logicalrelid;

	memset(&DistShardScanKey, 0, sizeof(DistShardScanKey));

	fmgr_info_cxt(F_OIDEQ,
				  &DistShardScanKey[0].sk_func,
				  MetadataCacheMemoryContext);
	DistShardScanKey[0].sk_strategy = BTEqualStrategyNumber;
	DistShardScanKey[0].sk_subtype = InvalidOid;
	DistShardScanKey[0].sk_collation = InvalidOid;
	DistShardScanKey[0].sk_attno = Anum_pg_dist_shard_logicalrelid;

	CreateDistTableCache();
	CreateShardIdCache();

	InitializeDistObjectCache();
}


static void
InitializeDistObjectCache(void)
{
	/* build initial scan keys, copied for every relation scan */
	memset(&DistObjectScanKey, 0, sizeof(DistObjectScanKey));

	fmgr_info_cxt(F_OIDEQ,
				  &DistObjectScanKey[0].sk_func,
				  MetadataCacheMemoryContext);
	DistObjectScanKey[0].sk_strategy = BTEqualStrategyNumber;
	DistObjectScanKey[0].sk_subtype = InvalidOid;
	DistObjectScanKey[0].sk_collation = InvalidOid;
	DistObjectScanKey[0].sk_attno = Anum_pg_dist_object_classid;

	fmgr_info_cxt(F_OIDEQ,
				  &DistObjectScanKey[1].sk_func,
				  MetadataCacheMemoryContext);
	DistObjectScanKey[1].sk_strategy = BTEqualStrategyNumber;
	DistObjectScanKey[1].sk_subtype = InvalidOid;
	DistObjectScanKey[1].sk_collation = InvalidOid;
	DistObjectScanKey[1].sk_attno = Anum_pg_dist_object_objid;

	fmgr_info_cxt(F_INT4EQ,
				  &DistObjectScanKey[2].sk_func,
				  MetadataCacheMemoryContext);
	DistObjectScanKey[2].sk_strategy = BTEqualStrategyNumber;
	DistObjectScanKey[2].sk_subtype = InvalidOid;
	DistObjectScanKey[2].sk_collation = InvalidOid;
	DistObjectScanKey[2].sk_attno = Anum_pg_dist_object_objsubid;

	CreateDistObjectCache();
}


/*
 * GetWorkerNodeHash returns the worker node data as a hash with the nodename and
 * nodeport as a key.
 *
 * The hash is returned from the cache; if the cache is not (yet) valid, it is
 * first rebuilt.
 */
HTAB *
GetWorkerNodeHash(void)
{
	PrepareWorkerNodeCache();

	return WorkerNodeHash;
}
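

/*
 * Illustrative sketch (not part of the build): callers that want to visit
 * every cached worker node can iterate the returned hash with the standard
 * dynahash sequence API, e.g.:
 *
 *	HASH_SEQ_STATUS status;
 *	WorkerNode *workerNode = NULL;
 *
 *	hash_seq_init(&status, GetWorkerNodeHash());
 *	while ((workerNode = (WorkerNode *) hash_seq_search(&status)) != NULL)
 *	{
 *		... inspect workerNode->workerName / workerNode->workerPort ...
 *	}
 */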


/*
 * PrepareWorkerNodeCache makes sure the worker node data from pg_dist_node
 * is cached, rebuilding the cache if it is not already valid.
 */
static void
PrepareWorkerNodeCache(void)
{
	InitializeCaches(); /* ensure relevant callbacks are registered */

	/*
	 * Simulate a SELECT from pg_dist_node to ensure pg_dist_node doesn't
	 * change while our caller is using WorkerNodeHash.
	 */
	LockRelationOid(DistNodeRelationId(), AccessShareLock);

	/*
	 * We might have some concurrent metadata changes. In order to get the
	 * changes, we first need to accept the cache invalidation messages.
	 */
	AcceptInvalidationMessages();

	if (!workerNodeHashValid)
	{
		InitializeWorkerNodeCache();

		workerNodeHashValid = true;
	}
}


/*
 * InitializeWorkerNodeCache initializes the infrastructure for the worker
 * node cache. The function reads the worker nodes from the metadata table,
 * adds them to the hash, and makes sure the relevant invalidation callbacks
 * are registered (via InitializeCaches).
 */
static void
InitializeWorkerNodeCache(void)
{
	HASHCTL info;
	long maxTableSize = (long) MaxWorkerNodesTracked;
	bool includeNodesFromOtherClusters = false;
	int workerNodeIndex = 0;

	InitializeCaches();

	/*
	 * Create the hash that holds the worker nodes. The key is the combination
	 * of nodename and nodeport, instead of the unique nodeid, because worker
	 * nodes are searched by the nodename and nodeport in every physical plan
	 * creation.
	 */
	memset(&info, 0, sizeof(info));
	info.keysize = sizeof(uint32) + WORKER_LENGTH + sizeof(uint32);
	info.entrysize = sizeof(WorkerNode);
	info.hcxt = MetadataCacheMemoryContext;
	info.hash = WorkerNodeHashCode;
	info.match = WorkerNodeCompare;
	int hashFlags = HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT | HASH_COMPARE;

	HTAB *newWorkerNodeHash = hash_create("Worker Node Hash", maxTableSize, &info,
										  hashFlags);

	/* read the list from pg_dist_node */
	List *workerNodeList = ReadDistNode(includeNodesFromOtherClusters);

	int newWorkerNodeCount = list_length(workerNodeList);
	WorkerNode **newWorkerNodeArray = MemoryContextAlloc(MetadataCacheMemoryContext,
														 sizeof(WorkerNode *) *
														 newWorkerNodeCount);

	/* iterate over the worker node list */
	WorkerNode *currentNode = NULL;
	foreach_ptr(currentNode, workerNodeList)
	{
		bool handleFound = false;

		/* search for the worker node in the hash, and then insert the values */
		void *hashKey = (void *) currentNode;
		WorkerNode *workerNode = (WorkerNode *) hash_search(newWorkerNodeHash, hashKey,
															HASH_ENTER, &handleFound);

		/* fill the newly allocated workerNode in the cache */
		strlcpy(workerNode->workerName, currentNode->workerName, WORKER_LENGTH);
		workerNode->workerPort = currentNode->workerPort;
		workerNode->groupId = currentNode->groupId;
		workerNode->nodeId = currentNode->nodeId;
		strlcpy(workerNode->workerRack, currentNode->workerRack, WORKER_LENGTH);
		workerNode->hasMetadata = currentNode->hasMetadata;
		workerNode->metadataSynced = currentNode->metadataSynced;
		workerNode->isActive = currentNode->isActive;
		workerNode->nodeRole = currentNode->nodeRole;
		workerNode->shouldHaveShards = currentNode->shouldHaveShards;
		strlcpy(workerNode->nodeCluster, currentNode->nodeCluster, NAMEDATALEN);

		newWorkerNodeArray[workerNodeIndex++] = workerNode;

		if (handleFound)
		{
			ereport(WARNING, (errmsg("multiple lines for worker node: \"%s:%u\"",
									 workerNode->workerName,
									 workerNode->workerPort)));
		}

		/* we do not need the currentNode anymore */
		pfree(currentNode);
	}

	/* now, safe to destroy the old hash */
	hash_destroy(WorkerNodeHash);

	if (WorkerNodeArray != NULL)
	{
		pfree(WorkerNodeArray);
	}

	WorkerNodeCount = newWorkerNodeCount;
	WorkerNodeArray = newWorkerNodeArray;
	WorkerNodeHash = newWorkerNodeHash;
}
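

/*
 * Illustrative sketch (not part of the build): because the hash above is
 * keyed by (nodename, nodeport), a lookup fills those fields of a
 * stack-allocated WorkerNode and probes the hash, roughly:
 *
 *	WorkerNode searchedNode;
 *	bool handleFound = false;
 *
 *	memset(&searchedNode, 0, sizeof(WorkerNode));
 *	strlcpy(searchedNode.workerName, "10.0.0.1", WORKER_LENGTH);
 *	searchedNode.workerPort = 5432;
 *
 *	WorkerNode *cachedNode = (WorkerNode *) hash_search(GetWorkerNodeHash(),
 *														&searchedNode,
 *														HASH_FIND,
 *														&handleFound);
 */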


/*
 * RegisterForeignKeyGraphCacheCallbacks registers callbacks required for
 * the foreign key graph cache.
 */
static void
RegisterForeignKeyGraphCacheCallbacks(void)
{
	/* Watch for invalidation events. */
	CacheRegisterRelcacheCallback(InvalidateForeignRelationGraphCacheCallback,
								  (Datum) 0);
}


/*
 * RegisterWorkerNodeCacheCallbacks registers the callbacks required for the
 * worker node cache. It's separate from InitializeWorkerNodeCache so the
 * callback can be registered early, before the metadata tables exist.
 */
static void
RegisterWorkerNodeCacheCallbacks(void)
{
	/* Watch for invalidation events. */
	CacheRegisterRelcacheCallback(InvalidateNodeRelationCacheCallback,
								  (Datum) 0);
}


/*
 * RegisterCitusTableCacheEntryReleaseCallbacks registers callbacks to release
 * cache entries. Data should be locked by callers to avoid staleness.
 */
static void
RegisterCitusTableCacheEntryReleaseCallbacks(void)
{
	RegisterResourceReleaseCallback(CitusTableCacheEntryReleaseCallback, NULL);
}


/*
 * GetLocalGroupId returns the group identifier of the local node. The function
 * assumes that pg_dist_local_group has exactly one row and has at least one
 * column. Otherwise, the function errors out.
 */
int32
GetLocalGroupId(void)
{
	ScanKeyData scanKey[1];
	int scanKeyCount = 0;
	int32 groupId = 0;

	InitializeCaches();

	/*
	 * Already set the group id, no need to read the heap again.
	 */
	if (LocalGroupId != -1)
	{
		return LocalGroupId;
	}

	Oid localGroupTableOid = DistLocalGroupIdRelationId();
	if (localGroupTableOid == InvalidOid)
	{
		return 0;
	}

	Relation pgDistLocalGroupId = table_open(localGroupTableOid, AccessShareLock);

	SysScanDesc scanDescriptor = systable_beginscan(pgDistLocalGroupId,
													InvalidOid, false,
													NULL, scanKeyCount, scanKey);

	TupleDesc tupleDescriptor = RelationGetDescr(pgDistLocalGroupId);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);

	if (HeapTupleIsValid(heapTuple))
	{
		bool isNull = false;
		Datum groupIdDatum = heap_getattr(heapTuple,
										  Anum_pg_dist_local_groupid,
										  tupleDescriptor, &isNull);

		groupId = DatumGetInt32(groupIdDatum);

		/* set the local cache variable */
		LocalGroupId = groupId;
	}
	else
	{
		/*
		 * Upgrade is happening. When upgrading postgres, pg_dist_local_group is
		 * temporarily empty before citus_finish_pg_upgrade() finishes execution.
		 */
		groupId = GROUP_ID_UPGRADING;
	}

	systable_endscan(scanDescriptor);
	table_close(pgDistLocalGroupId, AccessShareLock);

	return groupId;
}
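

/*
 * Illustrative sketch (not part of the build): the group id is commonly used
 * to tell the coordinator apart from worker nodes. Assuming the usual
 * convention that the coordinator's group id is COORDINATOR_GROUP_ID (0),
 * a caller might write:
 *
 *	if (GetLocalGroupId() == COORDINATOR_GROUP_ID)
 *	{
 *		... coordinator-only behavior ...
 *	}
 */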


/*
 * GetLocalNodeId returns the node identifier of the local node.
 */
int32
GetLocalNodeId(void)
{
	InitializeCaches();

	/*
	 * Already set the node id, no need to read the heap again.
	 */
	if (LocalNodeId != -1)
	{
		return LocalNodeId;
	}

	uint32 nodeId = -1;

	int32 localGroupId = GetLocalGroupId();

	bool includeNodesFromOtherClusters = false;
	List *workerNodeList = ReadDistNode(includeNodesFromOtherClusters);

	WorkerNode *workerNode = NULL;
	foreach_ptr(workerNode, workerNodeList)
	{
		if (workerNode->groupId == localGroupId &&
			workerNode->isActive)
		{
			nodeId = workerNode->nodeId;
			break;
		}
	}

	/*
	 * nodeId is -1 if we cannot find an active node whose group id is
	 * localGroupId in pg_dist_node.
	 */
	if (nodeId == -1)
	{
		elog(DEBUG4, "there is no active node with group id '%d' on pg_dist_node",
			 localGroupId);

		/*
		 * This is expected if the coordinator is not added to the metadata.
		 * We return GLOBAL_PID_NODE_ID_FOR_NODES_NOT_IN_METADATA in this case
		 * (and in any other case where the node cannot be found) so views can
		 * function almost normally.
		 */
		nodeId = GLOBAL_PID_NODE_ID_FOR_NODES_NOT_IN_METADATA;
	}

	LocalNodeId = nodeId;

	return nodeId;
}


/*
 * RegisterLocalGroupIdCacheCallbacks registers the callbacks required to
 * maintain LocalGroupId at a consistent value. It's separate from
 * GetLocalGroupId so the callback can be registered early, before metadata
 * tables exist.
 */
static void
RegisterLocalGroupIdCacheCallbacks(void)
{
	/* Watch for invalidation events. */
	CacheRegisterRelcacheCallback(InvalidateLocalGroupIdRelationCacheCallback,
								  (Datum) 0);
}


/*
 * RegisterAuthinfoCacheCallbacks registers the callbacks required to
 * maintain cached connection parameters at fresh values.
 */
static void
RegisterAuthinfoCacheCallbacks(void)
{
	/* Watch for invalidation events. */
	CacheRegisterRelcacheCallback(InvalidateConnParamsCacheCallback, (Datum) 0);
}


/*
 * ResetCitusTableCacheEntry frees any out-of-band memory used by a cache entry,
 * but does not free the entry itself.
 */
static void
ResetCitusTableCacheEntry(CitusTableCacheEntry *cacheEntry)
{
	if (cacheEntry->partitionKeyString != NULL)
	{
		pfree(cacheEntry->partitionKeyString);
		cacheEntry->partitionKeyString = NULL;
	}

	if (cacheEntry->shardIntervalCompareFunction != NULL)
	{
		pfree(cacheEntry->shardIntervalCompareFunction);
		cacheEntry->shardIntervalCompareFunction = NULL;
	}

	if (cacheEntry->hashFunction)
	{
		pfree(cacheEntry->hashFunction);
		cacheEntry->hashFunction = NULL;
	}

	if (cacheEntry->partitionColumn != NULL)
	{
		pfree(cacheEntry->partitionColumn);
		cacheEntry->partitionColumn = NULL;
	}

	if (cacheEntry->shardIntervalArrayLength == 0)
	{
		return;
	}

	/* clean up ShardIdCacheHash */
	RemoveStaleShardIdCacheEntries(cacheEntry);

	for (int shardIndex = 0; shardIndex < cacheEntry->shardIntervalArrayLength;
		 shardIndex++)
	{
		ShardInterval *shardInterval = cacheEntry->sortedShardIntervalArray[shardIndex];
		GroupShardPlacement *placementArray =
			cacheEntry->arrayOfPlacementArrays[shardIndex];
		bool valueByVal = shardInterval->valueByVal;

		/* delete the shard's placements */
		if (placementArray != NULL)
		{
			pfree(placementArray);
		}

		/* delete data pointed to by ShardInterval */
		if (!valueByVal)
		{
			if (shardInterval->minValueExists)
			{
				pfree(DatumGetPointer(shardInterval->minValue));
			}

			if (shardInterval->maxValueExists)
			{
				pfree(DatumGetPointer(shardInterval->maxValue));
			}
		}

		/* and finally the ShardInterval itself */
		pfree(shardInterval);
	}

	if (cacheEntry->sortedShardIntervalArray)
	{
		pfree(cacheEntry->sortedShardIntervalArray);
		cacheEntry->sortedShardIntervalArray = NULL;
	}
	if (cacheEntry->arrayOfPlacementArrayLengths)
	{
		pfree(cacheEntry->arrayOfPlacementArrayLengths);
		cacheEntry->arrayOfPlacementArrayLengths = NULL;
	}
	if (cacheEntry->arrayOfPlacementArrays)
	{
		pfree(cacheEntry->arrayOfPlacementArrays);
		cacheEntry->arrayOfPlacementArrays = NULL;
	}
	if (cacheEntry->referencedRelationsViaForeignKey)
	{
		list_free(cacheEntry->referencedRelationsViaForeignKey);
		cacheEntry->referencedRelationsViaForeignKey = NIL;
	}
	if (cacheEntry->referencingRelationsViaForeignKey)
	{
		list_free(cacheEntry->referencingRelationsViaForeignKey);
		cacheEntry->referencingRelationsViaForeignKey = NIL;
	}

	cacheEntry->shardIntervalArrayLength = 0;
	cacheEntry->hasUninitializedShardInterval = false;
	cacheEntry->hasUniformHashDistribution = false;
	cacheEntry->hasOverlappingShardInterval = false;
	cacheEntry->autoConverted = false;

	pfree(cacheEntry);
}


/*
 * RemoveStaleShardIdCacheEntries removes all shard ID cache entries belonging to the
 * given table entry. If the shard ID belongs to a different (newer) table entry,
 * we leave it in place.
 */
static void
RemoveStaleShardIdCacheEntries(CitusTableCacheEntry *invalidatedTableEntry)
{
	int shardIndex = 0;
	int shardCount = invalidatedTableEntry->shardIntervalArrayLength;

	for (shardIndex = 0; shardIndex < shardCount; shardIndex++)
	{
		ShardInterval *shardInterval =
			invalidatedTableEntry->sortedShardIntervalArray[shardIndex];
		int64 shardId = shardInterval->shardId;
		bool foundInCache = false;

		ShardIdCacheEntry *shardIdCacheEntry =
			hash_search(ShardIdCacheHash, &shardId, HASH_FIND, &foundInCache);

		if (foundInCache && shardIdCacheEntry->tableEntry == invalidatedTableEntry)
		{
			hash_search(ShardIdCacheHash, &shardId, HASH_REMOVE, &foundInCache);
		}
	}
}


/*
 * InvalidateForeignRelationGraphCacheCallback invalidates the foreign key relation
 * graph and entire distributed cache entries.
 */
static void
InvalidateForeignRelationGraphCacheCallback(Datum argument, Oid relationId)
{
	if (relationId == MetadataCache.distColocationRelationId)
	{
		SetForeignConstraintRelationshipGraphInvalid();
		InvalidateDistTableCache();
	}
}


/*
 * InvalidateForeignKeyGraph is used to invalidate the cached foreign key
 * graph (see ForeignKeyRelationGraph @ utils/foreign_key_relationship.c).
 *
 * To invalidate the foreign key graph, we hack around relcache invalidation
 * callbacks. Given that there is no metadata table associated with the foreign
 * key graph cache, we use pg_dist_colocation, which is never invalidated for
 * other purposes.
 *
 * We acknowledge that it is not a very intuitive way of implementing this cache
 * invalidation, but it seems acceptable for now. If this becomes problematic, we
 * could try using a magic oid where we're sure that no relation would ever use
 * that oid.
 */
void
InvalidateForeignKeyGraph(void)
{
	if (!CitusHasBeenLoaded())
	{
		/*
		 * We should not try to invalidate the foreign key graph
		 * if citus is not loaded.
		 */
		return;
	}

	CitusInvalidateRelcacheByRelid(DistColocationRelationId());

	/* bump command counter to force invalidation to take effect */
	CommandCounterIncrement();
}


/*
 * InvalidateDistRelationCacheCallback flushes cache entries when a relation
 * is updated (or flushes the entire cache).
 */
void
InvalidateDistRelationCacheCallback(Datum argument, Oid relationId)
{
	/* invalidate either entire cache or a specific entry */
	if (relationId == InvalidOid)
	{
		InvalidateDistTableCache();
		InvalidateDistObjectCache();
		InvalidateMetadataSystemCache();
	}
	else
	{
		void *hashKey = (void *) &relationId;
		bool foundInCache = false;

		if (DistTableCacheHash == NULL)
		{
			return;
		}

		CitusTableCacheEntrySlot *cacheSlot =
			hash_search(DistTableCacheHash, hashKey, HASH_FIND, &foundInCache);
		if (foundInCache)
		{
			InvalidateCitusTableCacheEntrySlot(cacheSlot);
		}

		/*
		 * If the pg_dist_partition relcache is invalidated for some reason,
		 * invalidate the MetadataCache. It is likely overkill to invalidate
		 * the entire cache here, but until a better fix we keep it this way
		 * for the postgres regression tests that include the
		 * REINDEX SCHEMA CONCURRENTLY pg_catalog command.
		 */
		if (relationId == MetadataCache.distPartitionRelationId)
		{
			InvalidateMetadataSystemCache();
		}

		if (relationId == MetadataCache.distObjectRelationId)
		{
			InvalidateDistObjectCache();
		}
	}
}


/*
 * InvalidateCitusTableCacheEntrySlot marks a CitusTableCacheEntrySlot as invalid,
 * meaning it needs to be rebuilt and the citusTableMetadata (if any) should be
 * released.
 */
static void
InvalidateCitusTableCacheEntrySlot(CitusTableCacheEntrySlot *cacheSlot)
{
	/* recheck whether this is a distributed table */
	cacheSlot->isValid = false;

	if (cacheSlot->citusTableMetadata != NULL)
	{
		/* reload the metadata */
		cacheSlot->citusTableMetadata->isValid = false;

		/* clean up ShardIdCacheHash */
		RemoveStaleShardIdCacheEntries(cacheSlot->citusTableMetadata);
	}
}


/*
 * InvalidateDistTableCache marks all DistTableCacheHash entries invalid.
 */
static void
InvalidateDistTableCache(void)
{
	CitusTableCacheEntrySlot *cacheSlot = NULL;
	HASH_SEQ_STATUS status;

	if (DistTableCacheHash == NULL)
	{
		return;
	}

	hash_seq_init(&status, DistTableCacheHash);

	while ((cacheSlot = (CitusTableCacheEntrySlot *) hash_seq_search(&status)) != NULL)
	{
		InvalidateCitusTableCacheEntrySlot(cacheSlot);
	}
}


/*
 * InvalidateDistObjectCache marks all DistObjectCacheHash entries invalid.
 */
static void
InvalidateDistObjectCache(void)
{
	DistObjectCacheEntry *cacheEntry = NULL;
	HASH_SEQ_STATUS status;

	if (DistObjectCacheHash == NULL)
	{
		return;
	}

	hash_seq_init(&status, DistObjectCacheHash);

	while ((cacheEntry = (DistObjectCacheEntry *) hash_seq_search(&status)) != NULL)
	{
		cacheEntry->isValid = false;
	}
}


/*
 * FlushDistTableCache flushes the entire distributed relation cache, frees
 * all entries, and recreates the cache.
 */
void
FlushDistTableCache(void)
{
	CitusTableCacheEntrySlot *cacheSlot = NULL;
	HASH_SEQ_STATUS status;

	hash_seq_init(&status, DistTableCacheHash);

	while ((cacheSlot = (CitusTableCacheEntrySlot *) hash_seq_search(&status)) != NULL)
	{
		ResetCitusTableCacheEntry(cacheSlot->citusTableMetadata);
	}

	hash_destroy(DistTableCacheHash);
	hash_destroy(ShardIdCacheHash);
	CreateDistTableCache();
	CreateShardIdCache();
}


/* CreateDistTableCache initializes the per-table hash table */
static void
CreateDistTableCache(void)
{
	HASHCTL info;
	MemSet(&info, 0, sizeof(info));
	info.keysize = sizeof(Oid);
	info.entrysize = sizeof(CitusTableCacheEntrySlot);
	info.hash = tag_hash;
	info.hcxt = MetadataCacheMemoryContext;
	DistTableCacheHash =
		hash_create("Distributed Relation Cache", 32, &info,
					HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
}


/* CreateShardIdCache initializes the shard ID mapping */
static void
CreateShardIdCache(void)
{
	HASHCTL info;
	MemSet(&info, 0, sizeof(info));
	info.keysize = sizeof(int64);
	info.entrysize = sizeof(ShardIdCacheEntry);
	info.hash = tag_hash;
	info.hcxt = MetadataCacheMemoryContext;
	ShardIdCacheHash =
		hash_create("Shard Id Cache", 128, &info,
					HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
}


/* CreateDistObjectCache initializes the per-object hash table */
static void
CreateDistObjectCache(void)
{
	HASHCTL info;
	MemSet(&info, 0, sizeof(info));
	info.keysize = sizeof(DistObjectCacheEntryKey);
	info.entrysize = sizeof(DistObjectCacheEntry);
	info.hash = tag_hash;
	info.hcxt = MetadataCacheMemoryContext;
	DistObjectCacheHash =
		hash_create("Distributed Object Cache", 32, &info,
					HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
}


/*
 * InvalidateMetadataSystemCache resets all the cached OIDs and the extensionCreatedState
 * flag and invalidates the worker node, ConnParams, and local group ID caches.
 */
void
InvalidateMetadataSystemCache(void)
{
	InvalidateConnParamsHashEntries();

	memset(&MetadataCache, 0, sizeof(MetadataCache));
	workerNodeHashValid = false;
	LocalGroupId = -1;
	LocalNodeId = -1;
}


/*
 * AllCitusTableIds returns all citus table ids.
 */
List *
AllCitusTableIds(void)
{
	return CitusTableTypeIdList(ANY_CITUS_TABLE_TYPE);
}
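

/*
 * Illustrative sketch (not part of the build): the returned list contains
 * relation OIDs, so callers typically walk it with the list iteration
 * helpers used elsewhere in this file (assuming the foreach_oid variant
 * from listutils.h), e.g.:
 *
 *	Oid citusTableId = InvalidOid;
 *	List *citusTableIdList = AllCitusTableIds();
 *
 *	foreach_oid(citusTableId, citusTableIdList)
 *	{
 *		... per-table work ...
 *	}
 */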


/*
 * CitusTableTypeIdList scans pg_dist_partition and returns a list of OIDs
 * for the tables matching the given citusTableType.
 * To create the list, it performs a sequential scan. Since this function is
 * not expected to be called frequently, it is OK not to use an index scan.
 * If this function becomes a performance bottleneck, it is possible to
 * modify it to perform an index scan.
 */
List *
CitusTableTypeIdList(CitusTableType citusTableType)
{
	ScanKeyData scanKey[1];
	int scanKeyCount = 0;
	List *relationIdList = NIL;

	Relation pgDistPartition = table_open(DistPartitionRelationId(), AccessShareLock);

	SysScanDesc scanDescriptor = systable_beginscan(pgDistPartition,
													InvalidOid, false,
													NULL, scanKeyCount, scanKey);

	TupleDesc tupleDescriptor = RelationGetDescr(pgDistPartition);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	while (HeapTupleIsValid(heapTuple))
	{
		bool isNullArray[Natts_pg_dist_partition];
		Datum datumArray[Natts_pg_dist_partition];
		heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray);

		Datum partMethodDatum = datumArray[Anum_pg_dist_partition_partmethod - 1];
		Datum replicationModelDatum = datumArray[Anum_pg_dist_partition_repmodel - 1];
		Datum colocationIdDatum = datumArray[Anum_pg_dist_partition_colocationid - 1];

		char partitionMethod = DatumGetChar(partMethodDatum);
		char replicationModel = DatumGetChar(replicationModelDatum);
		uint32 colocationId = DatumGetUInt32(colocationIdDatum);

		if (IsCitusTableTypeInternal(partitionMethod, replicationModel, colocationId,
									 citusTableType))
		{
			Datum relationIdDatum = datumArray[Anum_pg_dist_partition_logicalrelid - 1];

			Oid relationId = DatumGetObjectId(relationIdDatum);

			relationIdList = lappend_oid(relationIdList, relationId);
		}

		heapTuple = systable_getnext(scanDescriptor);
	}

	systable_endscan(scanDescriptor);
	table_close(pgDistPartition, AccessShareLock);

	return relationIdList;
}


/*
 * InvalidateNodeRelationCacheCallback marks the worker node hash invalid
 * when any change happens on the pg_dist_node table, so that subsequent
 * accesses rebuild the hash from pg_dist_node from scratch.
 */
static void
InvalidateNodeRelationCacheCallback(Datum argument, Oid relationId)
{
	if (relationId == InvalidOid || relationId == MetadataCache.distNodeRelationId)
	{
		workerNodeHashValid = false;
		LocalNodeId = -1;
	}
}


/*
 * InvalidateLocalGroupIdRelationCacheCallback sets the LocalGroupId to
 * the default value.
 */
static void
InvalidateLocalGroupIdRelationCacheCallback(Datum argument, Oid relationId)
{
	/* when invalidation happens simply set the LocalGroupId to the default value */
	if (relationId == InvalidOid || relationId == MetadataCache.distLocalGroupRelationId)
	{
		LocalGroupId = -1;
	}
}


/*
 * InvalidateConnParamsCacheCallback sets the isValid flag to false for all entries
 * in ConnParamsHash, a cache used during connection establishment.
 */
static void
InvalidateConnParamsCacheCallback(Datum argument, Oid relationId)
{
	if (relationId == MetadataCache.distAuthinfoRelationId ||
		relationId == MetadataCache.distPoolinfoRelationId ||
		relationId == InvalidOid)
	{
		ConnParamsHashEntry *entry = NULL;
		HASH_SEQ_STATUS status;

		hash_seq_init(&status, ConnParamsHash);

		while ((entry = (ConnParamsHashEntry *) hash_seq_search(&status)) != NULL)
		{
			entry->isValid = false;
		}
	}
}


/*
 * CitusTableCacheFlushInvalidatedEntries frees invalidated cache entries.
 * Invalidated entries aren't freed immediately as callers expect their lifetime
 * to extend beyond that scope.
 */
void
CitusTableCacheFlushInvalidatedEntries()
{
	if (DistTableCacheHash != NULL && DistTableCacheExpired != NIL)
	{
		CitusTableCacheEntry *cacheEntry = NULL;
		foreach_ptr(cacheEntry, DistTableCacheExpired)
		{
			ResetCitusTableCacheEntry(cacheEntry);
		}
		list_free(DistTableCacheExpired);
		DistTableCacheExpired = NIL;
	}
}


/*
 * CitusTableCacheEntryReleaseCallback frees invalidated cache entries.
 */
static void
CitusTableCacheEntryReleaseCallback(ResourceReleasePhase phase, bool isCommit,
									bool isTopLevel, void *arg)
{
	if (isTopLevel && phase == RESOURCE_RELEASE_LOCKS)
	{
		CitusTableCacheFlushInvalidatedEntries();
	}
}


/*
 * LookupDistPartitionTuple searches pg_dist_partition for relationId's entry
 * and returns that or, if no matching entry was found, NULL.
 */
static HeapTuple
LookupDistPartitionTuple(Relation pgDistPartition, Oid relationId)
{
	HeapTuple distPartitionTuple = NULL;
	ScanKeyData scanKey[1];

	/* copy scankey to local copy, it will be modified during the scan */
	scanKey[0] = DistPartitionScanKey[0];

	/* set scan arguments */
	scanKey[0].sk_argument = ObjectIdGetDatum(relationId);

	SysScanDesc scanDescriptor = systable_beginscan(pgDistPartition,
													DistPartitionLogicalRelidIndexId(),
													true, NULL, 1, scanKey);

	HeapTuple currentPartitionTuple = systable_getnext(scanDescriptor);
	if (HeapTupleIsValid(currentPartitionTuple))
	{
		distPartitionTuple = heap_copytuple(currentPartitionTuple);
	}

	systable_endscan(scanDescriptor);

	return distPartitionTuple;
}


/*
 * LookupDistShardTuples returns a list of all dist_shard tuples for the
 * specified relation.
 */
List *
LookupDistShardTuples(Oid relationId)
{
	List *distShardTupleList = NIL;
	ScanKeyData scanKey[1];

	Relation pgDistShard = table_open(DistShardRelationId(), AccessShareLock);

	/* copy scankey to local copy, it will be modified during the scan */
	scanKey[0] = DistShardScanKey[0];

	/* set scan arguments */
	scanKey[0].sk_argument = ObjectIdGetDatum(relationId);

	SysScanDesc scanDescriptor = systable_beginscan(pgDistShard,
													DistShardLogicalRelidIndexId(), true,
													NULL, 1, scanKey);

	HeapTuple currentShardTuple = systable_getnext(scanDescriptor);
	while (HeapTupleIsValid(currentShardTuple))
	{
		HeapTuple shardTupleCopy = heap_copytuple(currentShardTuple);
		distShardTupleList = lappend(distShardTupleList, shardTupleCopy);

		currentShardTuple = systable_getnext(scanDescriptor);
	}

	systable_endscan(scanDescriptor);
	table_close(pgDistShard, AccessShareLock);

	return distShardTupleList;
}


/*
 * LookupShardRelationFromCatalog returns the logical relation oid a shard belongs to.
 *
 * Errors out if the shardId does not exist and missingOk is false.
 * Returns InvalidOid if the shardId does not exist and missingOk is true.
 */
Oid
LookupShardRelationFromCatalog(int64 shardId, bool missingOk)
{
	ScanKeyData scanKey[1];
	int scanKeyCount = 1;
	Form_pg_dist_shard shardForm = NULL;
	Relation pgDistShard = table_open(DistShardRelationId(), AccessShareLock);
	Oid relationId = InvalidOid;

	ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid,
				BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));

	SysScanDesc scanDescriptor = systable_beginscan(pgDistShard,
													DistShardShardidIndexId(), true,
													NULL, scanKeyCount, scanKey);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	if (!HeapTupleIsValid(heapTuple) && !missingOk)
	{
		ereport(ERROR, (errmsg("could not find valid entry for shard "
							   UINT64_FORMAT, shardId)));
	}

	if (!HeapTupleIsValid(heapTuple))
	{
		relationId = InvalidOid;
	}
	else
	{
		shardForm = (Form_pg_dist_shard) GETSTRUCT(heapTuple);
		relationId = shardForm->logicalrelid;
	}

	systable_endscan(scanDescriptor);
	table_close(pgDistShard, NoLock);

	return relationId;
}
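

/*
 * Illustrative sketch (not part of the build): with missingOk the caller
 * chooses between erroring out and probing, e.g.:
 *
 *	bool missingOk = true;
 *	Oid relationId = LookupShardRelationFromCatalog(shardId, missingOk);
 *
 *	if (!OidIsValid(relationId))
 *	{
 *		... the shard no longer exists, skip it ...
 *	}
 */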


/*
 * GetPartitionTypeInputInfo populates output parameters with the interval type
 * identifier and modifier for the specified partition key/method combination.
 */
static void
GetPartitionTypeInputInfo(char *partitionKeyString, char partitionMethod,
						  Oid *columnTypeId, int32 *columnTypeMod,
						  Oid *intervalTypeId, int32 *intervalTypeMod)
{
	*columnTypeId = InvalidOid;
	*columnTypeMod = -1;
	*intervalTypeId = InvalidOid;
	*intervalTypeMod = -1;

	switch (partitionMethod)
	{
		case DISTRIBUTE_BY_APPEND:
		case DISTRIBUTE_BY_RANGE:
		case DISTRIBUTE_BY_HASH:
		{
			Node *partitionNode = stringToNode(partitionKeyString);
			Var *partitionColumn = (Var *) partitionNode;
			Assert(IsA(partitionNode, Var));

			GetIntervalTypeInfo(partitionMethod, partitionColumn,
								intervalTypeId, intervalTypeMod);

			*columnTypeId = partitionColumn->vartype;
			*columnTypeMod = partitionColumn->vartypmod;
			break;
		}

		case DISTRIBUTE_BY_NONE:
		{
			break;
		}

		default:
		{
			ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
							errmsg("unsupported table partition type: %c",
								   partitionMethod)));
		}
	}
}


/*
 * GetIntervalTypeInfo gets type id and type mod of the min/max values
 * of shard intervals for a distributed table with given partition method
 * and partition column.
 */
void
GetIntervalTypeInfo(char partitionMethod, Var *partitionColumn,
					Oid *intervalTypeId, int32 *intervalTypeMod)
{
	*intervalTypeId = InvalidOid;
	*intervalTypeMod = -1;

	switch (partitionMethod)
	{
		case DISTRIBUTE_BY_APPEND:
		case DISTRIBUTE_BY_RANGE:
		{
			/* we need a valid partition column Var in this case */
			if (partitionColumn == NULL)
			{
				ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR),
								errmsg("unexpected partition column value: null"),
								errdetail("Please report this to the Citus core team.")));
			}
			*intervalTypeId = partitionColumn->vartype;
			*intervalTypeMod = partitionColumn->vartypmod;
			break;
		}

		case DISTRIBUTE_BY_HASH:
		{
			*intervalTypeId = INT4OID;
			break;
		}

		default:
		{
			break;
		}
	}
}
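

/*
 * Worked example (illustrative only): for a hash-distributed table the shard
 * intervals store int4 hash ranges, so GetIntervalTypeInfo sets
 * *intervalTypeId to INT4OID regardless of the partition column's own type.
 * For a range-distributed table on a timestamptz column, the interval type
 * is the column's type instead, i.e. *intervalTypeId = TIMESTAMPTZOID with
 * the column's typmod.
 */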


/*
 * TupleToShardInterval transforms the specified dist_shard tuple into a new
 * ShardInterval using the provided descriptor and partition type information.
 */
ShardInterval *
TupleToShardInterval(HeapTuple heapTuple, TupleDesc tupleDescriptor,
					 Oid intervalTypeId, int32 intervalTypeMod)
{
	Datum datumArray[Natts_pg_dist_shard];
	bool isNullArray[Natts_pg_dist_shard];

	/*
	 * We use heap_deform_tuple() instead of heap_getattr() to expand tuple
	 * to contain missing values when ALTER TABLE ADD COLUMN happens.
	 */
	heap_deform_tuple(heapTuple, tupleDescriptor, datumArray, isNullArray);

	ShardInterval *shardInterval =
		DeformedDistShardTupleToShardInterval(datumArray, isNullArray,
											  intervalTypeId, intervalTypeMod);

	return shardInterval;
}


/*
 * DeformedDistShardTupleToShardInterval transforms the specified deformed
 * pg_dist_shard tuple into a new ShardInterval.
 */
ShardInterval *
DeformedDistShardTupleToShardInterval(Datum *datumArray, bool *isNullArray,
									  Oid intervalTypeId, int32 intervalTypeMod)
{
	Oid inputFunctionId = InvalidOid;
	Oid typeIoParam = InvalidOid;
	Datum minValue = 0;
	Datum maxValue = 0;
	bool minValueExists = false;
	bool maxValueExists = false;
	int16 intervalTypeLen = 0;
	bool intervalByVal = false;
	char intervalAlign = '0';
	char intervalDelim = '0';

	Oid relationId =
		DatumGetObjectId(datumArray[Anum_pg_dist_shard_logicalrelid - 1]);
	int64 shardId = DatumGetInt64(datumArray[Anum_pg_dist_shard_shardid - 1]);
	char storageType = DatumGetChar(datumArray[Anum_pg_dist_shard_shardstorage - 1]);
	Datum minValueTextDatum = datumArray[Anum_pg_dist_shard_shardminvalue - 1];
	Datum maxValueTextDatum = datumArray[Anum_pg_dist_shard_shardmaxvalue - 1];

	bool minValueNull = isNullArray[Anum_pg_dist_shard_shardminvalue - 1];
	bool maxValueNull = isNullArray[Anum_pg_dist_shard_shardmaxvalue - 1];

	if (!minValueNull && !maxValueNull)
	{
		char *minValueString = TextDatumGetCString(minValueTextDatum);
		char *maxValueString = TextDatumGetCString(maxValueTextDatum);

		/* TODO: move this up the call stack to avoid per-tuple invocation? */
		get_type_io_data(intervalTypeId, IOFunc_input, &intervalTypeLen,
						 &intervalByVal,
						 &intervalAlign, &intervalDelim, &typeIoParam,
						 &inputFunctionId);

		/* finally convert min/max values to their actual types */
		minValue = OidInputFunctionCall(inputFunctionId, minValueString,
										typeIoParam, intervalTypeMod);
		maxValue = OidInputFunctionCall(inputFunctionId, maxValueString,
										typeIoParam, intervalTypeMod);

		minValueExists = true;
		maxValueExists = true;
	}

	ShardInterval *shardInterval = CitusMakeNode(ShardInterval);
	shardInterval->relationId = relationId;
	shardInterval->storageType = storageType;
	shardInterval->valueTypeId = intervalTypeId;
	shardInterval->valueTypeLen = intervalTypeLen;
	shardInterval->valueByVal = intervalByVal;
	shardInterval->minValueExists = minValueExists;
	shardInterval->maxValueExists = maxValueExists;
	shardInterval->minValue = minValue;
	shardInterval->maxValue = maxValue;
	shardInterval->shardId = shardId;

	return shardInterval;
}
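

/*
 * Worked example (illustrative only): for a hash-distributed table with 4
 * shards, pg_dist_shard stores shardminvalue/shardmaxvalue as text, e.g.
 * '-2147483648' and '-1073741825' for the first shard. The code above looks
 * up int4's input function via get_type_io_data() and converts those strings
 * into int4 datums, so shardInterval->minValue/maxValue hold the binary
 * representation rather than text.
 */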


/*
 * CachedNamespaceLookup performs a cached lookup for the namespace (schema), with the
 * result cached in *cachedOid.
 */
static void
CachedNamespaceLookup(const char *nspname, Oid *cachedOid)
{
	/* force callbacks to be registered, so we always get notified upon changes */
	InitializeCaches();

	if (*cachedOid == InvalidOid)
	{
		*cachedOid = get_namespace_oid(nspname, true);

		if (*cachedOid == InvalidOid)
		{
			ereport(ERROR, (errmsg(
								"cache lookup failed for namespace %s, called too early?",
								nspname)));
		}
	}
}


/*
 * CachedRelationLookup performs a cached lookup for the relation
 * relationName, with the result cached in *cachedOid.
 */
static void
CachedRelationLookup(const char *relationName, Oid *cachedOid)
{
	CachedRelationNamespaceLookup(relationName, PG_CATALOG_NAMESPACE, cachedOid);
}


/*
 * CachedRelationLookupExtended performs a cached lookup for the relation
 * relationName, with the result cached in *cachedOid. Will _not_ throw an error when
 * missing_ok is set to true.
 */
static void
CachedRelationLookupExtended(const char *relationName, Oid *cachedOid, bool missing_ok)
{
	CachedRelationNamespaceLookupExtended(relationName, PG_CATALOG_NAMESPACE, cachedOid,
										  missing_ok);
}


static void
CachedRelationNamespaceLookup(const char *relationName, Oid relnamespace,
							  Oid *cachedOid)
{
	CachedRelationNamespaceLookupExtended(relationName, relnamespace, cachedOid, false);
}


static void
CachedRelationNamespaceLookupExtended(const char *relationName, Oid relnamespace,
									  Oid *cachedOid, bool missing_ok)
{
	/* force callbacks to be registered, so we always get notified upon changes */
	InitializeCaches();

	if (*cachedOid == InvalidOid)
	{
		*cachedOid = get_relname_relid(relationName, relnamespace);

		if (*cachedOid == InvalidOid && !missing_ok)
		{
			ereport(ERROR, (errmsg(
								"cache lookup failed for %s, called too early?",
								relationName)));
		}
	}
}


/*
 * RelationExists returns whether a relation with the given OID exists.
 */
bool
RelationExists(Oid relationId)
{
	HeapTuple relTuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relationId));

	bool relationExists = HeapTupleIsValid(relTuple);
	if (relationExists)
	{
		ReleaseSysCache(relTuple);
	}

	return relationExists;
}


/*
 * Register a relcache invalidation for a non-shared relation.
 *
 * We ignore the case that there's no corresponding pg_class entry - that
 * happens if we register a relcache invalidation (e.g. for a
 * pg_dist_partition deletion) after the relation has been dropped. That's ok,
 * because in those cases we're guaranteed to already have registered an
 * invalidation for the target relation.
 */
void
CitusInvalidateRelcacheByRelid(Oid relationId)
{
	HeapTuple classTuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relationId));

	if (HeapTupleIsValid(classTuple))
	{
		CacheInvalidateRelcacheByTuple(classTuple);
		ReleaseSysCache(classTuple);
	}
}


/*
 * Register a relcache invalidation for the distributed relation associated
 * with the shard.
 */
void
CitusInvalidateRelcacheByShardId(int64 shardId)
{
	ScanKeyData scanKey[1];
	int scanKeyCount = 1;
	Form_pg_dist_shard shardForm = NULL;
	Relation pgDistShard = table_open(DistShardRelationId(), AccessShareLock);

	/*
	 * Load shard, to find the associated relation id. Can't use
	 * LoadShardInterval directly because that'd fail if the shard doesn't
	 * exist anymore, which we can't have. Also lower overhead is desirable
	 * here.
	 */
	ScanKeyInit(&scanKey[0], Anum_pg_dist_shard_shardid,
				BTEqualStrategyNumber, F_INT8EQ, Int64GetDatum(shardId));

	SysScanDesc scanDescriptor = systable_beginscan(pgDistShard,
													DistShardShardidIndexId(), true,
													NULL, scanKeyCount, scanKey);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	if (HeapTupleIsValid(heapTuple))
	{
		shardForm = (Form_pg_dist_shard) GETSTRUCT(heapTuple);
		CitusInvalidateRelcacheByRelid(shardForm->logicalrelid);
	}
	else
	{
		/*
		 * Couldn't find associated relation. That can primarily happen in two cases:
		 *
		 * 1) A placement row is inserted before the shard row. That's fine,
		 *	  since we don't need invalidations via placements in that case.
		 *
		 * 2) The shard has been deleted, but some placements were
		 *	  unreachable, and the user is manually deleting the rows. Not
		 *	  much point in WARNING or ERRORing in that case either, there's
		 *	  nothing to invalidate.
		 *
		 * Hence we just emit a DEBUG5 message.
		 */
		ereport(DEBUG5, (errmsg(
							"could not find distributed relation to invalidate for "
							"shard "INT64_FORMAT, shardId)));
	}

	systable_endscan(scanDescriptor);
	table_close(pgDistShard, NoLock);

	/* bump command counter, to force invalidation to take effect */
	CommandCounterIncrement();
}


/*
 * DistNodeMetadata returns the single metadata jsonb object stored in
 * pg_dist_node_metadata.
 */
Datum
DistNodeMetadata(void)
{
	Datum metadata = 0;
	ScanKeyData scanKey[1];
	const int scanKeyCount = 0;

	Oid metadataTableOid = get_relname_relid("pg_dist_node_metadata",
											 PG_CATALOG_NAMESPACE);
	if (metadataTableOid == InvalidOid)
	{
		ereport(ERROR, (errmsg("pg_dist_node_metadata was not found")));
	}

	Relation pgDistNodeMetadata = table_open(metadataTableOid, AccessShareLock);
	SysScanDesc scanDescriptor = systable_beginscan(pgDistNodeMetadata,
													InvalidOid, false,
													NULL, scanKeyCount, scanKey);
	TupleDesc tupleDescriptor = RelationGetDescr(pgDistNodeMetadata);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	if (HeapTupleIsValid(heapTuple))
	{
		bool isNull = false;
		metadata = heap_getattr(heapTuple, Anum_pg_dist_node_metadata_metadata,
								tupleDescriptor, &isNull);
		Assert(!isNull);
	}
	else
	{
		ereport(ERROR, (errmsg(
							"could not find any entries in pg_dist_node_metadata")));
	}

	/*
	 * Copy the jsonb result before closing the table
	 * since that memory can be freed.
	 */
	metadata = JsonbPGetDatum(DatumGetJsonbPCopy(metadata));

	systable_endscan(scanDescriptor);
	table_close(pgDistNodeMetadata, AccessShareLock);

	return metadata;
}


/*
 * role_exists is a check constraint which ensures that roles referenced in the
 * pg_dist_authinfo catalog actually exist (at least at the time of insertion).
 */
Datum
role_exists(PG_FUNCTION_ARGS)
{
	Name roleName = PG_GETARG_NAME(0);
	bool roleExists = SearchSysCacheExists1(AUTHNAME, NameGetDatum(roleName));

	PG_RETURN_BOOL(roleExists);
}


/*
 * GetPoolinfoViaCatalog searches the pg_dist_poolinfo table for a row matching
 * the provided nodeId and returns the poolinfo field of this row if found.
 * Otherwise, this function returns NULL.
 */
char *
GetPoolinfoViaCatalog(int32 nodeId)
{
	ScanKeyData scanKey[1];
	const int scanKeyCount = 1;
	const AttrNumber nodeIdIdx = 1, poolinfoIdx = 2;
	Relation pgDistPoolinfo = table_open(DistPoolinfoRelationId(), AccessShareLock);
	bool indexOK = true;
	char *poolinfo = NULL;

	/* set scan arguments */
	ScanKeyInit(&scanKey[0], nodeIdIdx, BTEqualStrategyNumber, F_INT4EQ,
				Int32GetDatum(nodeId));

	SysScanDesc scanDescriptor = systable_beginscan(pgDistPoolinfo, DistPoolinfoIndexId(),
													indexOK,
													NULL, scanKeyCount, scanKey);

	HeapTuple heapTuple = systable_getnext(scanDescriptor);
	if (HeapTupleIsValid(heapTuple))
	{
		TupleDesc tupleDescriptor = RelationGetDescr(pgDistPoolinfo);
		bool isNull = false;

		Datum poolinfoDatum = heap_getattr(heapTuple, poolinfoIdx, tupleDescriptor,
										   &isNull);

		Assert(!isNull);

		poolinfo = TextDatumGetCString(poolinfoDatum);
	}

	systable_endscan(scanDescriptor);
	table_close(pgDistPoolinfo, AccessShareLock);

	return poolinfo;
}


/*
 * GetAuthinfoViaCatalog searches pg_dist_authinfo for a row matching a
 * provided role and node id. Three types of rules are currently permitted:
 * those matching a specific node (non-zero nodeid), those matching all nodes
 * (a nodeid of zero), and those denoting a loopback connection (nodeid of
 * -1). Rolename must always be specified. If both types of rules exist for a
 * given user/host, the more specific (host-specific) rule wins. This means
 * that when both a zero and non-zero row exist for a given rolename, the
 * non-zero row has precedence.
 *
 * In short, this function will return a rule matching nodeId, or if that's
 * absent the rule for 0, or if that's absent, an empty string. Callers can
 * just use the returned authinfo and know the precedence has been honored.
 */
char *
GetAuthinfoViaCatalog(const char *roleName, int64 nodeId)
{
	char *authinfo = "";
	Datum nodeIdDatumArray[2] = {
		Int32GetDatum(nodeId),
		Int32GetDatum(WILDCARD_NODE_ID)
	};
	ArrayType *nodeIdArrayType = DatumArrayToArrayType(nodeIdDatumArray,
													   lengthof(nodeIdDatumArray),
													   INT4OID);
	ScanKeyData scanKey[2];
	const AttrNumber nodeIdIdx = 1, roleIdx = 2, authinfoIdx = 3;

	/*
	 * Our index's definition ensures correct precedence for positive nodeIds,
	 * but when handling a negative value we need to traverse backwards to keep
	 * the invariant that the zero rule has lowest precedence.
	 */
	ScanDirection direction = (nodeId < 0) ? BackwardScanDirection : ForwardScanDirection;

	if (ReindexIsProcessingIndex(DistAuthinfoIndexId()))
	{
		ereport(ERROR, (errmsg("authinfo is being reindexed; try again")));
	}

	memset(&scanKey, 0, sizeof(scanKey));

	/* first column in index is rolename, need exact match there ... */
	ScanKeyInit(&scanKey[0], roleIdx, BTEqualStrategyNumber,
				F_NAMEEQ, CStringGetDatum(roleName));

	/* second column is nodeId, match against array of nodeid and zero (any node) ... */
	ScanKeyInit(&scanKey[1], nodeIdIdx, BTEqualStrategyNumber,
				F_INT4EQ, PointerGetDatum(nodeIdArrayType));
	scanKey[1].sk_flags |= SK_SEARCHARRAY;

	/*
	 * It's important that we traverse the index in order: we need to ensure
	 * that rules with nodeid 0 are encountered last. We'll use the first tuple
	 * we find. This ordering defines the precedence order of authinfo rules.
	 */
	Relation pgDistAuthinfo = table_open(DistAuthinfoRelationId(), AccessShareLock);
	Relation pgDistAuthinfoIdx = index_open(DistAuthinfoIndexId(), AccessShareLock);
	SysScanDesc scanDescriptor = systable_beginscan_ordered(pgDistAuthinfo,
															pgDistAuthinfoIdx,
															NULL, lengthof(scanKey),
															scanKey);

	/* first tuple represents highest-precedence rule for this node */
	HeapTuple authinfoTuple = systable_getnext_ordered(scanDescriptor, direction);
	if (HeapTupleIsValid(authinfoTuple))
	{
		TupleDesc tupleDescriptor = RelationGetDescr(pgDistAuthinfo);
		bool isNull = false;

		Datum authinfoDatum = heap_getattr(authinfoTuple, authinfoIdx,
										   tupleDescriptor, &isNull);

		Assert(!isNull);

		authinfo = TextDatumGetCString(authinfoDatum);
	}

	systable_endscan_ordered(scanDescriptor);
	index_close(pgDistAuthinfoIdx, AccessShareLock);
	table_close(pgDistAuthinfo, AccessShareLock);

	return authinfo;
}
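

/*
 * Worked example (illustrative only): suppose pg_dist_authinfo contains two
 * rows for role 'alice', one with nodeid 0 (any node) and one with nodeid 3.
 * GetAuthinfoViaCatalog("alice", 3) scans the index in order and returns the
 * nodeid-3 row's authinfo, while GetAuthinfoViaCatalog("alice", 7) falls
 * back to the nodeid-0 row. If no row matches at all, the empty string is
 * returned.
 */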


/*
 * authinfo_valid is a check constraint to verify that an inserted authinfo row
 * uses only permitted libpq parameters.
 */
Datum
authinfo_valid(PG_FUNCTION_ARGS)
{
	char *authinfo = TextDatumGetCString(PG_GETARG_DATUM(0));

	/* this array _must_ be kept in an order usable by bsearch */
	const char *allowList[] = { "password", "sslcert", "sslkey" };
	bool authinfoValid = CheckConninfo(authinfo, allowList, lengthof(allowList), NULL);

	PG_RETURN_BOOL(authinfoValid);
}


/*
 * poolinfo_valid is a check constraint to verify that an inserted poolinfo row
 * uses only permitted libpq parameters.
 */
Datum
poolinfo_valid(PG_FUNCTION_ARGS)
{
	char *poolinfo = TextDatumGetCString(PG_GETARG_DATUM(0));

	/* this array _must_ be kept in an order usable by bsearch */
	const char *allowList[] = { "dbname", "host", "port" };
	bool poolinfoValid = CheckConninfo(poolinfo, allowList, lengthof(allowList), NULL);

	PG_RETURN_BOOL(poolinfoValid);
}
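

/*
 * Illustrative sketch (not part of the build): with the allow-list above, a
 * poolinfo value such as 'host=10.0.0.2 port=6432 dbname=citus' passes the
 * check constraint, whereas 'host=10.0.0.2 options=-csearch_path=public' is
 * rejected because "options" is not in the allow-list.
 */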