citus/src/backend/distributed/utils/citus_stat_tenants.c

798 lines
22 KiB
C

/*-------------------------------------------------------------------------
*
* citus_stat_tenants.c
* Routines related to the multi tenant monitor.
*
* Copyright (c) Citus Data, Inc.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "unistd.h"
#include "distributed/citus_safe_lib.h"
#include "distributed/colocation_utils.h"
#include "distributed/distributed_planner.h"
#include "distributed/jsonbutils.h"
#include "distributed/log_utils.h"
#include "distributed/listutils.h"
#include "distributed/metadata_cache.h"
#include "distributed/multi_executor.h"
#include "distributed/tuplestore.h"
#include "distributed/utils/citus_stat_tenants.h"
#include "executor/execdesc.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "sys/time.h"
#include "utils/builtins.h"
#include "utils/datetime.h"
#include "utils/json.h"
#include <time.h>
static void AttributeMetricsIfApplicable(void);
ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
#define ATTRIBUTE_PREFIX "/*{\"tId\":"
#define ATTRIBUTE_STRING_FORMAT "/*{\"tId\":%s,\"cId\":%d}*/"
#define STAT_TENANTS_COLUMNS 9
#define ONE_QUERY_SCORE 1000000000
static char AttributeToTenant[MAX_TENANT_ATTRIBUTE_LENGTH] = "";
static CmdType AttributeToCommandType = CMD_UNKNOWN;
static int AttributeToColocationGroupId = INVALID_COLOCATION_ID;
static clock_t QueryStartClock = { 0 };
static clock_t QueryEndClock = { 0 };
static const char *SharedMemoryNameForMultiTenantMonitor =
"Shared memory for multi tenant monitor";
static char *TenantTrancheName = "Tenant Tranche";
static char *MonitorTrancheName = "Multi Tenant Monitor Tranche";
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
static int CompareTenantScore(const void *leftElement, const void *rightElement);
static void UpdatePeriodsIfNecessary(TenantStats *tenantStats, TimestampTz queryTime);
static void ReduceScoreIfNecessary(TenantStats *tenantStats, TimestampTz queryTime);
static void EvictTenantsIfNecessary(TimestampTz queryTime);
static void RecordTenantStats(TenantStats *tenantStats, TimestampTz queryTime);
static void CreateMultiTenantMonitor(void);
static MultiTenantMonitor * CreateSharedMemoryForMultiTenantMonitor(void);
static MultiTenantMonitor * GetMultiTenantMonitor(void);
static void MultiTenantMonitorSMInit(void);
static int CreateTenantStats(MultiTenantMonitor *monitor, TimestampTz queryTime);
static int FindTenantStats(MultiTenantMonitor *monitor);
static size_t MultiTenantMonitorshmemSize(void);
static char * ExtractTopComment(const char *inputString);
static char * EscapeCommentChars(const char *str);
static char * UnescapeCommentChars(const char *str);
int StatTenantsLogLevel = CITUS_LOG_LEVEL_OFF;
int StatTenantsPeriod = (time_t) 60;
int StatTenantsLimit = 100;
int StatTenantsTrack = STAT_TENANTS_TRACK_NONE;
PG_FUNCTION_INFO_V1(citus_stat_tenants_local);
PG_FUNCTION_INFO_V1(citus_stat_tenants_local_reset);
/*
* citus_stat_tenants_local finds, updates and returns the statistics for tenants.
*/
Datum
citus_stat_tenants_local(PG_FUNCTION_ARGS)
{
CheckCitusVersion(ERROR);
/*
* We keep more than StatTenantsLimit tenants in our monitor.
* We do this to not lose data if a tenant falls out of top StatTenantsLimit in case they need to return soon.
* Normally we return StatTenantsLimit tenants but if returnAllTenants is true we return all of them.
*/
bool returnAllTenants = PG_GETARG_BOOL(0);
TupleDesc tupleDescriptor = NULL;
Tuplestorestate *tupleStore = SetupTuplestore(fcinfo, &tupleDescriptor);
TimestampTz monitoringTime = GetCurrentTimestamp();
Datum values[STAT_TENANTS_COLUMNS];
bool isNulls[STAT_TENANTS_COLUMNS];
MultiTenantMonitor *monitor = GetMultiTenantMonitor();
if (monitor == NULL)
{
PG_RETURN_VOID();
}
LWLockAcquire(&monitor->lock, LW_EXCLUSIVE);
int numberOfRowsToReturn = 0;
if (returnAllTenants)
{
numberOfRowsToReturn = monitor->tenantCount;
}
else
{
numberOfRowsToReturn = Min(monitor->tenantCount, StatTenantsLimit);
}
for (int tenantIndex = 0; tenantIndex < monitor->tenantCount; tenantIndex++)
{
UpdatePeriodsIfNecessary(&monitor->tenants[tenantIndex], monitoringTime);
ReduceScoreIfNecessary(&monitor->tenants[tenantIndex], monitoringTime);
}
SafeQsort(monitor->tenants, monitor->tenantCount, sizeof(TenantStats),
CompareTenantScore);
for (int i = 0; i < numberOfRowsToReturn; i++)
{
memset(values, 0, sizeof(values));
memset(isNulls, false, sizeof(isNulls));
TenantStats *tenantStats = &monitor->tenants[i];
values[0] = Int32GetDatum(tenantStats->colocationGroupId);
values[1] = PointerGetDatum(cstring_to_text(tenantStats->tenantAttribute));
values[2] = Int32GetDatum(tenantStats->readsInThisPeriod);
values[3] = Int32GetDatum(tenantStats->readsInLastPeriod);
values[4] = Int32GetDatum(tenantStats->readsInThisPeriod +
tenantStats->writesInThisPeriod);
values[5] = Int32GetDatum(tenantStats->readsInLastPeriod +
tenantStats->writesInLastPeriod);
values[6] = Float8GetDatum(tenantStats->cpuUsageInThisPeriod);
values[7] = Float8GetDatum(tenantStats->cpuUsageInLastPeriod);
values[8] = Int64GetDatum(tenantStats->score);
tuplestore_putvalues(tupleStore, tupleDescriptor, values, isNulls);
}
LWLockRelease(&monitor->lock);
PG_RETURN_VOID();
}
/*
* citus_stat_tenants_local_reset resets monitor for tenant statistics
* on the local node.
*/
Datum
citus_stat_tenants_local_reset(PG_FUNCTION_ARGS)
{
MultiTenantMonitor *monitor = GetMultiTenantMonitor();
monitor->tenantCount = 0;
PG_RETURN_VOID();
}
/*
* AttributeQueryIfAnnotated checks the query annotation and if the query is annotated
* for the tenant statistics monitoring this function records the tenant attributes.
*/
void
AttributeQueryIfAnnotated(const char *query_string, CmdType commandType)
{
if (StatTenantsTrack == STAT_TENANTS_TRACK_NONE)
{
return;
}
strcpy_s(AttributeToTenant, sizeof(AttributeToTenant), "");
if (query_string == NULL)
{
return;
}
if (strncmp(ATTRIBUTE_PREFIX, query_string, strlen(ATTRIBUTE_PREFIX)) == 0)
{
char *annotation = ExtractTopComment(query_string);
if (annotation != NULL)
{
Datum jsonbDatum = DirectFunctionCall1(jsonb_in, PointerGetDatum(annotation));
text *tenantIdTextP = ExtractFieldTextP(jsonbDatum, "tId");
char *tenantId = NULL;
if (tenantIdTextP != NULL)
{
tenantId = UnescapeCommentChars(text_to_cstring(tenantIdTextP));
}
int colocationId = ExtractFieldInt32(jsonbDatum, "cId",
INVALID_COLOCATION_ID);
AttributeTask(tenantId, colocationId, commandType);
}
}
}
/*
* AttributeTask assigns the given attributes of a tenant and starts a timer
*/
void
AttributeTask(char *tenantId, int colocationId, CmdType commandType)
{
if (StatTenantsTrack == STAT_TENANTS_TRACK_NONE ||
tenantId == NULL || colocationId == INVALID_COLOCATION_ID)
{
return;
}
AttributeToColocationGroupId = colocationId;
strncpy_s(AttributeToTenant, MAX_TENANT_ATTRIBUTE_LENGTH, tenantId,
MAX_TENANT_ATTRIBUTE_LENGTH - 1);
AttributeToCommandType = commandType;
QueryStartClock = clock();
}
/*
* AnnotateQuery annotates the query with tenant attributes.
*/
char *
AnnotateQuery(char *queryString, Const *partitionKeyValue, int colocationId)
{
if (StatTenantsTrack == STAT_TENANTS_TRACK_NONE || partitionKeyValue == NULL)
{
return queryString;
}
char *partitionKeyValueString = DatumToString(partitionKeyValue->constvalue,
partitionKeyValue->consttype);
char *commentCharsEscaped = EscapeCommentChars(partitionKeyValueString);
StringInfo escapedSourceName = makeStringInfo();
escape_json(escapedSourceName, commentCharsEscaped);
StringInfo newQuery = makeStringInfo();
appendStringInfo(newQuery, ATTRIBUTE_STRING_FORMAT, escapedSourceName->data,
colocationId);
appendStringInfoString(newQuery, queryString);
return newQuery->data;
}
/*
* CitusAttributeToEnd keeps the statistics for the tenant and calls the previously installed end hook
* or the standard executor end function.
*/
void
CitusAttributeToEnd(QueryDesc *queryDesc)
{
/*
* At the end of the Executor is the last moment we have to attribute the previous
* attribution to a tenant, if applicable
*/
AttributeMetricsIfApplicable();
/* now call in to the previously installed hook, or the standard implementation */
if (prev_ExecutorEnd)
{
prev_ExecutorEnd(queryDesc);
}
else
{
standard_ExecutorEnd(queryDesc);
}
}
/*
* CompareTenantScore is used to sort the tenant statistics by score
* in descending order.
*/
static int
CompareTenantScore(const void *leftElement, const void *rightElement)
{
const TenantStats *leftTenant = (const TenantStats *) leftElement;
const TenantStats *rightTenant = (const TenantStats *) rightElement;
if (leftTenant->score > rightTenant->score)
{
return -1;
}
else if (leftTenant->score < rightTenant->score)
{
return 1;
}
return 0;
}
/*
* AttributeMetricsIfApplicable updates the metrics for current tenant's statistics
*/
static void
AttributeMetricsIfApplicable()
{
if (StatTenantsTrack == STAT_TENANTS_TRACK_NONE ||
AttributeToTenant[0] == '\0')
{
return;
}
/*
* return if we are not in the top level to make sure we are not
* stopping counting time for a sub-level execution
*/
if (ExecutorLevel != 0 || PlannerLevel != 0)
{
return;
}
QueryEndClock = clock();
TimestampTz queryTime = GetCurrentTimestamp();
MultiTenantMonitor *monitor = GetMultiTenantMonitor();
/*
* We need to acquire the monitor lock in shared mode to check if the tenant is
* already in the monitor. If it is not, we need to acquire the lock in
* exclusive mode to add the tenant to the monitor.
*
* We need to check again if the tenant is in the monitor after acquiring the
* exclusive lock to avoid adding the tenant twice. Some other backend might
* have added the tenant while we were waiting for the lock.
*
* After releasing the exclusive lock, we need to acquire the lock in shared
* mode to update the tenant's statistics. We need to check again if the tenant
* is in the monitor after acquiring the shared lock because some other backend
* might have removed the tenant while we were waiting for the lock.
*/
LWLockAcquire(&monitor->lock, LW_SHARED);
int currentTenantIndex = FindTenantStats(monitor);
if (currentTenantIndex != -1)
{
TenantStats *tenantStats = &monitor->tenants[currentTenantIndex];
LWLockAcquire(&tenantStats->lock, LW_EXCLUSIVE);
UpdatePeriodsIfNecessary(tenantStats, queryTime);
ReduceScoreIfNecessary(tenantStats, queryTime);
RecordTenantStats(tenantStats, queryTime);
LWLockRelease(&tenantStats->lock);
}
else
{
LWLockRelease(&monitor->lock);
LWLockAcquire(&monitor->lock, LW_EXCLUSIVE);
currentTenantIndex = FindTenantStats(monitor);
if (currentTenantIndex == -1)
{
currentTenantIndex = CreateTenantStats(monitor, queryTime);
}
LWLockRelease(&monitor->lock);
LWLockAcquire(&monitor->lock, LW_SHARED);
currentTenantIndex = FindTenantStats(monitor);
if (currentTenantIndex != -1)
{
TenantStats *tenantStats = &monitor->tenants[currentTenantIndex];
LWLockAcquire(&tenantStats->lock, LW_EXCLUSIVE);
UpdatePeriodsIfNecessary(tenantStats, queryTime);
ReduceScoreIfNecessary(tenantStats, queryTime);
RecordTenantStats(tenantStats, queryTime);
LWLockRelease(&tenantStats->lock);
}
}
LWLockRelease(&monitor->lock);
strcpy_s(AttributeToTenant, sizeof(AttributeToTenant), "");
}
/*
* UpdatePeriodsIfNecessary moves the query counts to previous periods if a enough time has passed.
*
* If 1 period has passed after the latest query, this function moves this period's counts to the last period
* and cleans this period's statistics.
*
* If 2 or more periods has passed after the last query, this function cleans all both this and last period's
* statistics.
*/
static void
UpdatePeriodsIfNecessary(TenantStats *tenantStats, TimestampTz queryTime)
{
long long int periodInMicroSeconds = StatTenantsPeriod * USECS_PER_SEC;
long long int periodInMilliSeconds = StatTenantsPeriod * 1000;
TimestampTz periodStart = queryTime - (queryTime % periodInMicroSeconds);
/*
* If the last query in this tenant was before the start of current period
* but there are some query count for this period we move them to the last period.
*/
if (tenantStats->lastQueryTime < periodStart &&
(tenantStats->writesInThisPeriod || tenantStats->readsInThisPeriod))
{
tenantStats->writesInLastPeriod = tenantStats->writesInThisPeriod;
tenantStats->writesInThisPeriod = 0;
tenantStats->readsInLastPeriod = tenantStats->readsInThisPeriod;
tenantStats->readsInThisPeriod = 0;
tenantStats->cpuUsageInLastPeriod = tenantStats->cpuUsageInThisPeriod;
tenantStats->cpuUsageInThisPeriod = 0;
}
/*
* If the last query is more than two periods ago, we clean the last period counts too.
*/
if (TimestampDifferenceExceeds(tenantStats->lastQueryTime, periodStart,
periodInMilliSeconds))
{
tenantStats->writesInLastPeriod = 0;
tenantStats->readsInLastPeriod = 0;
tenantStats->cpuUsageInLastPeriod = 0;
}
}
/*
* ReduceScoreIfNecessary reduces the tenant score only if it is necessary.
*
* We halve the tenants' scores after each period. This function checks the number of
* periods that passed after the lsat score reduction and reduces the score accordingly.
*/
static void
ReduceScoreIfNecessary(TenantStats *tenantStats, TimestampTz queryTime)
{
long long int periodInMicroSeconds = StatTenantsPeriod * USECS_PER_SEC;
TimestampTz periodStart = queryTime - (queryTime % periodInMicroSeconds);
/*
* With each query we increase the score of tenant by ONE_QUERY_SCORE.
* After one period we halve the scores.
*
* Here we calculate how many periods passed after the last time we did score reduction
* If the latest score reduction was in this period this number should be 0,
* if it was in the last period this number should be 1 and so on.
*/
int periodCountAfterLastScoreReduction = (periodStart -
tenantStats->lastScoreReduction +
periodInMicroSeconds - 1) /
periodInMicroSeconds;
/*
* This should not happen but let's make sure
*/
if (periodCountAfterLastScoreReduction < 0)
{
periodCountAfterLastScoreReduction = 0;
}
/*
* If the last score reduction was not in this period we do score reduction now.
*/
if (periodCountAfterLastScoreReduction > 0)
{
tenantStats->score >>= periodCountAfterLastScoreReduction;
tenantStats->lastScoreReduction = queryTime;
}
}
/*
* EvictTenantsIfNecessary sorts and evicts the tenants if the tenant count is more than or
* equal to 3 * StatTenantsLimit.
*/
static void
EvictTenantsIfNecessary(TimestampTz queryTime)
{
MultiTenantMonitor *monitor = GetMultiTenantMonitor();
/*
* We keep up to StatTenantsLimit * 3 tenants instead of StatTenantsLimit,
* so we don't lose data immediately after a tenant is out of top StatTenantsLimit
*
* Every time tenant count hits StatTenantsLimit * 3, we reduce it back to StatTenantsLimit * 2.
*/
if (monitor->tenantCount >= StatTenantsLimit * 3)
{
for (int tenantIndex = 0; tenantIndex < monitor->tenantCount; tenantIndex++)
{
ReduceScoreIfNecessary(&monitor->tenants[tenantIndex], queryTime);
}
SafeQsort(monitor->tenants, monitor->tenantCount, sizeof(TenantStats),
CompareTenantScore);
monitor->tenantCount = StatTenantsLimit * 2;
}
}
/*
* RecordTenantStats records the query statistics for the tenant.
*/
static void
RecordTenantStats(TenantStats *tenantStats, TimestampTz queryTime)
{
if (tenantStats->score < LLONG_MAX - ONE_QUERY_SCORE)
{
tenantStats->score += ONE_QUERY_SCORE;
}
else
{
tenantStats->score = LLONG_MAX;
}
if (AttributeToCommandType == CMD_SELECT)
{
tenantStats->readsInThisPeriod++;
}
else if (AttributeToCommandType == CMD_UPDATE ||
AttributeToCommandType == CMD_INSERT ||
AttributeToCommandType == CMD_DELETE)
{
tenantStats->writesInThisPeriod++;
}
double queryCpuTime = ((double) (QueryEndClock - QueryStartClock)) / CLOCKS_PER_SEC;
tenantStats->cpuUsageInThisPeriod += queryCpuTime;
tenantStats->lastQueryTime = queryTime;
}
/*
* CreateMultiTenantMonitor creates the data structure for multi tenant monitor.
*/
static void
CreateMultiTenantMonitor()
{
MultiTenantMonitor *monitor = CreateSharedMemoryForMultiTenantMonitor();
monitor->tenantCount = 0;
}
/*
* CreateSharedMemoryForMultiTenantMonitor creates a dynamic shared memory segment for multi tenant monitor.
*/
static MultiTenantMonitor *
CreateSharedMemoryForMultiTenantMonitor()
{
bool found = false;
MultiTenantMonitor *monitor = ShmemInitStruct(SharedMemoryNameForMultiTenantMonitor,
MultiTenantMonitorshmemSize(),
&found);
if (found)
{
return monitor;
}
monitor->namedLockTranche.trancheId = LWLockNewTrancheId();
monitor->namedLockTranche.trancheName = MonitorTrancheName;
LWLockRegisterTranche(monitor->namedLockTranche.trancheId,
monitor->namedLockTranche.trancheName);
LWLockInitialize(&monitor->lock, monitor->namedLockTranche.trancheId);
return monitor;
}
/*
* GetMultiTenantMonitor returns the data structure for multi tenant monitor.
*/
static MultiTenantMonitor *
GetMultiTenantMonitor()
{
bool found = false;
MultiTenantMonitor *monitor = ShmemInitStruct(SharedMemoryNameForMultiTenantMonitor,
MultiTenantMonitorshmemSize(),
&found);
if (!found)
{
elog(WARNING, "monitor not found");
return NULL;
}
return monitor;
}
/*
* InitializeMultiTenantMonitorSMHandleManagement sets up the shared memory startup hook
* so that the multi tenant monitor can be initialized and stored in shared memory.
*/
void
InitializeMultiTenantMonitorSMHandleManagement()
{
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = MultiTenantMonitorSMInit;
}
/*
* MultiTenantMonitorSMInit initializes the shared memory for MultiTenantMonitorSMData.
*/
static void
MultiTenantMonitorSMInit()
{
CreateMultiTenantMonitor();
if (prev_shmem_startup_hook != NULL)
{
prev_shmem_startup_hook();
}
}
/*
* CreateTenantStats creates the data structure for a tenant's statistics.
*
* Calling this function should be protected by the monitor->lock in LW_EXCLUSIVE mode.
*/
static int
CreateTenantStats(MultiTenantMonitor *monitor, TimestampTz queryTime)
{
/*
* If the tenant count reached 3 * StatTenantsLimit, we evict the tenants
* with the lowest score.
*/
EvictTenantsIfNecessary(queryTime);
int tenantIndex = monitor->tenantCount;
memset(&monitor->tenants[tenantIndex], 0, sizeof(monitor->tenants[tenantIndex]));
strcpy_s(monitor->tenants[tenantIndex].tenantAttribute,
sizeof(monitor->tenants[tenantIndex].tenantAttribute), AttributeToTenant);
monitor->tenants[tenantIndex].colocationGroupId = AttributeToColocationGroupId;
monitor->tenants[tenantIndex].namedLockTranche.trancheId = LWLockNewTrancheId();
monitor->tenants[tenantIndex].namedLockTranche.trancheName = TenantTrancheName;
LWLockRegisterTranche(monitor->tenants[tenantIndex].namedLockTranche.trancheId,
monitor->tenants[tenantIndex].namedLockTranche.trancheName);
LWLockInitialize(&monitor->tenants[tenantIndex].lock,
monitor->tenants[tenantIndex].namedLockTranche.trancheId);
monitor->tenantCount++;
return tenantIndex;
}
/*
* FindTenantStats finds the index for the current tenant's statistics.
*/
static int
FindTenantStats(MultiTenantMonitor *monitor)
{
for (int i = 0; i < monitor->tenantCount; i++)
{
TenantStats *tenantStats = &monitor->tenants[i];
if (strcmp(tenantStats->tenantAttribute, AttributeToTenant) == 0 &&
tenantStats->colocationGroupId == AttributeToColocationGroupId)
{
return i;
}
}
return -1;
}
/*
* MultiTenantMonitorshmemSize calculates the size of the multi tenant monitor using
* StatTenantsLimit parameter.
*/
static size_t
MultiTenantMonitorshmemSize(void)
{
Size size = sizeof(MultiTenantMonitor);
size = add_size(size, mul_size(sizeof(TenantStats), StatTenantsLimit * 3));
return size;
}
/*
* ExtractTopComment extracts the top-level multi-line comment from a given input string.
*/
static char *
ExtractTopComment(const char *inputString)
{
int commentCharsLength = 2;
int inputStringLen = strlen(inputString);
if (inputStringLen < commentCharsLength)
{
return NULL;
}
const char *commentStartChars = "/*";
const char *commentEndChars = "*/";
/* If query doesn't start with a comment, return NULL */
if (strstr(inputString, commentStartChars) != inputString)
{
return NULL;
}
StringInfo commentData = makeStringInfo();
/* Skip the comment start characters */
const char *commentStart = inputString + commentCharsLength;
/* Find the first comment end character */
const char *commentEnd = strstr(commentStart, commentEndChars);
if (commentEnd == NULL)
{
return NULL;
}
/* Append the comment to the StringInfo buffer */
int commentLength = commentEnd - commentStart;
appendStringInfo(commentData, "%.*s", commentLength, commentStart);
/* Return the extracted comment */
return commentData->data;
}
/* EscapeCommentChars adds a backslash before each occurrence of '*' or '/' in the input string */
static char *
EscapeCommentChars(const char *str)
{
int originalStringLength = strlen(str);
StringInfo escapedString = makeStringInfo();
for (int originalStringIndex = 0; originalStringIndex < originalStringLength;
originalStringIndex++)
{
if (str[originalStringIndex] == '*' || str[originalStringIndex] == '/')
{
appendStringInfoChar(escapedString, '\\');
}
appendStringInfoChar(escapedString, str[originalStringIndex]);
}
return escapedString->data;
}
/* UnescapeCommentChars removes the backslash that precedes '*' or '/' in the input string. */
static char *
UnescapeCommentChars(const char *str)
{
int originalStringLength = strlen(str);
StringInfo unescapedString = makeStringInfo();
for (int originalStringindex = 0; originalStringindex < originalStringLength;
originalStringindex++)
{
if (str[originalStringindex] == '\\' &&
originalStringindex < originalStringLength - 1 &&
(str[originalStringindex + 1] == '*' ||
str[originalStringindex + 1] == '/'))
{
originalStringindex++;
}
appendStringInfoChar(unescapedString, str[originalStringindex]);
}
return unescapedString->data;
}