Merge pull request #5766 from citusdata/implement_ddl_blocked

Improve citus_lock_waits
pull/5773/head
Önder Kalacı 2022-03-07 11:20:23 +01:00 committed by GitHub
commit ad346fedcf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 271 additions and 54 deletions

View File

@ -2226,6 +2226,19 @@ CitusAuthHook(Port *port, int status)
"regular client connections",
MaxClientConnections)));
}
/*
* Right after this, before we assign global pid, this backend
* might get blocked by a DDL as that happens during parsing.
*
* That's why, lets mark the backend as an external backend
* which is likely to execute a distributed command.
*
* We do this so that this backend gets the chance to show
* up in citus_lock_waits.
*/
InitializeBackendData();
SetBackendDataDistributedCommandOriginator(true);
}
/* let other authentication hooks to kick in first */

View File

@ -35,6 +35,16 @@ DROP FUNCTION IF EXISTS pg_catalog.citus_dist_stat_activity() CASCADE;
DROP FUNCTION IF EXISTS pg_catalog.citus_worker_stat_activity() CASCADE;
#include "udfs/citus_dist_stat_activity/11.0-1.sql"
-- a very simple helper function defined for citus_lock_waits
CREATE OR REPLACE FUNCTION get_nodeid_for_groupid(groupIdInput int) RETURNS int AS $$
DECLARE
returnNodeNodeId int := 0;
begin
SELECT nodeId into returnNodeNodeId FROM pg_dist_node WHERE groupid = groupIdInput and nodecluster = current_setting('citus.cluster_name');
RETURN returnNodeNodeId;
end
$$ LANGUAGE plpgsql;
#include "udfs/citus_lock_waits/11.0-1.sql"
#include "udfs/pg_cancel_backend/11.0-1.sql"

View File

@ -357,6 +357,7 @@ GRANT SELECT ON pg_catalog.citus_lock_waits TO PUBLIC;
DROP FUNCTION pg_catalog.citus_finalize_upgrade_to_citus11(bool);
DROP FUNCTION pg_catalog.citus_calculate_gpid(integer,integer);
DROP FUNCTION pg_catalog.citus_backend_gpid();
DROP FUNCTION get_nodeid_for_groupid(integer);
RESET search_path;

View File

@ -2,25 +2,44 @@ SET search_path = 'pg_catalog';
CREATE VIEW citus.citus_lock_waits AS
WITH
unique_global_wait_edges_with_calculated_gpids AS (
SELECT
-- if global_pid is NULL, it is most likely that a backend is blocked on a DDL
-- also for legacy reasons citus_internal_global_blocked_processes() returns groupId, we replace that with nodeIds
case WHEN waiting_global_pid !=0 THEN waiting_global_pid ELSE citus_calculate_gpid(get_nodeid_for_groupid(waiting_node_id), waiting_pid) END waiting_global_pid,
case WHEN blocking_global_pid !=0 THEN blocking_global_pid ELSE citus_calculate_gpid(get_nodeid_for_groupid(blocking_node_id), blocking_pid) END blocking_global_pid,
-- citus_internal_global_blocked_processes returns groupId, we replace it here with actual
-- nodeId to be consisten with the other views
get_nodeid_for_groupid(blocking_node_id) as blocking_node_id,
get_nodeid_for_groupid(waiting_node_id) as waiting_node_id,
blocking_transaction_waiting
FROM citus_internal_global_blocked_processes()
),
unique_global_wait_edges AS
(
SELECT DISTINCT ON(waiting_global_pid, blocking_global_pid) * FROM citus_internal_global_blocked_processes()
SELECT DISTINCT ON(waiting_global_pid, blocking_global_pid) * FROM unique_global_wait_edges_with_calculated_gpids
),
citus_dist_stat_activity_with_calculated_gpids AS
(
-- if global_pid is NULL, it is most likely that a backend is blocked on a DDL
SELECT CASE WHEN global_pid != 0 THEN global_pid ELSE citus_calculate_gpid(nodeid, pid) END global_pid, nodeid, pid, query FROM citus_dist_stat_activity
)
SELECT
waiting.global_pid as waiting_gpid,
blocking.global_pid as blocking_gpid,
waiting.pid AS waiting_pid,
blocking.pid AS blocking_pid,
waiting.query AS blocked_statement,
blocking.query AS current_statement_in_blocking_process,
waiting.nodeid AS waiting_nodeid,
blocking.nodeid AS blocking_nodeid
waiting.global_pid as waiting_gpid,
blocking.global_pid as blocking_gpid,
waiting.query AS blocked_statement,
blocking.query AS current_statement_in_blocking_process,
waiting.nodeid AS waiting_nodeid,
blocking.nodeid AS blocking_nodeid
FROM
unique_global_wait_edges
JOIN
citus_dist_stat_activity waiting ON (unique_global_wait_edges.waiting_global_pid = waiting.global_pid)
JOIN
citus_dist_stat_activity blocking ON (unique_global_wait_edges.blocking_global_pid = blocking.global_pid);
unique_global_wait_edges
JOIN
citus_dist_stat_activity_with_calculated_gpids waiting ON (unique_global_wait_edges.waiting_global_pid = waiting.global_pid)
JOIN
citus_dist_stat_activity_with_calculated_gpids blocking ON (unique_global_wait_edges.blocking_global_pid = blocking.global_pid);
ALTER VIEW citus.citus_lock_waits SET SCHEMA pg_catalog;
GRANT SELECT ON pg_catalog.citus_lock_waits TO PUBLIC;

View File

@ -2,25 +2,44 @@ SET search_path = 'pg_catalog';
CREATE VIEW citus.citus_lock_waits AS
WITH
unique_global_wait_edges_with_calculated_gpids AS (
SELECT
-- if global_pid is NULL, it is most likely that a backend is blocked on a DDL
-- also for legacy reasons citus_internal_global_blocked_processes() returns groupId, we replace that with nodeIds
case WHEN waiting_global_pid !=0 THEN waiting_global_pid ELSE citus_calculate_gpid(get_nodeid_for_groupid(waiting_node_id), waiting_pid) END waiting_global_pid,
case WHEN blocking_global_pid !=0 THEN blocking_global_pid ELSE citus_calculate_gpid(get_nodeid_for_groupid(blocking_node_id), blocking_pid) END blocking_global_pid,
-- citus_internal_global_blocked_processes returns groupId, we replace it here with actual
-- nodeId to be consisten with the other views
get_nodeid_for_groupid(blocking_node_id) as blocking_node_id,
get_nodeid_for_groupid(waiting_node_id) as waiting_node_id,
blocking_transaction_waiting
FROM citus_internal_global_blocked_processes()
),
unique_global_wait_edges AS
(
SELECT DISTINCT ON(waiting_global_pid, blocking_global_pid) * FROM citus_internal_global_blocked_processes()
SELECT DISTINCT ON(waiting_global_pid, blocking_global_pid) * FROM unique_global_wait_edges_with_calculated_gpids
),
citus_dist_stat_activity_with_calculated_gpids AS
(
-- if global_pid is NULL, it is most likely that a backend is blocked on a DDL
SELECT CASE WHEN global_pid != 0 THEN global_pid ELSE citus_calculate_gpid(nodeid, pid) END global_pid, nodeid, pid, query FROM citus_dist_stat_activity
)
SELECT
waiting.global_pid as waiting_gpid,
blocking.global_pid as blocking_gpid,
waiting.pid AS waiting_pid,
blocking.pid AS blocking_pid,
waiting.query AS blocked_statement,
blocking.query AS current_statement_in_blocking_process,
waiting.nodeid AS waiting_nodeid,
blocking.nodeid AS blocking_nodeid
waiting.global_pid as waiting_gpid,
blocking.global_pid as blocking_gpid,
waiting.query AS blocked_statement,
blocking.query AS current_statement_in_blocking_process,
waiting.nodeid AS waiting_nodeid,
blocking.nodeid AS blocking_nodeid
FROM
unique_global_wait_edges
JOIN
citus_dist_stat_activity waiting ON (unique_global_wait_edges.waiting_global_pid = waiting.global_pid)
JOIN
citus_dist_stat_activity blocking ON (unique_global_wait_edges.blocking_global_pid = blocking.global_pid);
unique_global_wait_edges
JOIN
citus_dist_stat_activity_with_calculated_gpids waiting ON (unique_global_wait_edges.waiting_global_pid = waiting.global_pid)
JOIN
citus_dist_stat_activity_with_calculated_gpids blocking ON (unique_global_wait_edges.blocking_global_pid = blocking.global_pid);
ALTER VIEW citus.citus_lock_waits SET SCHEMA pg_catalog;
GRANT SELECT ON pg_catalog.citus_lock_waits TO PUBLIC;

View File

@ -188,7 +188,7 @@ run_commands_on_session_level_connection_to_node(PG_FUNCTION_ARGS)
/*
* override_backend_data_command_originator is a wrapper around
* OverrideBackendDataDistributedCommandOriginator().
* SetBackendDataDistributedCommandOriginator().
*/
Datum
override_backend_data_command_originator(PG_FUNCTION_ARGS)
@ -197,7 +197,7 @@ override_backend_data_command_originator(PG_FUNCTION_ARGS)
bool distributedCommandOriginator = PG_GETARG_BOOL(0);
OverrideBackendDataDistributedCommandOriginator(distributedCommandOriginator);
SetBackendDataDistributedCommandOriginator(distributedCommandOriginator);
PG_RETURN_VOID();
}

View File

@ -721,6 +721,7 @@ UnSetGlobalPID(void)
MyBackendData->globalPID = 0;
MyBackendData->databaseId = 0;
MyBackendData->userId = 0;
MyBackendData->distributedCommandOriginator = false;
SpinLockRelease(&MyBackendData->mutex);
}
@ -850,12 +851,16 @@ AssignGlobalPID(void)
/*
* OverrideBackendDataDistributedCommandOriginator should only be used for isolation testing.
* See how it is used in the relevant functions.
* SetBackendDataDistributedCommandOriginator is used to set the distributedCommandOriginator
* field on MyBackendData.
*/
void
OverrideBackendDataDistributedCommandOriginator(bool distributedCommandOriginator)
SetBackendDataDistributedCommandOriginator(bool distributedCommandOriginator)
{
if (!MyBackendData)
{
return;
}
SpinLockAcquire(&MyBackendData->mutex);
MyBackendData->distributedCommandOriginator =
distributedCommandOriginator;

View File

@ -55,8 +55,8 @@ extern void UnSetGlobalPID(void);
extern void AssignDistributedTransactionId(void);
extern void AssignGlobalPID(void);
extern uint64 GetGlobalPID(void);
extern void OverrideBackendDataDistributedCommandOriginator(bool
distributedCommandOriginator);
extern void SetBackendDataDistributedCommandOriginator(bool
distributedCommandOriginator);
extern uint64 ExtractGlobalPID(char *applicationName);
extern int ExtractNodeIdFromGlobalPID(uint64 globalPID, bool missingOk);
extern int ExtractProcessIdFromGlobalPID(uint64 globalPID);

View File

@ -1,4 +1,4 @@
Parsed test spec with 4 sessions
Parsed test spec with 8 sessions
starting permutation: s1-begin s1-update-ref-table-from-coordinator s2-start-session-level-connection s2-begin-on-worker s2-update-ref-table s3-select-distributed-waiting-queries s1-commit s2-commit-worker s2-stop-connection
step s1-begin:
@ -61,7 +61,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -157,7 +157,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -253,7 +253,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -349,7 +349,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -445,7 +445,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -539,7 +539,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -635,7 +635,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -729,7 +729,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -823,7 +823,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -919,7 +919,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -986,7 +986,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -1018,7 +1018,7 @@ step s1-commit:
COMMIT;
step s2-update-on-the-coordinator: <... completed>
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -1114,7 +1114,7 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
@ -1194,13 +1194,13 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
citus_remove_node
---------------------------------------------------------------------
(1 row)
starting permutation: s1-begin s1-update-ref-table-from-coordinator s2-start-session-level-connection s2-update-ref-table s3-select-distributed-waiting-queries s1-commit s2-stop-connection
starting permutation: s1-begin s1-update-ref-table-from-coordinator s2-start-session-level-connection s2-update-ref-table s3-select-distributed-waiting-queries s1-commit s2-stop-connection s5-begin s5-alter s6-select s3-select-distributed-waiting-queries s3-show-actual-gpids s5-rollback s8-begin s8-select s7-alter s3-select-distributed-waiting-queries s3-show-actual-gpids s8-rollback
step s1-begin:
BEGIN;
@ -1245,7 +1245,87 @@ stop_session_level_connection_to_node
(1 row)
restore_isolation_tester_func
step s5-begin:
BEGIN;
step s5-alter:
ALTER TABLE tt1 ADD COLUMN new_column INT;
step s6-select:
SELECT user_id FROM tt1 ORDER BY user_id DESC LIMIT 1;
<waiting ...>
step s3-select-distributed-waiting-queries:
SELECT blocked_statement, current_statement_in_blocking_process FROM citus_lock_waits WHERE blocked_statement NOT ILIKE '%run_commands_on_session_level_connection_to_node%' AND current_statement_in_blocking_process NOT ILIKE '%run_commands_on_session_level_connection_to_node%';
blocked_statement |current_statement_in_blocking_process
---------------------------------------------------------------------
SELECT user_id FROM tt1 ORDER BY user_id DESC LIMIT 1;
|
ALTER TABLE tt1 ADD COLUMN new_column INT;
(1 row)
step s3-show-actual-gpids:
SELECT global_pid > 0 as gpid_exists, query FROM citus_stat_activity WHERE state = 'active' AND query IN (SELECT blocked_statement FROM citus_lock_waits UNION SELECT current_statement_in_blocking_process FROM citus_lock_waits) ORDER BY 1 DESC;
gpid_exists|query
---------------------------------------------------------------------
f |
SELECT user_id FROM tt1 ORDER BY user_id DESC LIMIT 1;
(1 row)
step s5-rollback:
ROLLBACK;
step s6-select: <... completed>
user_id
---------------------------------------------------------------------
7
(1 row)
step s8-begin:
BEGIN;
step s8-select:
SELECT user_id FROM tt1 ORDER BY user_id DESC LIMIT 1;
user_id
---------------------------------------------------------------------
7
(1 row)
step s7-alter:
ALTER TABLE tt1 ADD COLUMN new_column INT;
<waiting ...>
step s3-select-distributed-waiting-queries:
SELECT blocked_statement, current_statement_in_blocking_process FROM citus_lock_waits WHERE blocked_statement NOT ILIKE '%run_commands_on_session_level_connection_to_node%' AND current_statement_in_blocking_process NOT ILIKE '%run_commands_on_session_level_connection_to_node%';
blocked_statement |current_statement_in_blocking_process
---------------------------------------------------------------------
ALTER TABLE tt1 ADD COLUMN new_column INT;
|
SELECT user_id FROM tt1 ORDER BY user_id DESC LIMIT 1;
(1 row)
step s3-show-actual-gpids:
SELECT global_pid > 0 as gpid_exists, query FROM citus_stat_activity WHERE state = 'active' AND query IN (SELECT blocked_statement FROM citus_lock_waits UNION SELECT current_statement_in_blocking_process FROM citus_lock_waits) ORDER BY 1 DESC;
gpid_exists|query
---------------------------------------------------------------------
t |
ALTER TABLE tt1 ADD COLUMN new_column INT;
(1 row)
step s8-rollback:
ROLLBACK;
step s7-alter: <... completed>
citus_remove_node
---------------------------------------------------------------------
(1 row)

View File

@ -1014,6 +1014,7 @@ SELECT * FROM multi_extension.print_extension_changes();
| function citus_shards_on_worker() SETOF record
| function citus_stat_activity() SETOF record
| function create_distributed_function(regprocedure,text,text,boolean) void
| function get_nodeid_for_groupid(integer) integer
| function pg_cancel_backend(bigint) boolean
| function pg_terminate_backend(bigint,bigint) boolean
| function run_command_on_all_nodes(text,boolean,boolean) SETOF record
@ -1023,7 +1024,7 @@ SELECT * FROM multi_extension.print_extension_changes();
| function worker_partition_query_result(text,text,integer,citus.distribution_type,text[],text[],boolean,boolean,boolean) SETOF record
| table pg_dist_object
| view citus_stat_activity
(35 rows)
(36 rows)
DROP TABLE multi_extension.prev_objects, multi_extension.extension_diff;
-- show running version

View File

@ -146,6 +146,7 @@ ORDER BY 1;
function get_current_transaction_id()
function get_global_active_transactions()
function get_missing_time_partition_ranges(regclass,interval,timestamp with time zone,timestamp with time zone)
function get_nodeid_for_groupid(integer)
function get_rebalance_progress()
function get_rebalance_table_shards_plan(regclass,real,integer,bigint[],boolean,name,real)
function get_shard_id_for_distribution_column(regclass,"any")
@ -277,5 +278,5 @@ ORDER BY 1;
view citus_stat_statements
view pg_dist_shard_placement
view time_partitions
(261 rows)
(262 rows)

View File

@ -1,6 +1,7 @@
#include "isolation_mx_common.include.spec"
setup {
SELECT citus_add_node('localhost', 57636, groupid:=0);
CREATE TABLE ref_table(user_id int, value_1 int);
SELECT create_reference_table('ref_table');
INSERT INTO ref_table VALUES (1, 11), (2, 21), (3, 31), (4, 41), (5, 51), (6, 61), (7, 71);
@ -17,6 +18,7 @@ teardown
DROP TABLE ref_table;
DROP TABLE tt1;
SELECT citus_internal.restore_isolation_tester_func();
SELECT citus_remove_node('localhost', 57636);
}
session "s1"
@ -162,6 +164,12 @@ step "s3-select-distributed-waiting-queries"
SELECT blocked_statement, current_statement_in_blocking_process FROM citus_lock_waits WHERE blocked_statement NOT ILIKE '%run_commands_on_session_level_connection_to_node%' AND current_statement_in_blocking_process NOT ILIKE '%run_commands_on_session_level_connection_to_node%';
}
// only works for the coordinator
step "s3-show-actual-gpids"
{
SELECT global_pid > 0 as gpid_exists, query FROM citus_stat_activity WHERE state = 'active' AND query IN (SELECT blocked_statement FROM citus_lock_waits UNION SELECT current_statement_in_blocking_process FROM citus_lock_waits) ORDER BY 1 DESC;
}
// session s1 and s4 executes the commands on the same worker node
session "s4"
@ -189,6 +197,59 @@ step "s4-commit-worker"
SELECT run_commands_on_session_level_connection_to_node('COMMIT');
}
// on the coordinator, show that even if a backend is blocked on a DDL as the first command
// (e.g., as of today global pid has not been assigned), we can still show the blocking activity
// we use the following 4 sessions 5,6,7,8 for this purpose
session "s5"
step "s5-begin"
{
BEGIN;
}
step "s5-alter"
{
ALTER TABLE tt1 ADD COLUMN new_column INT;
}
step "s5-rollback"
{
ROLLBACK;
}
session "s6"
step "s6-select"
{
SELECT user_id FROM tt1 ORDER BY user_id DESC LIMIT 1;
}
session "s7"
step "s7-alter"
{
ALTER TABLE tt1 ADD COLUMN new_column INT;
}
session "s8"
step "s8-begin"
{
BEGIN;
}
step "s8-select"
{
SELECT user_id FROM tt1 ORDER BY user_id DESC LIMIT 1;
}
step "s8-rollback"
{
ROLLBACK;
}
permutation "s1-begin" "s1-update-ref-table-from-coordinator" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-update-ref-table" "s3-select-distributed-waiting-queries" "s1-commit" "s2-commit-worker" "s2-stop-connection"
permutation "s1-start-session-level-connection" "s1-begin-on-worker" "s1-update-ref-table" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-update-ref-table" "s3-select-distributed-waiting-queries" "s1-commit-worker" "s2-commit-worker" "s1-stop-connection" "s2-stop-connection"
permutation "s1-start-session-level-connection" "s1-begin-on-worker" "s1-update-dist-table" "s2-start-session-level-connection" "s2-begin-on-worker" "s2-update-dist-table" "s3-select-distributed-waiting-queries" "s1-commit-worker" "s2-commit-worker" "s1-stop-connection" "s2-stop-connection"
@ -212,3 +273,10 @@ permutation "s1-start-session-level-connection" "s1-begin-on-worker" "s1-update-
// we can find the blocking relationship
permutation "s1-start-session-level-connection" "s1-begin-on-worker" "s1-update-dist-table-id-1" "s2-start-session-level-connection" "s2-update-dist-table-id-1" "s3-select-distributed-waiting-queries" "s1-commit-worker" "s1-stop-connection" "s2-stop-connection"
permutation "s1-begin" "s1-update-ref-table-from-coordinator" "s2-start-session-level-connection" "s2-update-ref-table" "s3-select-distributed-waiting-queries" "s1-commit" "s2-stop-connection"
// show that we can see blocking activity even if these are the first commands in the sessions
// such that global_pids have not been assigned
// in the second permutation, s3-show-actual-gpids shows the gpid for ALTER TABLE
// because ALTER TABLE is not blocked on the parser but during the execution (hence gpid already asssigned)
"s5-begin" "s5-alter" "s6-select" "s3-select-distributed-waiting-queries" "s3-show-actual-gpids" "s5-rollback"
"s8-begin" "s8-select" "s7-alter" "s3-select-distributed-waiting-queries" "s3-show-actual-gpids" "s8-rollback"