Order same-frequency common values, and add a test (#8167)

Added a test similar to the one @colm-mchugh ran in the original PR:
https://github.com/citusdata/citus/pull/8026#discussion_r2279021218
release-13.2-naisila
Naisila Puka 2025-08-29 01:41:32 +03:00 committed by naisila
parent 274504465d
commit f79dd61a92
4 changed files with 107 additions and 3 deletions

View File

@ -58,7 +58,7 @@ common_val_occurrence AS (
sum(common_freq * shard_reltuples)::bigint AS occurrence
FROM most_common_vals m
GROUP BY citus_table, m.attname, common_val
ORDER BY 1, 2, occurrence DESC)
ORDER BY 1, 2, occurrence DESC, 3)
SELECT nsp.nspname AS schemaname, p.relname AS tablename, c.attname,

View File

@ -58,7 +58,7 @@ common_val_occurrence AS (
sum(common_freq * shard_reltuples)::bigint AS occurrence
FROM most_common_vals m
GROUP BY citus_table, m.attname, common_val
ORDER BY 1, 2, occurrence DESC)
ORDER BY 1, 2, occurrence DESC, 3)
SELECT nsp.nspname AS schemaname, p.relname AS tablename, c.attname,

View File

@ -134,9 +134,70 @@ SELECT attname, null_frac, most_common_vals, most_common_freqs FROM citus_stats
id | 0 | {1} | {1}
(1 row)
-- more real-world scenario:
-- outputs of pg_stats and citus_stats are NOT the same
-- but citus_stats does a fair estimation job
SELECT setseed(0.42);
setseed
---------------------------------------------------------------------
(1 row)
CREATE TABLE orders (id bigint , custid int, product text, quantity int);
INSERT INTO orders(id, custid, product, quantity)
SELECT i, (random() * 100)::int, 'product' || (random() * 10)::int, NULL
FROM generate_series(1,11) d(i);
-- frequent customer
INSERT INTO orders(id, custid, product, quantity)
SELECT 1200, 17, 'product' || (random() * 10)::int, NULL
FROM generate_series(1, 57) sk(i);
-- popular product
INSERT INTO orders(id, custid, product, quantity)
SELECT i+100 % 17, NULL, 'product3', (random() * 40)::int
FROM generate_series(1, 37) sk(i);
-- frequent customer
INSERT INTO orders(id, custid, product, quantity)
SELECT 1390, 76, 'product' || ((random() * 20)::int % 3), (random() * 30)::int
FROM generate_series(1, 33) sk(i);
ANALYZE orders;
-- pg_stats
SELECT schemaname, tablename, attname, null_frac, most_common_vals, most_common_freqs FROM pg_stats
WHERE tablename IN ('orders')
ORDER BY 3;
schemaname | tablename | attname | null_frac | most_common_vals | most_common_freqs
---------------------------------------------------------------------
citus_aggregated_stats | orders | custid | 0.268116 | {17,76} | {0.413043,0.23913}
citus_aggregated_stats | orders | id | 0 | {1200,1390} | {0.413043,0.23913}
citus_aggregated_stats | orders | product | 0 | {product3,product2,product0,product1,product9,product4,product8,product5,product10,product6} | {0.347826,0.15942,0.115942,0.108696,0.0652174,0.057971,0.0507246,0.0362319,0.0289855,0.0289855}
citus_aggregated_stats | orders | quantity | 0.492754 | {26,23,6,8,11,12,13,17,20,25,30,4,14,15,16,19,24,27,35,36,38,40} | {0.0362319,0.0289855,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928}
(4 rows)
SELECT create_distributed_table('orders', 'id');
NOTICE: Copying data from local table...
NOTICE: copying the data has completed
DETAIL: The local data in the table is no longer visible, but is still on disk.
HINT: To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$citus_aggregated_stats.orders$$)
create_distributed_table
---------------------------------------------------------------------
(1 row)
ANALYZE orders;
-- citus_stats
SELECT schemaname, tablename, attname, null_frac, most_common_vals, most_common_freqs FROM citus_stats
WHERE tablename IN ('orders')
ORDER BY 3;
schemaname | tablename | attname | null_frac | most_common_vals | most_common_freqs
---------------------------------------------------------------------
citus_aggregated_stats | orders | custid | 0.268116 | {17,76} | {0.413043,0.23913}
citus_aggregated_stats | orders | id | 0 | {1200,1390} | {0.413043,0.23913}
citus_aggregated_stats | orders | product | 0 | {product3,product2,product0,product1,product9,product4,product8,product5,product10,product6} | {0.347826,0.15942,0.115942,0.108696,0.0652174,0.057971,0.0507246,0.0362319,0.0289855,0.0289855}
citus_aggregated_stats | orders | quantity | 0.492754 | {26,13,17,20,23,8,11,12,14,16,19,24,25,27,30,35,38,40,6} | {0.0362319,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928}
(4 rows)
RESET SESSION AUTHORIZATION;
DROP SCHEMA citus_aggregated_stats CASCADE;
NOTICE: drop cascades to 7 other objects
NOTICE: drop cascades to 8 other objects
DETAIL: drop cascades to table current_check
drop cascades to table dist_current_check
drop cascades to table ref_current_check
@ -144,4 +205,5 @@ drop cascades to table citus_local_current_check_1870003
drop cascades to table ref_current_check_1870002
drop cascades to table citus_local_current_check
drop cascades to table organizations
drop cascades to table orders
DROP USER user1;

View File

@ -102,6 +102,48 @@ SELECT attname, null_frac, most_common_vals, most_common_freqs FROM citus_stats
WHERE tablename IN ('organizations')
ORDER BY 1;
-- more real-world scenario:
-- outputs of pg_stats and citus_stats are NOT the same
-- but citus_stats does a fair estimation job
-- seed random() so the rows generated below are reproducible between runs
SELECT setseed(0.42);
CREATE TABLE orders (id bigint , custid int, product text, quantity int);
-- baseline: 11 rows with ids 1..11, a random custid and product, and NULL quantity
INSERT INTO orders(id, custid, product, quantity)
SELECT i, (random() * 100)::int, 'product' || (random() * 10)::int, NULL
FROM generate_series(1,11) d(i);
-- frequent customer: 57 rows that all share id 1200 and custid 17, with NULL quantity
INSERT INTO orders(id, custid, product, quantity)
SELECT 1200, 17, 'product' || (random() * 10)::int, NULL
FROM generate_series(1, 57) sk(i);
-- popular product: 37 rows of 'product3' with NULL custid and a random quantity
-- NOTE(review): % binds tighter than +, so the id below is i + (100 % 17) = i + 15,
-- not (i + 100) % 17 -- confirm this is the intended id range
INSERT INTO orders(id, custid, product, quantity)
SELECT i+100 % 17, NULL, 'product3', (random() * 40)::int
FROM generate_series(1, 37) sk(i);
-- frequent customer: 33 rows that all share id 1390 and custid 76;
-- the product suffix is taken modulo 3, so only product0..product2 are generated
INSERT INTO orders(id, custid, product, quantity)
SELECT 1390, 76, 'product' || ((random() * 20)::int % 3), (random() * 30)::int
FROM generate_series(1, 33) sk(i);
-- collect statistics while orders is still a plain local table
ANALYZE orders;
-- pg_stats: per-column statistics of the local table, used as the reference point
SELECT schemaname, tablename, attname, null_frac, most_common_vals, most_common_freqs FROM pg_stats
WHERE tablename IN ('orders')
ORDER BY 3;
-- distribute the table on id, then re-run ANALYZE before reading citus_stats
SELECT create_distributed_table('orders', 'id');
ANALYZE orders;
-- citus_stats: the same columns read through the citus_stats view for the distributed table
SELECT schemaname, tablename, attname, null_frac, most_common_vals, most_common_freqs FROM citus_stats
WHERE tablename IN ('orders')
ORDER BY 3;
RESET SESSION AUTHORIZATION;
DROP SCHEMA citus_aggregated_stats CASCADE;
DROP USER user1;