Order common values with the same frequency, and add a test (#8167)

Added a test similar to the one @colm-mchugh ran in the original PR: https://github.com/citusdata/citus/pull/8026#discussion_r2279021218
2025-08-29 01:41:32 +03:00 · 2025-08-29 01:41:32 +03:00 · f79dd61a92
parent 274504465d
commit f79dd61a92
4 changed files with 107 additions and 3 deletions
--- a/src/backend/distributed/sql/udfs/citus_stats/13.2-1.sql
+++ b/src/backend/distributed/sql/udfs/citus_stats/13.2-1.sql
@ -58,7 +58,7 @@ common_val_occurrence AS (
            sum(common_freq * shard_reltuples)::bigint AS occurrence
    FROM most_common_vals m
    GROUP BY citus_table, m.attname, common_val
-    ORDER BY 1, 2, occurrence DESC)
+    ORDER BY 1, 2, occurrence DESC, 3)

 SELECT nsp.nspname AS schemaname, p.relname AS tablename, c.attname,

--- a/src/backend/distributed/sql/udfs/citus_stats/latest.sql
+++ b/src/backend/distributed/sql/udfs/citus_stats/latest.sql
@ -58,7 +58,7 @@ common_val_occurrence AS (
            sum(common_freq * shard_reltuples)::bigint AS occurrence
    FROM most_common_vals m
    GROUP BY citus_table, m.attname, common_val
-    ORDER BY 1, 2, occurrence DESC)
+    ORDER BY 1, 2, occurrence DESC, 3)

 SELECT nsp.nspname AS schemaname, p.relname AS tablename, c.attname,

--- a/src/test/regress/expected/citus_aggregated_stats.out
+++ b/src/test/regress/expected/citus_aggregated_stats.out
@ -134,9 +134,70 @@ SELECT attname, null_frac, most_common_vals, most_common_freqs FROM citus_stats
 id      |         0 | {1}              | {1}
 (1 row)

+-- more real-world scenario:
+-- outputs of pg_stats and citus_stats are NOT the same
+-- but citus_stats does a fair estimation job
+SELECT setseed(0.42);
+ setseed
+---------------------------------------------------------------------
+
+(1 row)
+
+CREATE TABLE orders (id bigint , custid int, product text, quantity int);
+INSERT INTO orders(id, custid, product, quantity)
+SELECT i, (random() * 100)::int, 'product' || (random() * 10)::int, NULL
+FROM generate_series(1,11) d(i);
+-- frequent customer
+INSERT INTO orders(id, custid, product, quantity)
+SELECT 1200, 17, 'product' || (random() * 10)::int, NULL
+FROM generate_series(1, 57) sk(i);
+-- popular product
+INSERT INTO orders(id, custid, product, quantity)
+SELECT i+100 % 17, NULL, 'product3', (random() * 40)::int
+FROM generate_series(1, 37) sk(i);
+-- frequent customer
+INSERT INTO orders(id, custid, product, quantity)
+SELECT 1390, 76, 'product' || ((random() * 20)::int % 3), (random() * 30)::int
+FROM generate_series(1, 33) sk(i);
+ANALYZE orders;
+-- pg_stats
+SELECT schemaname, tablename, attname, null_frac, most_common_vals, most_common_freqs FROM pg_stats
+  WHERE tablename IN ('orders')
+ORDER BY 3;
+       schemaname       | tablename | attname  | null_frac |                                       most_common_vals                                       |                                                                                                       most_common_freqs
+---------------------------------------------------------------------
+ citus_aggregated_stats | orders    | custid   |  0.268116 | {17,76}                                                                                      | {0.413043,0.23913}
+ citus_aggregated_stats | orders    | id       |         0 | {1200,1390}                                                                                  | {0.413043,0.23913}
+ citus_aggregated_stats | orders    | product  |         0 | {product3,product2,product0,product1,product9,product4,product8,product5,product10,product6} | {0.347826,0.15942,0.115942,0.108696,0.0652174,0.057971,0.0507246,0.0362319,0.0289855,0.0289855}
+ citus_aggregated_stats | orders    | quantity |  0.492754 | {26,23,6,8,11,12,13,17,20,25,30,4,14,15,16,19,24,27,35,36,38,40}                             | {0.0362319,0.0289855,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928}
+(4 rows)
+
+SELECT create_distributed_table('orders', 'id');
+NOTICE:  Copying data from local table...
+NOTICE:  copying the data has completed
+DETAIL:  The local data in the table is no longer visible, but is still on disk.
+HINT:  To remove the local data, run: SELECT truncate_local_data_after_distributing_table($$citus_aggregated_stats.orders$$)
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+ANALYZE orders;
+-- citus_stats
+SELECT schemaname, tablename, attname, null_frac, most_common_vals, most_common_freqs FROM citus_stats
+  WHERE tablename IN ('orders')
+ORDER BY 3;
+       schemaname       | tablename | attname  | null_frac |                                       most_common_vals                                       |                                                                                        most_common_freqs
+---------------------------------------------------------------------
+ citus_aggregated_stats | orders    | custid   |  0.268116 | {17,76}                                                                                      | {0.413043,0.23913}
+ citus_aggregated_stats | orders    | id       |         0 | {1200,1390}                                                                                  | {0.413043,0.23913}
+ citus_aggregated_stats | orders    | product  |         0 | {product3,product2,product0,product1,product9,product4,product8,product5,product10,product6} | {0.347826,0.15942,0.115942,0.108696,0.0652174,0.057971,0.0507246,0.0362319,0.0289855,0.0289855}
+ citus_aggregated_stats | orders    | quantity |  0.492754 | {26,13,17,20,23,8,11,12,14,16,19,24,25,27,30,35,38,40,6}                                     | {0.0362319,0.0217391,0.0217391,0.0217391,0.0217391,0.0217391,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928,0.0144928}
+(4 rows)
+
 RESET SESSION AUTHORIZATION;
 DROP SCHEMA citus_aggregated_stats CASCADE;
-NOTICE:  drop cascades to 7 other objects
+NOTICE:  drop cascades to 8 other objects
 DETAIL:  drop cascades to table current_check
 drop cascades to table dist_current_check
 drop cascades to table ref_current_check
@ -144,4 +205,5 @@ drop cascades to table citus_local_current_check_1870003
 drop cascades to table ref_current_check_1870002
 drop cascades to table citus_local_current_check
 drop cascades to table organizations
+drop cascades to table orders
 DROP USER user1;
--- a/src/test/regress/sql/citus_aggregated_stats.sql
+++ b/src/test/regress/sql/citus_aggregated_stats.sql
@ -102,6 +102,48 @@ SELECT attname, null_frac, most_common_vals, most_common_freqs FROM citus_stats
  WHERE tablename IN ('organizations')
  ORDER BY 1;

+-- more real-world scenario:
+-- outputs of pg_stats and citus_stats are NOT the same
+-- but citus_stats does a fair estimation job
+
+SELECT setseed(0.42);
+
+CREATE TABLE orders (id bigint , custid int, product text, quantity int);
+
+INSERT INTO orders(id, custid, product, quantity)
+SELECT i, (random() * 100)::int, 'product' || (random() * 10)::int, NULL
+FROM generate_series(1,11) d(i);
+
+-- frequent customer
+INSERT INTO orders(id, custid, product, quantity)
+SELECT 1200, 17, 'product' || (random() * 10)::int, NULL
+FROM generate_series(1, 57) sk(i);
+
+-- popular product
+INSERT INTO orders(id, custid, product, quantity)
+SELECT i+100 % 17, NULL, 'product3', (random() * 40)::int
+FROM generate_series(1, 37) sk(i);
+
+-- frequent customer
+INSERT INTO orders(id, custid, product, quantity)
+SELECT 1390, 76, 'product' || ((random() * 20)::int % 3), (random() * 30)::int
+FROM generate_series(1, 33) sk(i);
+
+ANALYZE orders;
+
+-- pg_stats
+SELECT schemaname, tablename, attname, null_frac, most_common_vals, most_common_freqs FROM pg_stats
+  WHERE tablename IN ('orders')
+ORDER BY 3;
+
+SELECT create_distributed_table('orders', 'id');
+ANALYZE orders;
+
+-- citus_stats
+SELECT schemaname, tablename, attname, null_frac, most_common_vals, most_common_freqs FROM citus_stats
+  WHERE tablename IN ('orders')
+ORDER BY 3;
+
 RESET SESSION AUTHORIZATION;
 DROP SCHEMA citus_aggregated_stats CASCADE;
 DROP USER user1;