From 5544c65cf6fbcdc7ccdf380eb9f6484f8aa2e1ae Mon Sep 17 00:00:00 2001
From: Nils Dijk <nils@citusdata.com>
Date: Thu, 23 Nov 2023 18:19:54 +0100
Subject: [PATCH] Sort includes (#7326)

CHERRY-PICK NOTES: This cherry-pick only includes the scripts, not the
actual changes. These are done in a follow up commit to ease further
backporting.

This change adds a script to programatically group all includes in a
specific order. The script was used as a one time invocation to group
and sort all includes throught our formatted code. The grouping is as
follows:

 - System includes (eg. `#include<...>`)
 - Postgres.h (eg. `#include "postgres.h"`)
- Toplevel imports from postgres, not contained in a directory (eg.
`#include "miscadmin.h"`)
 - General postgres includes (eg . `#include "nodes/..."`)
- Toplevel citus includes, not contained in a directory (eg. `#include
"citus_verion.h"`)
 - Columnar includes (eg. `#include "columnar/..."`)
 - Distributed includes (eg. `#include "distributed/..."`)

Because it is quite hard to understand the difference between toplevel
citus includes and toplevel postgres includes it hardcodes the list of
toplevel citus includes. In the same manner it assumes anything not
prefixed with `columnar/` or `distributed/` as a postgres include.

The sorting/grouping is enforced by CI. Since we do so with our own
script there are not changes required in our uncrustify configuration.

(cherry picked from commit 0620c8f9a6f37bacf9f2e709ccca3c27f6dfbaad)
---
 .github/workflows/build_and_test.yml |   2 +
 ci/README.md                         |  15 +++
 ci/fix_style.sh                      |   1 +
 ci/include_grouping.py               | 157 +++++++++++++++++++++++++++
 ci/sort_and_group_includes.sh        |  12 ++
 5 files changed, 187 insertions(+)
 create mode 100755 ci/include_grouping.py
 create mode 100755 ci/sort_and_group_includes.sh
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index f7e125cf1..840814f7f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -71,6 +71,8 @@ jobs:
       run: ci/editorconfig.sh && git diff --exit-code
     - name: Remove useless declarations
       run: ci/remove_useless_declarations.sh && git diff --cached --exit-code
+    - name: Sort and group includes
+      run: ci/sort_and_group_includes.sh && git diff --exit-code
     - name: Normalize test output
       run: ci/normalize_expected.sh && git diff --exit-code
     - name: Check for C-style comments in migration files
diff --git a/ci/README.md b/ci/README.md
index 37ef94f4f..b8dad35ac 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -385,3 +385,18 @@ definitions are in alphabetical order.
 ## `print_stack_trace.sh`
 
 This script prints stack traces for failed tests, if they left core files.
+
+## `sort_and_group_includes.sh`
+
+This script checks and fixes issues with include grouping and sorting in C files.
+
+Includes are grouped in the following groups:
+ - System includes (eg. `#include <math>`)
+ - Postgres.h include (eg. `#include "postgres.h"`)
+ - Toplevel postgres includes (includes not in a directory eg. `#include "miscadmin.h`)
+ - Postgres includes in a directory (eg. `#include "catalog/pg_type.h"`)
+ - Toplevel citus includes (includes not in a directory eg. `#include "pg_version_constants.h"`)
+ - Columnar includes (eg. `#include "columnar/columnar.h"`)
+ - Distributed includes (eg. `#include "distributed/maintenanced.h"`)
+
+Within every group the include lines are sorted alphabetically.
diff --git a/ci/fix_style.sh b/ci/fix_style.sh
index 3d6e7ae83..bb78d5f50 100755
--- a/ci/fix_style.sh
+++ b/ci/fix_style.sh
@@ -19,3 +19,4 @@ ci/disallow_long_changelog_entries.sh
 ci/normalize_expected.sh
 ci/fix_gitignore.sh
 ci/print_stack_trace.sh
+ci/sort_and_group_includes.sh
diff --git a/ci/include_grouping.py b/ci/include_grouping.py
new file mode 100755
index 000000000..4b1370d61
--- /dev/null
+++ b/ci/include_grouping.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""
+easy command line to run against all citus-style checked files:
+
+$ git ls-files \
+  | git check-attr --stdin citus-style \
+  | grep 'citus-style: set' \
+  | awk '{print $1}' \
+  | cut -d':' -f1 \
+  | xargs -n1 ./ci/include_grouping.py
+"""
+
+import collections
+import os
+import sys
+
+
+def main(args):
+    if len(args) < 2:
+        print("Usage: include_grouping.py <file>")
+        return
+
+    file = args[1]
+    if not os.path.isfile(file):
+        sys.exit(f"File '{file}' does not exist")
+
+    with open(file, "r") as in_file:
+        with open(file + ".tmp", "w") as out_file:
+            includes = []
+            skipped_lines = []
+
+            # This calls print_sorted_includes on a set of consecutive #include lines.
+            # This implicitly keeps separation of any #include lines that are contained in
+            # an #ifdef, because it will order the #include lines inside and after the
+            # #ifdef completely separately.
+            for line in in_file:
+                # if a line starts with #include we don't want to print it yet, instead we
+                # want to collect all consecutive #include lines
+                if line.startswith("#include"):
+                    includes.append(line)
+                    skipped_lines = []
+                    continue
+
+                # if we have collected any #include lines, we want to print them sorted
+                # before printing the current line. However, if the current line is empty
+                # we want to perform a lookahead to see if the next line is an #include.
+                # To maintain any separation between #include lines and their subsequent
+                # lines we keep track of all lines we have skipped inbetween.
+                if len(includes) > 0:
+                    if len(line.strip()) == 0:
+                        skipped_lines.append(line)
+                        continue
+
+                    # we have includes that need to be grouped before printing the current
+                    # line.
+                    print_sorted_includes(includes, file=out_file)
+                    includes = []
+
+                    # print any skipped lines
+                    print("".join(skipped_lines), end="", file=out_file)
+                    skipped_lines = []
+
+                print(line, end="", file=out_file)
+
+    # move out_file to file
+    os.rename(file + ".tmp", file)
+
+
+def print_sorted_includes(includes, file=sys.stdout):
+    default_group_key = 1
+    groups = collections.defaultdict(set)
+
+    # define the groups that we separate correctly. The matchers are tested in the order
+    # of their priority field. The first matcher that matches the include is used to
+    # assign the include to a group.
+    # The groups are printed in the order of their group_key.
+    matchers = [
+        {
+            "name": "system includes",
+            "matcher": lambda x: x.startswith("<"),
+            "group_key": -2,
+            "priority": 0,
+        },
+        {
+            "name": "toplevel postgres includes",
+            "matcher": lambda x: "/" not in x,
+            "group_key": 0,
+            "priority": 9,
+        },
+        {
+            "name": "postgres.h",
+            "matcher": lambda x: x.strip() in ['"postgres.h"'],
+            "group_key": -1,
+            "priority": -1,
+        },
+        {
+            "name": "toplevel citus inlcudes",
+            "matcher": lambda x: x.strip()
+            in [
+                '"citus_version.h"',
+                '"pg_version_compat.h"',
+                '"pg_version_constants.h"',
+            ],
+            "group_key": 3,
+            "priority": 0,
+        },
+        {
+            "name": "columnar includes",
+            "matcher": lambda x: x.startswith('"columnar/'),
+            "group_key": 4,
+            "priority": 1,
+        },
+        {
+            "name": "distributed includes",
+            "matcher": lambda x: x.startswith('"distributed/'),
+            "group_key": 5,
+            "priority": 1,
+        },
+    ]
+    matchers.sort(key=lambda x: x["priority"])
+
+    # throughout our codebase we have some includes where either postgres or citus
+    # includes are wrongfully included with the syntax for system includes. Before we
+    # try to match those we will change the <> to "" to make them match our system. This
+    # will also rewrite the include to the correct syntax.
+    common_system_include_error_prefixes = ["<nodes/", "<distributed/"]
+
+    # assign every include to a group
+    for include in includes:
+        # extract the group key from the include
+        include_content = include.split(" ")[1]
+
+        # fix common system includes which are secretly postgres or citus includes
+        for common_prefix in common_system_include_error_prefixes:
+            if include_content.startswith(common_prefix):
+                include_content = '"' + include_content.strip()[1:-1] + '"'
+                include = include.split(" ")[0] + " " + include_content + "\n"
+                break
+
+        group_key = default_group_key
+        for matcher in matchers:
+            if matcher["matcher"](include_content):
+                group_key = matcher["group_key"]
+                break
+
+        groups[group_key].add(include)
+
+    # iterate over all groups in the natural order of its keys
+    for i, group in enumerate(sorted(groups.items())):
+        if i > 0:
+            print(file=file)
+        includes = group[1]
+        print("".join(sorted(includes)), end="", file=file)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/ci/sort_and_group_includes.sh b/ci/sort_and_group_includes.sh
new file mode 100755
index 000000000..1c3a91458
--- /dev/null
+++ b/ci/sort_and_group_includes.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -euo pipefail
+# shellcheck disable=SC1091
+source ci/ci_helpers.sh
+
+git ls-files \
+  | git check-attr --stdin citus-style \
+  | grep 'citus-style: set' \
+  | awk '{print $1}' \
+  | cut -d':' -f1 \
+  | xargs -n1 ./ci/include_grouping.py