[INFRA] Add structured logging linter script to github workflow (#3840)

#### Which Delta project/connector is this regarding?  - [ ] Spark - [ ] Standalone - [ ] Flink - [ ] Kernel - [x] Other (Infra) ## Description  Add a python linter script to enforce new code to adhere to structured logging. There is a similar effort in Spark: apache/spark#47239. ## How was this patch tested?  github workflow passed. ## Does this PR introduce _any_ user-facing changes?  No
delta-io · Nov 5, 2024 · c60437b · c60437b
1 parent 796f518
commit c60437b
Show file tree

Hide file tree

Showing 2 changed files with 119 additions and 0 deletions.
diff --git a/.github/workflows/spark_test.yaml b/.github/workflows/spark_test.yaml
@@ -80,6 +80,12 @@ jobs:
           pipenv run pip install pyarrow==8.0.0
           pipenv run pip install numpy==1.20.3
         if: steps.git-diff.outputs.diff
+      - name: Scala structured logging style check
+        run: |
+          if [ -f ./dev/spark_structured_logging_style.py ]; then
+              python3 ./dev/spark_structured_logging_style.py
+          fi
+        if: steps.git-diff.outputs.diff
       - name: Run Scala/Java and Python tests
         # when changing TEST_PARALLELISM_COUNT make sure to also change it in spark_master_test.yaml
         run: |

diff --git a/dev/spark_structured_logging_style.py b/dev/spark_structured_logging_style.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Copyright (2021) The Delta Lake Project Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import sys
+import re
+import glob
+
+
+def main():
+    log_pattern = r"log(?:Info|Warning|Error)\(.*?\)\n"
+    inner_log_pattern = r'".*?"\.format\(.*\)|s?".*?(?:\$|\"\+(?!s?")).*|[^"]+\+\s*".*?"'
+    compiled_inner_log_pattern = re.compile(inner_log_pattern)
+
+    # Regex patterns for file paths to exclude from the Structured Logging style check
+    excluded_file_patterns = [
+        "[Tt]est",
+        "sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen"
+        "/CodeGenerator.scala",
+        "streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala",
+        "sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver"
+        "/SparkSQLCLIService.scala",
+        "core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala",
+    ]
+
+    nonmigrated_files = {}
+
+    target_directories = ["../spark", "../iceberg", "../hudi"]
+    scala_files = []
+    for directory in target_directories:
+        scala_files.extend(glob.glob(os.path.join(directory, "**", "*.scala"), recursive=True))
+
+    for file in scala_files:
+        skip_file = False
+        for exclude_pattern in excluded_file_patterns:
+            if re.search(exclude_pattern, file):
+                skip_file = True
+                break
+
+        if not skip_file and not os.path.isdir(file):
+            with open(file, "r") as f:
+                content = f.read()
+
+                log_statements = re.finditer(log_pattern, content, re.DOTALL)
+
+                if log_statements:
+                    nonmigrated_files[file] = []
+                    for log_statement in log_statements:
+                        log_statement_str = log_statement.group(0).strip()
+                        # trim first ( and last )
+                        first_paren_index = log_statement_str.find("(")
+                        inner_log_statement = re.sub(
+                            r"\s+", "", log_statement_str[first_paren_index + 1 : -1]
+                        )
+
+                        if compiled_inner_log_pattern.fullmatch(inner_log_statement):
+                            start_pos = log_statement.start()
+                            preceding_content = content[:start_pos]
+                            line_number = preceding_content.count("\n") + 1
+                            start_char = start_pos - preceding_content.rfind("\n") - 1
+                            nonmigrated_files[file].append((line_number, start_char))
+
+    if all(len(issues) == 0 for issues in nonmigrated_files.values()):
+        print("Structured logging style check passed.", file=sys.stderr)
+        sys.exit(0)
+    else:
+        for file_path, issues in nonmigrated_files.items():
+            for line_number, start_char in issues:
+                print(f"[error] {file_path}:{line_number}:{start_char}", file=sys.stderr)
+                print(
+                    "[error]\tPlease use the Structured Logging Framework for logging messages "
+                    'with variables. For example: log"...${{MDC(TASK_ID, taskId)}}..."'
+                    "\n\tRefer to the guidelines in the file `shims/LoggingShims.scala`.",
+                    file=sys.stderr,
+                )
+
+        sys.exit(-1)
+
+
+if __name__ == "__main__":
+    main()