49 changes: 48 additions & 1 deletion dev/lint-python
@@ -18,6 +18,8 @@
# define test binaries + versions
FLAKE8_BUILD="flake8"
MINIMUM_FLAKE8="3.9.0"
RUFF_BUILD="ruff"
MINIMUM_RUFF="0.14.0"
MINIMUM_MYPY="1.8.0"
MYPY_BUILD="mypy"
PYTEST_BUILD="pytest"
@@ -52,6 +54,9 @@ while (( "$#" )); do
--flake8)
FLAKE8_TEST=true
;;
--ruff)
RUFF_TEST=true
;;
--mypy)
MYPY_TEST=true
;;
@@ -69,7 +74,7 @@ while (( "$#" )); do
shift
done

if [[ -z "$COMPILE_TEST$BLACK_TEST$PYSPARK_CUSTOM_ERRORS_CHECK_TEST$FLAKE8_TEST$MYPY_TEST$MYPY_EXAMPLES_TEST$MYPY_DATA_TEST" ]]; then
if [[ -z "$COMPILE_TEST$BLACK_TEST$PYSPARK_CUSTOM_ERRORS_CHECK_TEST$FLAKE8_TEST$RUFF_TEST$MYPY_TEST$MYPY_EXAMPLES_TEST$MYPY_DATA_TEST" ]]; then
COMPILE_TEST=true
BLACK_TEST=true
PYSPARK_CUSTOM_ERRORS_CHECK_TEST=true
@@ -270,6 +275,45 @@ flake8 checks failed."
fi
}

function ruff_test {
local RUFF_VERSION=
local EXPECTED_RUFF=
local RUFF_REPORT=
local RUFF_STATUS=

if ! hash "$RUFF_BUILD" 2> /dev/null; then
echo "The ruff command was not found. Skipping for now."
return
fi

_RUFF_VERSION=($($RUFF_BUILD --version))
RUFF_VERSION="${_RUFF_VERSION[1]}"
EXPECTED_RUFF="$(satisfies_min_version $RUFF_VERSION $MINIMUM_RUFF)"

if [[ "$EXPECTED_RUFF" == "False" ]]; then
echo "\
The minimum ruff version needs to be $MINIMUM_RUFF. Your current version is $RUFF_VERSION

ruff checks failed."
exit 1
fi

echo "starting $RUFF_BUILD test..."
RUFF_REPORT=$( ($RUFF_BUILD check --config dev/pyproject.toml) 2>&1)
RUFF_STATUS=$?

if [ "$RUFF_STATUS" -ne 0 ]; then
echo "ruff checks failed:"
echo "$RUFF_REPORT"
echo "$RUFF_STATUS"
exit "$RUFF_STATUS"
else
echo "ruff checks passed."
echo
fi

}

function black_test {
local BLACK_REPORT=
local BLACK_STATUS=
@@ -335,6 +379,9 @@ fi
if [[ "$FLAKE8_TEST" == "true" ]]; then
flake8_test
fi
if [[ "$RUFF_TEST" == "true" ]]; then
ruff_test
fi
if [[ "$MYPY_TEST" == "true" ]] || [[ "$MYPY_EXAMPLES_TEST" == "true" ]] || [[ "$MYPY_DATA_TEST" == "true" ]]; then
mypy_test
fi
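With this change the ruff check can be run on its own via the new --ruff flag. A minimal usage sketch (assuming the script is run from the repository root; the version detection above also assumes ruff --version prints a two-token line such as "ruff 0.14.0", which is what ${_RUFF_VERSION[1]} relies on):

    # run only the new ruff check
    dev/lint-python --ruff

    # or invoke ruff directly with the same shared config
    ruff check --config dev/pyproject.toml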
52 changes: 52 additions & 0 deletions dev/pyproject.toml
@@ -24,6 +24,58 @@ testpaths = [
"pyspark/ml/typing",
]

[tool.ruff]
exclude = [
"*/target/*",
"**/*.ipynb",
"docs/.local_ruby_bundle/",
"*python/pyspark/cloudpickle/*.py",
"*python/pyspark/ml/deepspeed/tests/*.py",
"*python/docs/build/*",
"*python/docs/source/conf.py",
"*python/.eggs/*",
"dist/*",
".git/*",
"*python/pyspark/sql/pandas/functions.pyi",
"*python/pyspark/sql/column.pyi",
"*python/pyspark/worker.pyi",
"*python/pyspark/java_gateway.pyi",
"*python/pyspark/sql/connect/proto/*",
"*python/pyspark/sql/streaming/proto/*",
"*venv*/*",
]

[tool.ruff.lint]
ignore = [
"E203", # Skip as black formatter adds a whitespace around ':'.
"E402", # Module top level import is disabled for optional import check, etc.
# TODO
"E721", # Use isinstance for type comparison, too many for now.
"E741", # Ambiguous variables like l, I or O.
]

[tool.ruff.lint.per-file-ignores]
# E501 is ignored as shared.py is auto-generated.
"python/pyspark/ml/param/shared.py" = ["E501"]
# E501 is ignored as we should keep the json string format in error_classes.py.
"python/pyspark/errors/error_classes.py" = ["E501"]
# Examples contain some unused variables.
"examples/src/main/python/sql/datasource.py" = ["F841"]
# Exclude * imports in test files
"python/pyspark/errors/tests/*.py" = ["F403"]
"python/pyspark/logger/tests/*.py" = ["F403"]
"python/pyspark/logger/tests/connect/*.py" = ["F403"]
"python/pyspark/ml/tests/*.py" = ["F403"]
"python/pyspark/mllib/tests/*.py" = ["F403"]
"python/pyspark/pandas/tests/*.py" = ["F401", "F403"]
"python/pyspark/pandas/tests/connect/*.py" = ["F401", "F403"]
"python/pyspark/resource/tests/*.py" = ["F403"]
"python/pyspark/sql/tests/*.py" = ["F403"]
"python/pyspark/streaming/tests/*.py" = ["F403"]
"python/pyspark/tests/*.py" = ["F403"]
"python/pyspark/testing/*.py" = ["F401"]
"python/pyspark/testing/tests/*.py" = ["F403"]

[tool.black]
# When changing the version, we have to update
# GitHub workflow version and dev/reformat-python
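The E203 ignore reflects a known formatter conflict: black inserts spaces around ':' in slices with complex bounds, which pycodestyle's E203 would flag. A minimal illustration (hypothetical snippet, not from the PR):

    data = list(range(10))
    offset, size = 2, 5
    # black keeps the spaces around ':' here; E203 would flag them, so it is ignored
    chunk = data[offset + 1 : offset + size]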
2 changes: 1 addition & 1 deletion python/pyspark/ml/evaluation.py
@@ -710,7 +710,7 @@ def setParams(

def isLargerBetter(self) -> bool:
"""Override this function to make it run on connect"""
return not self.getMetricName() in [
return self.getMetricName() not in [
"weightedFalsePositiveRate",
"falsePositiveRateByLabel",
"logLoss",
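The evaluation.py rewrite is the idiomatic membership test that pycodestyle (and ruff) enforce as E713: `x not in y` rather than `not x in y`. Both evaluate identically; a minimal illustration with hypothetical values:

    metric = "logLoss"
    ok = not metric in ["logLoss"]   # legal, but flagged (E713)
    ok = metric not in ["logLoss"]   # equivalent and idiomatic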
8 changes: 4 additions & 4 deletions python/pyspark/sql/connect/merge.py
@@ -19,7 +19,7 @@
check_dependencies(__name__)

import sys
from typing import Dict, Optional, TYPE_CHECKING, List, Callable
from typing import Dict, Optional, TYPE_CHECKING, Callable

from pyspark.sql.connect import proto
from pyspark.sql.connect.column import Column
@@ -73,9 +73,9 @@ def __init__(

self._callback = callback if callback is not None else lambda _: None
self._schema_evolution_enabled = False
self._matched_actions = list() # type: List[proto.MergeAction]
self._not_matched_actions = list() # type: List[proto.MergeAction]
self._not_matched_by_source_actions = list() # type: List[proto.MergeAction]
self._matched_actions: list[proto.MergeAction] = list()
self._not_matched_actions: list[proto.MergeAction] = list()
self._not_matched_by_source_actions: list[proto.MergeAction] = list()

def whenMatched(self, condition: Optional[Column] = None) -> "MergeIntoWriter.WhenMatched":
return self.WhenMatched(self, condition)
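The merge.py hunk swaps `# type:` comments for inline PEP 526 annotations on the builtin `list` generic (PEP 585, Python 3.9+), which is also why `List` disappears from the `typing` import above. A minimal sketch of the two styles (hypothetical names):

    class SomeAction: ...

    # old style: a type comment, visible only to tools that parse comments
    actions = list()  # type: List[SomeAction]

    # new style: an inline annotation on the builtin generic (Python 3.9+)
    actions: list[SomeAction] = []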
2 changes: 1 addition & 1 deletion python/pyspark/sql/tests/arrow/test_arrow_udf_typehints.py
@@ -461,7 +461,7 @@ def multiply_pandas(a: pd.Series, b: pd.Series) -> pd.Series:


if __name__ == "__main__":
from pyspark.sql.tests.arrow.test_arrow_udf_typehints import * # noqa: #401
from pyspark.sql.tests.arrow.test_arrow_udf_typehints import *  # noqa: F401

try:
import xmlrunner
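A note on the fix: flake8 and ruff recognize suppression codes in the form `# noqa: F401`; the stray `#` before the code in the original comment meant it was not a valid rule reference. The corrected form, illustrated on a hypothetical line:

    # suppresses only F401 ("imported but unused") on this line:
    from pyspark.sql.functions import *  # noqa: F401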
python/pyspark/sql/tests/pandas/test_pandas_udf_typehints.py
@@ -444,7 +444,7 @@ def multiply_arrow(a: pa.Array, b: pa.Array) -> pa.Array:


if __name__ == "__main__":
from pyspark.sql.tests.pandas.test_pandas_udf_typehints import * # noqa: #401
from pyspark.sql.tests.pandas.test_pandas_udf_typehints import *  # noqa: F401

try:
import xmlrunner
python/pyspark/sql/tests/pandas/test_pandas_udf_typehints_with_future_annotations.py
@@ -367,7 +367,7 @@ def func(col: "Union[pd.Series, pd.DataFrame]", *, col2: "pd.DataFrame") -> "pd.


if __name__ == "__main__":
from pyspark.sql.tests.pandas.test_pandas_udf_typehints_with_future_annotations import * # noqa: #401
from pyspark.sql.tests.pandas.test_pandas_udf_typehints_with_future_annotations import *  # noqa: F401

try:
import xmlrunner
2 changes: 0 additions & 2 deletions python/pyspark/taskcontext.py
@@ -252,8 +252,6 @@ def resources(self) -> Dict[str, "ResourceInformation"]:
dict
a dictionary of a string resource name, and :class:`ResourceInformation`.
"""
from pyspark.resource import ResourceInformation

return cast(Dict[str, "ResourceInformation"], self._resources)


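The deleted import was dead at runtime: both the return type and the `cast` name `ResourceInformation` as a string forward reference, so no runtime binding is needed, and ruff flags the function-local import as unused (F401). A minimal sketch of the pattern, assuming the module-level TYPE_CHECKING import such files typically carry:

    from typing import TYPE_CHECKING, Dict, cast

    if TYPE_CHECKING:  # assumed pattern; the import is only needed by type checkers
        from pyspark.resource import ResourceInformation

    class TaskContextSketch:
        _resources: Dict[str, "ResourceInformation"] = {}

        def resources(self) -> Dict[str, "ResourceInformation"]:
            # string forward references need no runtime import, so the
            # function-local import was dead code (ruff F401)
            return cast(Dict[str, "ResourceInformation"], self._resources)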
2 changes: 1 addition & 1 deletion python/pyspark/util.py
@@ -726,7 +726,7 @@ class PyLocalIterable:
def __init__(self, _sock_info: "JavaArray", _serializer: "Serializer"):
port: int
auth_secret: str
jsocket_auth_server: "JavaObject"
self.jsocket_auth_server: "JavaObject"
port, auth_secret, self.jsocket_auth_server = _sock_info
self._sockfile, self._sock = _create_local_socket((port, auth_secret))
self._serializer = _serializer
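The util.py fix matters because a bare `jsocket_auth_server: "JavaObject"` inside `__init__` only declares a local name that is never assigned; the tuple unpacking on the next line assigns `self.jsocket_auth_server`, so the annotation must target the attribute. A minimal sketch of the difference (hypothetical class):

    class Sketch:
        def __init__(self, sock_info: tuple):
            # a bare "jsocket_auth_server: ..." line would annotate a local
            # that is never assigned; annotate the instance attribute instead:
            self.jsocket_auth_server: "JavaObject"
            port, auth_secret, self.jsocket_auth_server = sock_info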