From b6faa816557a87785709766f54aba88a03cc2aa0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Wieiw=C3=B3rka?= <marek.wiewiorka@gmail.com>
Date: Thu, 12 Dec 2024 17:02:21 +0100
Subject: [PATCH] Add black pre-commit hook and reformat sources

---
 .pre-commit-config.yaml      |   5 ++
 polars_bio/overlap.py        | 110 ++++++++++++++++++++++-------------
 tests/_expected.py           |  16 +++--
 tests/test_bioframe.py       |  31 +++++++---
 tests/test_native_overlap.py |   5 +-
 tests/test_pandas_overlap.py |   5 +-
 tests/test_polars_overlap.py |   1 -
 7 files changed, 116 insertions(+), 57 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 159f79f..176ce4f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,3 +19,8 @@ repos:
     hooks:
       - id: isort
         args: [--profile, black]
+  - repo: https://github.com/ambv/black
+    rev: 24.10.0
+    hooks:
+      - id: black
+        language_version: python3.12
diff --git a/polars_bio/overlap.py b/polars_bio/overlap.py
index a5347a6..9544ea3 100644
--- a/polars_bio/overlap.py
+++ b/polars_bio/overlap.py
@@ -40,16 +40,17 @@ def __init__(self):
 ctx = Context().ctx
 
 
-def overlap(df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
-            df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
-            how="inner",
-            overlap_filter: OverlapFilter = OverlapFilter.Weak,
-            suffixes=("_1", "_2"),
-            on_cols=None,
-            col1: Union[list[str] | None] = None,
-            col2: Union[list[str] | None] = None,
-            output_type: str = "polars.LazyFrame"
-            ) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
+def overlap(
+    df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
+    df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
+    how="inner",
+    overlap_filter: OverlapFilter = OverlapFilter.Weak,
+    suffixes=("_1", "_2"),
+    on_cols=None,
+    col1: Union[list[str] | None] = None,
+    col2: Union[list[str] | None] = None,
+    output_type: str = "polars.LazyFrame",
+) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
     """
     Find pairs of overlapping genomic intervals.
     Bioframe inspired API.
@@ -81,42 +82,64 @@ def overlap(df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
     assert on_cols is None, "on_cols is not supported yet"
 
     assert suffixes == ("_1", "_2"), "Only default suffixes are supported"
-    assert output_type in ["polars.LazyFrame", "polars.DataFrame",
-                           "pandas.DataFrame"], "Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported"
+    assert output_type in [
+        "polars.LazyFrame",
+        "polars.DataFrame",
+        "pandas.DataFrame",
+    ], "Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported"
 
     assert how in ["inner"], "Only inner join is supported"
     if isinstance(df1, str) and isinstance(df2, str):
         ext1 = Path(df1).suffix
-        assert ext1 == '.parquet' or ext1 == ".csv", "Dataframe1 must be a Parquet or CSV file"
+        assert (
+            ext1 == ".parquet" or ext1 == ".csv"
+        ), "Dataframe1 must be a Parquet or CSV file"
         ext2 = Path(df2).suffix
-        assert ext2 == '.parquet' or ext2 == ".csv", "Dataframe1 must be a Parquet or CSV file"
+        assert (
+            ext2 == ".parquet" or ext2 == ".csv"
+        ), "Dataframe1 must be a Parquet or CSV file"
         # use suffixes to avoid column name conflicts
         df_schema1 = _get_schema(df2, suffixes[0])
         df_schema2 = _get_schema(df2, suffixes[1])
         merged_schema = pl.Schema({**df_schema1, **df_schema2})
         if output_type == "polars.LazyFrame":
-            return overlap_lazy_scan(df1, df2, merged_schema, overlap_filter=overlap_filter)
+            return overlap_lazy_scan(
+                df1, df2, merged_schema, overlap_filter=overlap_filter
+            )
         elif output_type == "polars.DataFrame":
             return overlap_scan(ctx, df1, df2, overlap_filter).to_polars()
         elif output_type == "pandas.DataFrame":
             return overlap_scan(ctx, df1, df2, overlap_filter).to_pandas()
         else:
             raise ValueError(
-                "Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported")
-    elif isinstance(df1, pl.DataFrame) and isinstance(df2, pl.DataFrame) or \
-            isinstance(df1, pl.LazyFrame) and isinstance(df2, pl.LazyFrame) or \
-            isinstance(df1, pd.DataFrame) and isinstance(df2, pd.DataFrame):
+                "Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported"
+            )
+    elif (
+        isinstance(df1, pl.DataFrame)
+        and isinstance(df2, pl.DataFrame)
+        or isinstance(df1, pl.LazyFrame)
+        and isinstance(df2, pl.LazyFrame)
+        or isinstance(df1, pd.DataFrame)
+        and isinstance(df2, pd.DataFrame)
+    ):
         if output_type == "polars.LazyFrame":
-            merged_schema = pl.Schema({**_rename_columns(df1, suffixes[0]).schema,
-                                       **_rename_columns(df2, suffixes[1]).schema})
-            return overlap_lazy_scan(df1, df2, merged_schema, col1, col2, overlap_filter)
+            merged_schema = pl.Schema(
+                {
+                    **_rename_columns(df1, suffixes[0]).schema,
+                    **_rename_columns(df2, suffixes[1]).schema,
+                }
+            )
+            return overlap_lazy_scan(
+                df1, df2, merged_schema, col1, col2, overlap_filter
+            )
         elif output_type == "polars.DataFrame":
             if isinstance(df1, pl.DataFrame) and isinstance(df2, pl.DataFrame):
                 df1 = df1.to_arrow().to_reader()
                 df2 = df2.to_arrow().to_reader()
             else:
                 raise ValueError(
-                    "Input and output dataframes must be of the same type: either polars or pandas")
+                    "Input and output dataframes must be of the same type: either polars or pandas"
+                )
             return overlap_frame(ctx, df1, df2, overlap_filter).to_polars()
         elif output_type == "pandas.DataFrame":
             if isinstance(df1, pd.DataFrame) and isinstance(df2, pd.DataFrame):
@@ -124,19 +147,22 @@ def overlap(df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
                 df2 = _df_to_arrow(df2, col2[0]).to_reader()
             else:
                 raise ValueError(
-                    "Input and output dataframes must be of the same type: either polars or pandas")
+                    "Input and output dataframes must be of the same type: either polars or pandas"
+                )
             return overlap_frame(ctx, df1, df2, overlap_filter).to_pandas()
     else:
         raise ValueError(
-            "Both dataframes must be of the same type: either polars or pandas or a path to a file")
+            "Both dataframes must be of the same type: either polars or pandas or a path to a file"
+        )
 
 
 def _rename_columns_pl(df: pl.DataFrame, suffix: str) -> pl.DataFrame:
     return df.rename({col: f"{col}{suffix}" for col in df.columns})
 
 
-def _rename_columns(df: Union[pl.DataFrame, pd.DataFrame], suffix: str) -> Union[
-    pl.DataFrame, pd.DataFrame]:
+def _rename_columns(
+    df: Union[pl.DataFrame, pd.DataFrame], suffix: str
+) -> Union[pl.DataFrame, pd.DataFrame]:
     if isinstance(df, pl.DataFrame):
         df = pl.DataFrame(schema=df.schema)
         return _rename_columns_pl(df, suffix)
@@ -149,9 +175,9 @@ def _rename_columns(df: Union[pl.DataFrame, pd.DataFrame], suffix: str) -> Union
 
 def _get_schema(path: str, suffix=None) -> pl.Schema:
     ext = Path(path).suffix
-    if ext == '.parquet':
+    if ext == ".parquet":
         df = pl.read_parquet(path)
-    elif ext == '.csv':
+    elif ext == ".csv":
         df = pl.read_csv(path)
     else:
         raise ValueError("Only CSV and Parquet files are supported")
@@ -168,7 +194,7 @@ def _string_to_largestring(table: pa.Table, column_name: str) -> pa.Table:
     return table.set_column(
         index,  # Index of the column to replace
         table.schema.field(index).name,  # Name of the column
-        pc.cast(table.column(index), pa.large_string())  # Cast to `largestring`
+        pc.cast(table.column(index), pa.large_string()),  # Cast to `largestring`
     )
 
 
@@ -184,10 +210,14 @@ def _df_to_arrow(df: pd.DataFrame, col: str) -> pa.Table:
     return _string_to_largestring(table_1, col)
 
 
-def overlap_lazy_scan(df_1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
-                      df_2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
-                      schema: pl.Schema, col1: list[str] = None, col2: list[str] = None,
-                      overlap_filter: OverlapFilter = OverlapFilter.Weak) -> pl.LazyFrame:
+def overlap_lazy_scan(
+    df_1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
+    df_2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
+    schema: pl.Schema,
+    col1: list[str] = None,
+    col2: list[str] = None,
+    overlap_filter: OverlapFilter = OverlapFilter.Weak,
+) -> pl.LazyFrame:
     overlap_function = None
     if isinstance(df_1, str) and isinstance(df_2, str):
         overlap_function = overlap_scan
@@ -203,12 +233,14 @@ def overlap_lazy_scan(df_1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame]
         raise ValueError("Only polars and pandas dataframes are supported")
 
     def _overlap_source(
-            with_columns: pl.Expr | None,
-            predicate: pl.Expr | None,
-            _n_rows: int | None,
-            _batch_size: int | None,
+        with_columns: pl.Expr | None,
+        predicate: pl.Expr | None,
+        _n_rows: int | None,
+        _batch_size: int | None,
     ) -> Iterator[pl.DataFrame]:
-        df_lazy: datafusion.DataFrame = overlap_function(ctx, df_1, df_2, overlap_filter)
+        df_lazy: datafusion.DataFrame = overlap_function(
+            ctx, df_1, df_2, overlap_filter
+        )
         df_stream = df_lazy.execute_stream()
         for r in df_stream:
             py_df = r.to_pyarrow()
diff --git a/tests/_expected.py b/tests/_expected.py
index cf76e5d..2aafeca 100644
--- a/tests/_expected.py
+++ b/tests/_expected.py
@@ -30,13 +30,17 @@
 """
 
 # Pandas
-PD_DF_OVERLAP = (mdpd.from_md(EXPECTED_OVERLAP)
-                 .astype({'pos_start_1': 'int64'})
-                 .astype({'pos_end_1': 'int64'})
-                 .astype({'pos_start_2': 'int64'})
-                 .astype({'pos_end_2': 'int64'}))
+PD_DF_OVERLAP = (
+    mdpd.from_md(EXPECTED_OVERLAP)
+    .astype({"pos_start_1": "int64"})
+    .astype({"pos_end_1": "int64"})
+    .astype({"pos_start_2": "int64"})
+    .astype({"pos_end_2": "int64"})
+)
 
-PD_DF_OVERLAP = PD_DF_OVERLAP.sort_values(by=list(PD_DF_OVERLAP.columns)).reset_index(drop=True)
+PD_DF_OVERLAP = PD_DF_OVERLAP.sort_values(by=list(PD_DF_OVERLAP.columns)).reset_index(
+    drop=True
+)
 DF_PATH1 = f"{DATA_DIR}/reads.csv"
 DF_PATH2 = f"{DATA_DIR}/targets.csv"
 PD_DF1 = pd.read_csv(DF_PATH1)
diff --git a/tests/test_bioframe.py b/tests/test_bioframe.py
index 5ec53ea..92b7fbc 100644
--- a/tests/test_bioframe.py
+++ b/tests/test_bioframe.py
@@ -7,17 +7,30 @@
 
 
 class TestOverlapBioframe:
-    result = pb.overlap(BIO_PD_DF1, BIO_PD_DF2, output_type="pandas.DataFrame", overlap_filter=OverlapFilter.Strict)
-    result_bio = bf.overlap(BIO_PD_DF1, BIO_PD_DF2,
-                            cols1=('contig','pos_start','pos_end'),
-                            cols2=('contig','pos_start','pos_end'),
-                            suffixes=('_1', '_2'),
-                            how="inner")
+    result = pb.overlap(
+        BIO_PD_DF1,
+        BIO_PD_DF2,
+        output_type="pandas.DataFrame",
+        overlap_filter=OverlapFilter.Strict,
+    )
+    result_bio = bf.overlap(
+        BIO_PD_DF1,
+        BIO_PD_DF2,
+        cols1=("contig", "pos_start", "pos_end"),
+        cols2=("contig", "pos_start", "pos_end"),
+        suffixes=("_1", "_2"),
+        how="inner",
+    )
+
     def test_overlap_count(self):
         assert len(self.result) == 54246
-        assert len(self.result) ==  len(self.result_bio)
+        assert len(self.result) == len(self.result_bio)
 
     def test_overlap_schema_rows(self):
-        expected = self.result_bio.sort_values(by=list(self.result.columns)).reset_index(drop=True)
-        result = self.result.sort_values(by=list(self.result.columns)).reset_index(drop=True)
+        expected = self.result_bio.sort_values(
+            by=list(self.result.columns)
+        ).reset_index(drop=True)
+        result = self.result.sort_values(by=list(self.result.columns)).reset_index(
+            drop=True
+        )
         pd.testing.assert_frame_equal(result, expected)
diff --git a/tests/test_native_overlap.py b/tests/test_native_overlap.py
index 69848d3..ee3631b 100644
--- a/tests/test_native_overlap.py
+++ b/tests/test_native_overlap.py
@@ -6,10 +6,13 @@
 
 class TestOverlapNative:
     result_csv = pb.overlap(DF_PATH1, DF_PATH2, output_type="pandas.DataFrame")
+
     def test_overlap_count(self):
         assert len(self.result_csv) == 16
 
     def test_overlap_schema_rows(self):
-        result_csv = self.result_csv.sort_values(by=list(self.result_csv.columns)).reset_index(drop=True)
+        result_csv = self.result_csv.sort_values(
+            by=list(self.result_csv.columns)
+        ).reset_index(drop=True)
         expected = PD_DF_OVERLAP
         pd.testing.assert_frame_equal(result_csv, expected)
diff --git a/tests/test_pandas_overlap.py b/tests/test_pandas_overlap.py
index 132ea50..bab750f 100644
--- a/tests/test_pandas_overlap.py
+++ b/tests/test_pandas_overlap.py
@@ -6,10 +6,13 @@
 
 class TestOverlapPandas:
     result = pb.overlap(PD_DF1, PD_DF2, output_type="pandas.DataFrame")
+
     def test_overlap_count(self):
         assert len(self.result) == 16
 
     def test_overlap_schema_rows(self):
-        result = self.result.sort_values(by=list(self.result.columns)).reset_index(drop=True)
+        result = self.result.sort_values(by=list(self.result.columns)).reset_index(
+            drop=True
+        )
         expected = PD_DF_OVERLAP
         pd.testing.assert_frame_equal(result, expected)
diff --git a/tests/test_polars_overlap.py b/tests/test_polars_overlap.py
index a9d8e3f..17897bd 100644
--- a/tests/test_polars_overlap.py
+++ b/tests/test_polars_overlap.py
@@ -4,7 +4,6 @@
 
 
 class TestOverlapPolars:
-
     result_frame = pb.overlap(PL_DF1, PL_DF2, output_type="polars.DataFrame")
     result_lazy = pb.overlap(PL_DF1, PL_DF2, output_type="polars.LazyFrame").collect()
     expected = PL_DF_OVERLAP