Skip to content

Commit

Permalink
Adding black
Browse files Browse the repository at this point in the history
  • Loading branch information
mwiewior committed Dec 12, 2024
1 parent a26c10b commit b6faa81
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 57 deletions.
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,8 @@ repos:
hooks:
- id: isort
args: [--profile, black]
- repo: https://github.com/ambv/black
rev: 24.10.0
hooks:
- id: black
language_version: python3.12
110 changes: 71 additions & 39 deletions polars_bio/overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,17 @@ def __init__(self):
ctx = Context().ctx


def overlap(df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
how="inner",
overlap_filter: OverlapFilter = OverlapFilter.Weak,
suffixes=("_1", "_2"),
on_cols=None,
col1: Union[list[str] | None] = None,
col2: Union[list[str] | None] = None,
output_type: str = "polars.LazyFrame"
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
def overlap(
df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
how="inner",
overlap_filter: OverlapFilter = OverlapFilter.Weak,
suffixes=("_1", "_2"),
on_cols=None,
col1: Union[list[str] | None] = None,
col2: Union[list[str] | None] = None,
output_type: str = "polars.LazyFrame",
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
"""
Find pairs of overlapping genomic intervals.
Bioframe inspired API.
Expand Down Expand Up @@ -81,62 +82,87 @@ def overlap(df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
assert on_cols is None, "on_cols is not supported yet"

assert suffixes == ("_1", "_2"), "Only default suffixes are supported"
assert output_type in ["polars.LazyFrame", "polars.DataFrame",
"pandas.DataFrame"], "Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported"
assert output_type in [
"polars.LazyFrame",
"polars.DataFrame",
"pandas.DataFrame",
], "Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported"

assert how in ["inner"], "Only inner join is supported"
if isinstance(df1, str) and isinstance(df2, str):
ext1 = Path(df1).suffix
assert ext1 == '.parquet' or ext1 == ".csv", "Dataframe1 must be a Parquet or CSV file"
assert (
ext1 == ".parquet" or ext1 == ".csv"
), "Dataframe1 must be a Parquet or CSV file"
ext2 = Path(df2).suffix
assert ext2 == '.parquet' or ext2 == ".csv", "Dataframe1 must be a Parquet or CSV file"
assert (
ext2 == ".parquet" or ext2 == ".csv"
), "Dataframe1 must be a Parquet or CSV file"
# use suffixes to avoid column name conflicts
df_schema1 = _get_schema(df2, suffixes[0])
df_schema2 = _get_schema(df2, suffixes[1])
merged_schema = pl.Schema({**df_schema1, **df_schema2})
if output_type == "polars.LazyFrame":
return overlap_lazy_scan(df1, df2, merged_schema, overlap_filter=overlap_filter)
return overlap_lazy_scan(
df1, df2, merged_schema, overlap_filter=overlap_filter
)
elif output_type == "polars.DataFrame":
return overlap_scan(ctx, df1, df2, overlap_filter).to_polars()
elif output_type == "pandas.DataFrame":
return overlap_scan(ctx, df1, df2, overlap_filter).to_pandas()
else:
raise ValueError(
"Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported")
elif isinstance(df1, pl.DataFrame) and isinstance(df2, pl.DataFrame) or \
isinstance(df1, pl.LazyFrame) and isinstance(df2, pl.LazyFrame) or \
isinstance(df1, pd.DataFrame) and isinstance(df2, pd.DataFrame):
"Only polars.LazyFrame, polars.DataFrame, and pandas.DataFrame are supported"
)
elif (
isinstance(df1, pl.DataFrame)
and isinstance(df2, pl.DataFrame)
or isinstance(df1, pl.LazyFrame)
and isinstance(df2, pl.LazyFrame)
or isinstance(df1, pd.DataFrame)
and isinstance(df2, pd.DataFrame)
):
if output_type == "polars.LazyFrame":
merged_schema = pl.Schema({**_rename_columns(df1, suffixes[0]).schema,
**_rename_columns(df2, suffixes[1]).schema})
return overlap_lazy_scan(df1, df2, merged_schema, col1, col2, overlap_filter)
merged_schema = pl.Schema(
{
**_rename_columns(df1, suffixes[0]).schema,
**_rename_columns(df2, suffixes[1]).schema,
}
)
return overlap_lazy_scan(
df1, df2, merged_schema, col1, col2, overlap_filter
)
elif output_type == "polars.DataFrame":
if isinstance(df1, pl.DataFrame) and isinstance(df2, pl.DataFrame):
df1 = df1.to_arrow().to_reader()
df2 = df2.to_arrow().to_reader()
else:
raise ValueError(
"Input and output dataframes must be of the same type: either polars or pandas")
"Input and output dataframes must be of the same type: either polars or pandas"
)
return overlap_frame(ctx, df1, df2, overlap_filter).to_polars()
elif output_type == "pandas.DataFrame":
if isinstance(df1, pd.DataFrame) and isinstance(df2, pd.DataFrame):
df1 = _df_to_arrow(df1, col1[0]).to_reader()
df2 = _df_to_arrow(df2, col2[0]).to_reader()
else:
raise ValueError(
"Input and output dataframes must be of the same type: either polars or pandas")
"Input and output dataframes must be of the same type: either polars or pandas"
)
return overlap_frame(ctx, df1, df2, overlap_filter).to_pandas()
else:
raise ValueError(
"Both dataframes must be of the same type: either polars or pandas or a path to a file")
"Both dataframes must be of the same type: either polars or pandas or a path to a file"
)


def _rename_columns_pl(df: pl.DataFrame, suffix: str) -> pl.DataFrame:
    """Return *df* with every column name suffixed by *suffix* (e.g. "_1")."""
    mapping = {name: f"{name}{suffix}" for name in df.columns}
    return df.rename(mapping)


def _rename_columns(df: Union[pl.DataFrame, pd.DataFrame], suffix: str) -> Union[
pl.DataFrame, pd.DataFrame]:
def _rename_columns(
df: Union[pl.DataFrame, pd.DataFrame], suffix: str
) -> Union[pl.DataFrame, pd.DataFrame]:
if isinstance(df, pl.DataFrame):
df = pl.DataFrame(schema=df.schema)
return _rename_columns_pl(df, suffix)
Expand All @@ -149,9 +175,9 @@ def _rename_columns(df: Union[pl.DataFrame, pd.DataFrame], suffix: str) -> Union

def _get_schema(path: str, suffix=None) -> pl.Schema:
ext = Path(path).suffix
if ext == '.parquet':
if ext == ".parquet":
df = pl.read_parquet(path)
elif ext == '.csv':
elif ext == ".csv":
df = pl.read_csv(path)
else:
raise ValueError("Only CSV and Parquet files are supported")
Expand All @@ -168,7 +194,7 @@ def _string_to_largestring(table: pa.Table, column_name: str) -> pa.Table:
return table.set_column(
index, # Index of the column to replace
table.schema.field(index).name, # Name of the column
pc.cast(table.column(index), pa.large_string()) # Cast to `largestring`
pc.cast(table.column(index), pa.large_string()), # Cast to `largestring`
)


Expand All @@ -184,10 +210,14 @@ def _df_to_arrow(df: pd.DataFrame, col: str) -> pa.Table:
return _string_to_largestring(table_1, col)


def overlap_lazy_scan(df_1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
df_2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
schema: pl.Schema, col1: list[str] = None, col2: list[str] = None,
overlap_filter: OverlapFilter = OverlapFilter.Weak) -> pl.LazyFrame:
def overlap_lazy_scan(
df_1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
df_2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
schema: pl.Schema,
col1: list[str] = None,
col2: list[str] = None,
overlap_filter: OverlapFilter = OverlapFilter.Weak,
) -> pl.LazyFrame:
overlap_function = None
if isinstance(df_1, str) and isinstance(df_2, str):
overlap_function = overlap_scan
Expand All @@ -203,12 +233,14 @@ def overlap_lazy_scan(df_1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame]
raise ValueError("Only polars and pandas dataframes are supported")

def _overlap_source(
with_columns: pl.Expr | None,
predicate: pl.Expr | None,
_n_rows: int | None,
_batch_size: int | None,
with_columns: pl.Expr | None,
predicate: pl.Expr | None,
_n_rows: int | None,
_batch_size: int | None,
) -> Iterator[pl.DataFrame]:
df_lazy: datafusion.DataFrame = overlap_function(ctx, df_1, df_2, overlap_filter)
df_lazy: datafusion.DataFrame = overlap_function(
ctx, df_1, df_2, overlap_filter
)
df_stream = df_lazy.execute_stream()
for r in df_stream:
py_df = r.to_pyarrow()
Expand Down
16 changes: 10 additions & 6 deletions tests/_expected.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,17 @@
"""

# Pandas
# Expected overlap result parsed from the markdown table; the start/end
# columns are coerced to int64 to match the dtypes produced by pb.overlap.
# NOTE(review): the scraped diff contained both the pre-black and post-black
# form of each assignment (the value was assigned twice); kept the final form.
PD_DF_OVERLAP = (
    mdpd.from_md(EXPECTED_OVERLAP)
    .astype({"pos_start_1": "int64"})
    .astype({"pos_end_1": "int64"})
    .astype({"pos_start_2": "int64"})
    .astype({"pos_end_2": "int64"})
)

# Sort rows deterministically so later frame comparisons are order-independent.
PD_DF_OVERLAP = PD_DF_OVERLAP.sort_values(by=list(PD_DF_OVERLAP.columns)).reset_index(
    drop=True
)
DF_PATH1 = f"{DATA_DIR}/reads.csv"
DF_PATH2 = f"{DATA_DIR}/targets.csv"
PD_DF1 = pd.read_csv(DF_PATH1)
Expand Down
31 changes: 22 additions & 9 deletions tests/test_bioframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,30 @@


class TestOverlapBioframe:
    """Cross-check pb.overlap against bioframe.overlap on the same inputs."""

    # Computed once at class-definition time; Strict filter matches
    # bioframe's default overlap semantics.
    result = pb.overlap(
        BIO_PD_DF1,
        BIO_PD_DF2,
        output_type="pandas.DataFrame",
        overlap_filter=OverlapFilter.Strict,
    )
    # Reference result from bioframe with matching columns and suffixes.
    result_bio = bf.overlap(
        BIO_PD_DF1,
        BIO_PD_DF2,
        cols1=("contig", "pos_start", "pos_end"),
        cols2=("contig", "pos_start", "pos_end"),
        suffixes=("_1", "_2"),
        how="inner",
    )

    def test_overlap_count(self):
        # Fixed expected count for the fixture data, plus agreement with
        # bioframe.  (The scraped diff duplicated the second assert; deduped.)
        assert len(self.result) == 54246
        assert len(self.result) == len(self.result_bio)

    def test_overlap_schema_rows(self):
        # Sort both frames identically so row order does not affect equality.
        expected = self.result_bio.sort_values(
            by=list(self.result.columns)
        ).reset_index(drop=True)
        result = self.result.sort_values(by=list(self.result.columns)).reset_index(
            drop=True
        )
        pd.testing.assert_frame_equal(result, expected)
5 changes: 4 additions & 1 deletion tests/test_native_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@

class TestOverlapNative:
    """Overlap on file-path inputs (CSV), compared to the expected frame."""

    # Computed once at class-definition time from the CSV fixture paths.
    result_csv = pb.overlap(DF_PATH1, DF_PATH2, output_type="pandas.DataFrame")

    def test_overlap_count(self):
        # 16 overlapping pairs expected for the reads/targets fixtures.
        assert len(self.result_csv) == 16

    def test_overlap_schema_rows(self):
        # Sort for a deterministic, order-independent comparison.
        # (The scraped diff contained the pre-black single-line statement
        # alongside the post-black form; kept the final form only.)
        result_csv = self.result_csv.sort_values(
            by=list(self.result_csv.columns)
        ).reset_index(drop=True)
        expected = PD_DF_OVERLAP
        pd.testing.assert_frame_equal(result_csv, expected)
5 changes: 4 additions & 1 deletion tests/test_pandas_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@

class TestOverlapPandas:
    """Overlap on in-memory pandas inputs, compared to the expected frame."""

    # Computed once at class-definition time from the pandas fixtures.
    result = pb.overlap(PD_DF1, PD_DF2, output_type="pandas.DataFrame")

    def test_overlap_count(self):
        # 16 overlapping pairs expected for the fixture data.
        assert len(self.result) == 16

    def test_overlap_schema_rows(self):
        # Sort for a deterministic, order-independent comparison.
        # (The scraped diff contained the pre-black single-line statement
        # alongside the post-black form; kept the final form only.)
        result = self.result.sort_values(by=list(self.result.columns)).reset_index(
            drop=True
        )
        expected = PD_DF_OVERLAP
        pd.testing.assert_frame_equal(result, expected)
1 change: 0 additions & 1 deletion tests/test_polars_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@


class TestOverlapPolars:

result_frame = pb.overlap(PL_DF1, PL_DF2, output_type="polars.DataFrame")
result_lazy = pb.overlap(PL_DF1, PL_DF2, output_type="polars.LazyFrame").collect()
expected = PL_DF_OVERLAP
Expand Down

0 comments on commit b6faa81

Please sign in to comment.