-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #35 from robomics/feature/stripepy-view
Initial implementation of stripepy view
- Loading branch information
Showing
6 changed files
with
392 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -65,7 +65,7 @@ test = [ | |
|
||
dev = [ | ||
"stripepy[test]", | ||
"black", | ||
"black[jupyter]", | ||
"isort", | ||
"pre-commit", | ||
] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
# Copyright (C) 2024 Roberto Rossini <roberros@uio.no> | ||
# | ||
# SPDX-License-Identifier: MIT | ||
|
||
import pathlib | ||
import re | ||
import sys | ||
import warnings | ||
from typing import Dict, Union | ||
|
||
import h5py | ||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
def _validate_input_file(path: pathlib.Path): | ||
try: | ||
with h5py.File(path) as h5: | ||
format = h5.attrs.get("format") | ||
format_version = h5.attrs.get("format-version") | ||
except Exception as e: | ||
raise RuntimeError(f'failed to open file "{path}" for reading: {e}') | ||
|
||
try: | ||
if format is None: | ||
raise RuntimeError('attribute "format" is missing') | ||
|
||
if format_version is None: | ||
raise RuntimeError('attribute "format-version" is missing') | ||
|
||
if format != "HDF5::StripePy": | ||
raise RuntimeError(f'unrecognized file format: expected "HDF5::StripePy", found "{format}"') | ||
|
||
if format_version != 1: | ||
raise RuntimeError( | ||
f'unsupported file format version "{format_version}". At present only version 1 is supported' | ||
) | ||
except RuntimeError as e: | ||
raise RuntimeError( | ||
f'failed to validate input file "{path}": {e}: file is corrupt or was not generated by StripePy.' | ||
) | ||
|
||
|
||
def _read_chroms(h5: h5py.File) -> Dict[str, int]: | ||
try: | ||
return {chrom.decode("utf-8"): size for chrom, size in zip(h5["chroms/name"], h5["chroms/length"])} | ||
|
||
except Exception as e: | ||
raise RuntimeError(f"failed to read chromosomes from file: {e}") | ||
|
||
|
||
def _read_resolution(h5: h5py.File) -> int: | ||
try: | ||
resolution = int(h5.attrs["bin-size"]) | ||
if resolution > 0: | ||
return resolution | ||
raise RuntimeError(f"expected a positive integer, found {resolution}") | ||
except Exception as e: | ||
raise RuntimeError(f"failed to read resolution attribute from file: {e}") | ||
|
||
|
||
def _normalize_df(df: pd.DataFrame) -> pd.DataFrame: | ||
new_cols = [] | ||
for col in df.columns: | ||
new_cols.append(re.sub(r"\W+", "_", col.lower())) | ||
|
||
df.columns = new_cols | ||
with warnings.catch_warnings(action="ignore"): | ||
return df.convert_dtypes() | ||
|
||
|
||
def _read_stripes(h5: h5py.File, chrom: str) -> pd.DataFrame: | ||
try: | ||
geo_cols = h5[f"/{chrom}/stripes/LT/geo-descriptors"].attrs["col_names"] | ||
bio_cols = h5[f"/{chrom}/stripes/LT/bio-descriptors"].attrs["col_names"] | ||
|
||
df1 = pd.DataFrame(h5[f"/{chrom}/stripes/LT/geo-descriptors"][:], columns=geo_cols) | ||
df1["type"] = "lt" | ||
df2 = pd.DataFrame(h5[f"/{chrom}/stripes/UT/geo-descriptors"][:], columns=geo_cols) | ||
df2["type"] = "ut" | ||
|
||
df3 = pd.DataFrame(h5[f"/{chrom}/stripes/LT/bio-descriptors"][:], columns=bio_cols)[["relative change"]] | ||
df4 = pd.DataFrame(h5[f"/{chrom}/stripes/UT/bio-descriptors"][:], columns=bio_cols)[["relative change"]] | ||
|
||
df1 = pd.concat([_normalize_df(df1), _normalize_df(df3)], axis="columns") | ||
df2 = pd.concat([_normalize_df(df2), _normalize_df(df4)], axis="columns") | ||
|
||
return pd.concat([df1, df2]).set_index("seed").sort_index() | ||
except Exception as e: | ||
raise RuntimeError(f'failed to read stripes for chromosome "{chrom}": {e}') | ||
|
||
|
||
def _stripes_to_bedpe( | ||
df: pd.DataFrame, chrom: str, size: int, resolution: int, transpose_policy: Union[str, None] | ||
) -> pd.DataFrame: | ||
num_stripes = len(df) | ||
|
||
start1_pos = df["l_boundary"] * resolution | ||
end1_pos = np.minimum(df["r_boundary"] * resolution, size) | ||
start2_pos = np.minimum(df["u_boundary"] * resolution, size) | ||
end2_pos = np.minimum(df["d_boundary"] * resolution, size) | ||
|
||
if transpose_policy is not None: | ||
if transpose_policy == "transpose_to_ut": | ||
swap = df["type"] == "lt" | ||
elif transpose_policy == "transpose_to_lt": | ||
swap = df["type"] == "ut" | ||
else: | ||
raise NotImplementedError | ||
else: | ||
swap = None | ||
|
||
if swap is not None: | ||
start1_pos[swap], start2_pos[swap] = start2_pos[swap], start1_pos[swap] | ||
end1_pos[swap], end2_pos[swap] = end2_pos[swap], end1_pos[swap] | ||
|
||
return pd.DataFrame( | ||
{ | ||
"chrom1": [chrom] * num_stripes, | ||
"start1": start1_pos, | ||
"end1": end1_pos, | ||
"chrom2": [chrom] * num_stripes, | ||
"start2": start2_pos, | ||
"end2": end2_pos, | ||
} | ||
) | ||
|
||
|
||
def _dump_stripes(h5: h5py.File, chrom: str, size: int, resolution: int, cutoff: float, transpose_policy: str): | ||
if not chrom in h5: | ||
return | ||
|
||
df = _read_stripes(h5, chrom) | ||
df = df[df["relative_change"] >= cutoff] | ||
df = _stripes_to_bedpe(df, chrom, size, resolution, transpose_policy) | ||
|
||
df.to_csv(sys.stdout, sep="\t", index=False, header=False) | ||
|
||
|
||
def run( | ||
h5_file: pathlib.Path, | ||
relative_change_threshold: float, | ||
transform: Union[str, None], | ||
): | ||
_validate_input_file(h5_file) | ||
|
||
with h5py.File(h5_file) as h5: | ||
resolution = _read_resolution(h5) | ||
chroms = _read_chroms(h5) | ||
for chrom, size in chroms.items(): | ||
_dump_stripes(h5, chrom, size, resolution, relative_change_threshold, transform) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.