Skip to content

Commit

Permalink
Merge pull request #35 from robomics/feature/stripepy-view
Browse files Browse the repository at this point in the history
Initial implementation of stripepy view
  • Loading branch information
rea1991 authored Nov 27, 2024
2 parents ada4dd2 + 47f90a8 commit af01013
Show file tree
Hide file tree
Showing 6 changed files with 392 additions and 3 deletions.
6 changes: 5 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@ repos:
- --fix=lf
- id: trailing-whitespace
# Formatters should be run late so that they can re-format any prior changes.
- repo: https://github.com/kynan/nbstripout
rev: 0.8.1
hooks:
- id: nbstripout
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 24.8.0
hooks:
- id: black
- id: black-jupyter
args: ["--line-length", "120", "--target-version", "py39"]
- repo: https://github.com/pycqa/isort
rev: 5.13.2
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ test = [

dev = [
"stripepy[test]",
"black",
"black[jupyter]",
"isort",
"pre-commit",
]
Expand Down
36 changes: 36 additions & 0 deletions src/stripepy/cli/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,39 @@ def get_avail_ref_genomes():
return sc


def _make_stripepy_view_subcommand(main_parser) -> argparse.ArgumentParser:
sc: argparse.ArgumentParser = main_parser.add_parser(
"view",
help="Fetch stripes from the HDF5 file produced by stripepy call.",
)

sc.add_argument(
dest="h5_file",
metavar="h5-file",
type=_existing_file,
help="Path to the HDF5 file generated by stripepy call.",
)

sc.add_argument(
"--relative-change-threshold",
type=float,
default=5.0,
help="Cutoff for the relative change.\n"
"The relative change is computed as the ratio between the average number of interactions\n"
"found inside a stripe and the number of interactions in a neighborhood outside of the stripe.",
)

sc.add_argument(
"--transform",
type=str,
choices=["transpose_to_ut", "transpose_to_lt", None],
default=None,
help="Control if and how stripe coordinates should be transformed.",
)

return sc


def _make_cli() -> argparse.ArgumentParser:
cli = argparse.ArgumentParser(
description="stripepy is designed to recognize linear patterns in contact maps (.hic, .mcool, .cool) "
Expand All @@ -245,6 +278,7 @@ def _make_cli() -> argparse.ArgumentParser:

_make_stripepy_call_subcommand(sub_parser)
_make_stripepy_download_subcommand(sub_parser)
_make_stripepy_view_subcommand(sub_parser)

cli.add_argument(
"-v",
Expand Down Expand Up @@ -314,5 +348,7 @@ def parse_args() -> Tuple[str, Any]:
return subcommand, _process_stripepy_call_args(args)
if subcommand == "download":
return subcommand, args
if subcommand == "view":
return subcommand, args

raise NotImplementedError
151 changes: 151 additions & 0 deletions src/stripepy/cli/view.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# Copyright (C) 2024 Roberto Rossini <roberros@uio.no>
#
# SPDX-License-Identifier: MIT

import pathlib
import re
import sys
import warnings
from typing import Dict, Union

import h5py
import numpy as np
import pandas as pd


def _validate_input_file(path: pathlib.Path):
try:
with h5py.File(path) as h5:
format = h5.attrs.get("format")
format_version = h5.attrs.get("format-version")
except Exception as e:
raise RuntimeError(f'failed to open file "{path}" for reading: {e}')

try:
if format is None:
raise RuntimeError('attribute "format" is missing')

if format_version is None:
raise RuntimeError('attribute "format-version" is missing')

if format != "HDF5::StripePy":
raise RuntimeError(f'unrecognized file format: expected "HDF5::StripePy", found "{format}"')

if format_version != 1:
raise RuntimeError(
f'unsupported file format version "{format_version}". At present only version 1 is supported'
)
except RuntimeError as e:
raise RuntimeError(
f'failed to validate input file "{path}": {e}: file is corrupt or was not generated by StripePy.'
)


def _read_chroms(h5: h5py.File) -> Dict[str, int]:
try:
return {chrom.decode("utf-8"): size for chrom, size in zip(h5["chroms/name"], h5["chroms/length"])}

except Exception as e:
raise RuntimeError(f"failed to read chromosomes from file: {e}")


def _read_resolution(h5: h5py.File) -> int:
try:
resolution = int(h5.attrs["bin-size"])
if resolution > 0:
return resolution
raise RuntimeError(f"expected a positive integer, found {resolution}")
except Exception as e:
raise RuntimeError(f"failed to read resolution attribute from file: {e}")


def _normalize_df(df: pd.DataFrame) -> pd.DataFrame:
new_cols = []
for col in df.columns:
new_cols.append(re.sub(r"\W+", "_", col.lower()))

df.columns = new_cols
with warnings.catch_warnings(action="ignore"):
return df.convert_dtypes()


def _read_stripes(h5: h5py.File, chrom: str) -> pd.DataFrame:
try:
geo_cols = h5[f"/{chrom}/stripes/LT/geo-descriptors"].attrs["col_names"]
bio_cols = h5[f"/{chrom}/stripes/LT/bio-descriptors"].attrs["col_names"]

df1 = pd.DataFrame(h5[f"/{chrom}/stripes/LT/geo-descriptors"][:], columns=geo_cols)
df1["type"] = "lt"
df2 = pd.DataFrame(h5[f"/{chrom}/stripes/UT/geo-descriptors"][:], columns=geo_cols)
df2["type"] = "ut"

df3 = pd.DataFrame(h5[f"/{chrom}/stripes/LT/bio-descriptors"][:], columns=bio_cols)[["relative change"]]
df4 = pd.DataFrame(h5[f"/{chrom}/stripes/UT/bio-descriptors"][:], columns=bio_cols)[["relative change"]]

df1 = pd.concat([_normalize_df(df1), _normalize_df(df3)], axis="columns")
df2 = pd.concat([_normalize_df(df2), _normalize_df(df4)], axis="columns")

return pd.concat([df1, df2]).set_index("seed").sort_index()
except Exception as e:
raise RuntimeError(f'failed to read stripes for chromosome "{chrom}": {e}')


def _stripes_to_bedpe(
df: pd.DataFrame, chrom: str, size: int, resolution: int, transpose_policy: Union[str, None]
) -> pd.DataFrame:
num_stripes = len(df)

start1_pos = df["l_boundary"] * resolution
end1_pos = np.minimum(df["r_boundary"] * resolution, size)
start2_pos = np.minimum(df["u_boundary"] * resolution, size)
end2_pos = np.minimum(df["d_boundary"] * resolution, size)

if transpose_policy is not None:
if transpose_policy == "transpose_to_ut":
swap = df["type"] == "lt"
elif transpose_policy == "transpose_to_lt":
swap = df["type"] == "ut"
else:
raise NotImplementedError
else:
swap = None

if swap is not None:
start1_pos[swap], start2_pos[swap] = start2_pos[swap], start1_pos[swap]
end1_pos[swap], end2_pos[swap] = end2_pos[swap], end1_pos[swap]

return pd.DataFrame(
{
"chrom1": [chrom] * num_stripes,
"start1": start1_pos,
"end1": end1_pos,
"chrom2": [chrom] * num_stripes,
"start2": start2_pos,
"end2": end2_pos,
}
)


def _dump_stripes(h5: h5py.File, chrom: str, size: int, resolution: int, cutoff: float, transpose_policy: str):
if not chrom in h5:
return

df = _read_stripes(h5, chrom)
df = df[df["relative_change"] >= cutoff]
df = _stripes_to_bedpe(df, chrom, size, resolution, transpose_policy)

df.to_csv(sys.stdout, sep="\t", index=False, header=False)


def run(
h5_file: pathlib.Path,
relative_change_threshold: float,
transform: Union[str, None],
):
_validate_input_file(h5_file)

with h5py.File(h5_file) as h5:
resolution = _read_resolution(h5)
chroms = _read_chroms(h5)
for chrom, size in chroms.items():
_dump_stripes(h5, chrom, size, resolution, relative_change_threshold, transform)
4 changes: 3 additions & 1 deletion src/stripepy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import logging

from .cli import call, download, setup
from .cli import call, download, setup, view


def _setup_mpl_backend():
Expand All @@ -31,6 +31,8 @@ def main():
return call.run(**args)
if subcommand == "download":
return download.run(**args)
if subcommand == "view":
return view.run(**args)

raise NotImplementedError

Expand Down
Loading

0 comments on commit af01013

Please sign in to comment.