diff --git a/pyproject.toml b/pyproject.toml index bef36531..d280e36e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ requires-pyhton = ">=3.9" dependencies = [ "bioframe >0.7, <1", "h5py >3, <4", - "hictkpy >=0.0.5, <1", + "hictkpy[scipy] >=1, <2", "matplotlib >=3.8, <4", "numpy >=1.26, <2", "pandas >=2.0, <3", diff --git a/src/stripepy/main.py b/src/stripepy/main.py index bdad3188..b72b5273 100644 --- a/src/stripepy/main.py +++ b/src/stripepy/main.py @@ -40,14 +40,14 @@ def main(): configs_input, configs_thresholds, configs_output = cli.parse_args() # Data loading: - c, chr_starts, chr_ends, bp_lengths = others.cmap_loading(configs_input["contact-map"], configs_input["resolution"]) + f, chr_starts, chr_ends, bp_lengths = others.cmap_loading(configs_input["contact-map"], configs_input["resolution"]) # Remove existing folders: configs_output["output_folder"] = f"{configs_output['output_folder']}/{configs_input['resolution']}" IO.remove_and_create_folder(configs_output["output_folder"]) # Extract a list of tuples where each tuple is (index, chr), e.g. (2,'chr3'): - c_pairs = others.chromosomes_to_study(list(c.chromosomes().keys()), bp_lengths, MIN_SIZE_CHROMOSOME) + c_pairs = others.chromosomes_to_study(list(f.chromosomes().keys()), bp_lengths, MIN_SIZE_CHROMOSOME) # Create HDF5 file to store candidate stripes: hf = h5py.File(f"{configs_output['output_folder']}/results.hdf5", "w") @@ -73,11 +73,7 @@ def main(): if configs_input["roi"] is not None: IO.create_folders_for_plots(f"{configs_output['output_folder']}/plots/{this_chr}") - # Extract current chromosome (only the upper-triangular part is stored by hictkpy!): - I = c.fetch(this_chr).to_coo().tolil() - I += I.T - I.setdiag(I.diagonal() / 2) - I = I.tocsr() + I = f.fetch(this_chr).to_csr("full") # RoI: RoI = others.define_RoI( diff --git a/src/stripepy/others.py b/src/stripepy/others.py index dec37c15..a5ca8cb3 100644 --- a/src/stripepy/others.py +++ b/src/stripepy/others.py @@ -7,30 +7,37 @@ from . import IO -def cmap_loading(path, resolution): - # Retrieve metadata: - file_name, file_ext = os.path.splitext(path) - file_format = file_ext.lower()[1:] # Remove the dot from the extension and convert to lowercase - - if file_format == "cool" or file_format == "hic": - c = hictkpy.File(path, resolution) - elif file_format == "mcool": - c = hictkpy.File(f"{path}::/resolutions/{resolution}", resolution) - else: - raise ValueError("Unsupported file format: " + file_format) +def cmap_loading(path: os.PathLike, resolution: int): + try: + if not isinstance(resolution, int): + raise TypeError("resolution must be an integer.") + + if resolution <= 0: + raise ValueError("resolution must be greater than zero.") + + if hictkpy.is_scool_file(path): + raise RuntimeError(".scool files are not currently supported.") + if hictkpy.is_cooler(path): + f = hictkpy.File(path) + if f.resolution() != resolution: + raise RuntimeError(f"expected {resolution} resolution, found {f.resolution()}.") + else: + f = hictkpy.MultiResFile(path)[resolution] + except RuntimeError as e: + raise RuntimeError(f'error opening file "{path}"') from e # Retrieve metadata: chr_starts = [0] # left ends of each chromosome (in matrix coordinates) chr_ends = [] # right ends of each chromosome (in matrix coordinates) chr_sizes = [] # integer bp lengths, one per chromosome - for _, bp_length in c.chromosomes().items(): + for bp_length in f.chromosomes().values(): chr_sizes.append(bp_length) chr_ends.append(chr_starts[-1] + int(np.ceil(bp_length / resolution))) chr_starts.append(chr_ends[-1]) # print(f"{chr_starts[-2]}-{chr_ends[-1]}-{chr_sizes[-1]}") chr_starts.pop(-1) - return c, chr_starts, chr_ends, chr_sizes + return f, chr_starts, chr_ends, chr_sizes def chromosomes_to_study(chromosomes, length_in_bp, min_size_allowed): diff --git a/src/stripepy/utils/evaluate.py b/src/stripepy/utils/evaluate.py index e393018c..14772fe9 100644 --- a/src/stripepy/utils/evaluate.py +++ b/src/stripepy/utils/evaluate.py @@ -976,7 +976,7 @@ def recoverable_anchors_recognition( path2GT = f"{base_path}/MoDLE-benchmark/" file_name = f"{file_name_base}_{contact_density}_{noise}" path2mcool = f"{base_path}/MoDLE-benchmark/data/{file_name}/" - c = hictkpy.File(f"{path2mcool}{file_name}.mcool::resolutions/{resolution}", resolution) + c = hictkpy.File(f"{path2mcool}{file_name}.mcool", resolution) c_names = list(c.chromosomes().keys()) c_ids = list(range(len(c_names))) c_pairs = list(zip(c_ids, c_names)) @@ -1116,7 +1116,7 @@ def recoverable_anchors_classification( path2GT = f"{base_path}/MoDLE-benchmark/" file_name = f"{file_name_base}_{contact_density}_{noise}" path2mcool = f"{base_path}/MoDLE-benchmark/data/{file_name}/" - c = hictkpy.File(f"{path2mcool}{file_name}.mcool::resolutions/{resolution}", resolution) + c = hictkpy.File(f"{path2mcool}{file_name}.mcool", resolution) c_names = list(c.chromosomes().keys()) c_ids = list(range(len(c_names))) c_pairs = list(zip(c_ids, c_names)) diff --git a/test/test_others.py b/test/test_others.py index c99797e5..c28bcfdd 100644 --- a/test/test_others.py +++ b/test/test_others.py @@ -31,7 +31,6 @@ def _generate_singleres_test_file( if chromosomes is None: chromosomes = _generate_chromosomes() - # TODO remove str() after upgrading to hictkpy v1.0.0 writer = htk.cooler.FileWriter(str(path), resolution=resolution, chromosomes=chromosomes) writer.add_pixels(pd.DataFrame({"bin1_id": [0], "bin2_id": [0], "count": 1})) writer.finalize() @@ -48,30 +47,33 @@ def _discretize(v: List[int], factor: int) -> List[int]: class TestCmapLoading: - # TODO revise error messages def test_invalid_paths(self, tmpdir): tmpdir = pathlib.Path(tmpdir) - with pytest.raises(Exception, match="Unsupported file format:"): + with pytest.raises(RuntimeError): cmap_loading(tmpdir / "foo", 5000) def test_invalid_formats(self, tmpdir): invalid_file = pathlib.Path(__file__).resolve() folder = pathlib.Path(tmpdir) - with pytest.raises(Exception, match="Unsupported file format:"): + with pytest.raises(RuntimeError): cmap_loading(invalid_file, 5000) - with pytest.raises(Exception, match="Unsupported file format:"): + with pytest.raises(RuntimeError): cmap_loading(folder, 5000) def test_invalid_resolutions(self, tmpdir): - invalid_resolutions = [-1, 0, 1000.0, 5000] with tempfile.NamedTemporaryFile(dir=tmpdir) as clr: clr.close() path_to_clr = _generate_singleres_test_file(pathlib.Path(clr.name), 1000) - for res in invalid_resolutions: - with pytest.raises(Exception): - cmap_loading(path_to_clr, res) + with pytest.raises(RuntimeError): + cmap_loading(path_to_clr, 5000) + with pytest.raises(TypeError, match="must be an integer"): + cmap_loading(path_to_clr, 1000.0) + with pytest.raises(ValueError, match="must be greater than zero"): + cmap_loading(path_to_clr, -1) + with pytest.raises(ValueError, match="must be greater than zero"): + cmap_loading(path_to_clr, 0) def test_valid_files(self, tmpdir): tmpdir = pathlib.Path(tmpdir) @@ -79,8 +81,7 @@ def test_valid_files(self, tmpdir): resolution = 1000 path_to_clr = _generate_singleres_test_file(tmpdir / "test.cool", resolution, chromosomes) - # TODO remove str() after upgrading to hictkpy v1.0.0 - f, starts, ends, sizes = cmap_loading(str(path_to_clr), resolution) + f, starts, ends, sizes = cmap_loading(path_to_clr, resolution) assert f.resolution() == 1000 assert len(starts) == len(chromosomes) assert len(starts) == len(ends) @@ -94,6 +95,5 @@ def test_valid_files(self, tmpdir): assert ends == _discretize([100_000, 150_000, 160_000], resolution) assert sizes == list(chromosomes.values()) - # TODO enable after merging https://github.com/paulsengroup/StripePy/pull/4 - # with pytest.raises(Exception): - # cmap_loading(path_to_clr, resolution + 1) + with pytest.raises(Exception): + cmap_loading(path_to_clr, resolution + 1)