Skip to content

Commit 1efa259

Browse files
authored
Merge pull request #448 from open2c/rearrange_cooler
New tool: rearrange cooler
2 parents 950b62a + 9ebd5d4 commit 1efa259

File tree

7 files changed

+389
-222
lines changed

7 files changed

+389
-222
lines changed

cooltools/api/rearrange.py

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
import bioframe as bf
2+
import cooler
3+
import numpy as np
4+
import pandas as pd
5+
6+
from ..lib.checks import is_compatible_viewframe
7+
import logging
8+
9+
logging.basicConfig(level=logging.INFO)
10+
11+
12+
def _generate_adjusted_chunks(
13+
clr, binmapping, chunksize=1_000_000, orientation_col="strand", nproc=1
14+
):
15+
"""Generates chunks of pixels from the cooler and adjusts their bin IDs to follow the view."""
16+
npixels = clr.pixels().shape[0]
17+
chunks = np.append(np.arange(0, npixels, chunksize), npixels)
18+
chunks = list(zip(chunks[:-1], chunks[1:]))
19+
20+
# Running this loop in parallel slows the function down
21+
for i0, i1 in chunks:
22+
chunk = clr.pixels()[i0:i1]
23+
chunk["bin1_id"] = chunk["bin1_id"].map(binmapping)
24+
chunk["bin2_id"] = chunk["bin2_id"].map(binmapping)
25+
chunk = chunk[(chunk["bin1_id"] != -1) & (chunk["bin2_id"] != -1)]
26+
if chunk.shape[0] > 0:
27+
chunk[["bin1_id", "bin2_id"]] = np.sort(
28+
chunk[["bin1_id", "bin2_id"]].astype(int)
29+
)
30+
yield chunk.reset_index(drop=True)
31+
logging.info(f"Processed {i1/npixels*100:.2f}% pixels")
32+
33+
34+
def _adjust_start_end(chromdf):
35+
chromdf["end"] = chromdf["length"].cumsum()
36+
chromdf["start"] = chromdf["end"] - chromdf["length"]
37+
return chromdf
38+
39+
40+
def _flip_bins(regdf):
41+
regdf = regdf.iloc[::-1].reset_index(drop=True)
42+
l = regdf["end"] - regdf["start"]
43+
regdf["start"] = regdf["end"].max() - regdf["end"]
44+
regdf["end"] = regdf["start"] + l
45+
return regdf
46+
47+
48+
def rearrange_bins(
49+
bins_old, view_df, new_chrom_col="new_chrom", orientation_col="strand"
50+
):
51+
"""
52+
Rearranges the input `bins_old` based on the information in the `view_df` DataFrame.
53+
54+
Parameters
55+
----------
56+
bins_old : bintable
57+
The original bintable to rearrange.
58+
view_df : viewframe
59+
Viewframe with new order of genomic regions. Can have an additional column for
60+
the new chromosome name (`new_chrom_col`), and an additional column for the
61+
strand orientation (`orientation_col`, '-' will invert the region).
62+
new_chrom_col : str, optional
63+
Column name in the view_df specifying new chromosome name for each region,
64+
by default 'new_chrom'. If None, then the default chrom column names will be used.
65+
orientation_col : str, optional
66+
Column name in the view_df specifying strand orientation of each region,
67+
by default 'strand'. The values in this column can be "+" or "-".
68+
If None, then all will be assumed "+".
69+
70+
71+
Returns
72+
-------
73+
bins_new : bintable
74+
The rearranged bintagle with the new chromosome names and orientations.
75+
bin_mapping : dict
76+
Mapping of original bin IDs to new bin IDs
77+
"""
78+
chromdict = dict(zip(view_df["name"].to_numpy(), view_df[new_chrom_col].to_numpy()))
79+
flipdict = dict(
80+
zip(view_df["name"].to_numpy(), (view_df[orientation_col] == "-").to_numpy())
81+
)
82+
bins_old = bins_old.reset_index(names=["old_id"])
83+
bins_subset = bf.assign_view(bins_old, view_df, drop_unassigned=False).dropna(
84+
subset=["view_region"]
85+
)
86+
bins_inverted = (
87+
bins_subset.groupby("view_region", group_keys=False)
88+
.apply(lambda x: _flip_bins(x) if flipdict[x.name] else x)
89+
.reset_index(drop=True)
90+
)
91+
bins_new = bf.sort_bedframe(
92+
bins_inverted,
93+
view_df=view_df,
94+
df_view_col="view_region",
95+
)
96+
bins_new["chrom"] = bins_new["view_region"].map(chromdict)
97+
bins_new["length"] = bins_new["end"] - bins_new["start"]
98+
bins_new = (
99+
bins_new.groupby("chrom", group_keys=False)
100+
.apply(_adjust_start_end)
101+
.drop(columns=["length", "view_region"])
102+
)
103+
logging.info("Rearranged bins")
104+
bin_mapping = {old_id: -1 for old_id in bins_old["old_id"].astype(int)}
105+
bin_mapping.update(
106+
{
107+
old_id: new_id
108+
for old_id, new_id in zip(bins_new["old_id"].astype(int), bins_new.index)
109+
}
110+
)
111+
logging.info("Created bin mapping")
112+
bins_new = bins_new.drop(columns=["old_id"])
113+
return bins_new, bin_mapping
114+
115+
116+
def rearrange_cooler(
117+
clr,
118+
view_df,
119+
out_cooler,
120+
new_chrom_col="new_chrom",
121+
orientation_col="strand",
122+
assembly=None,
123+
chunksize=10_000_000,
124+
mode="w",
125+
):
126+
"""Reorder cooler following a genomic view.
127+
128+
Parameters
129+
----------
130+
clr : cooler.Cooler
131+
Cooler object
132+
view_df : viewframe
133+
Viewframe with new order of genomic regions. Can have an additional column for
134+
the new chromosome name (`new_chrom_col`), and an additional column for the
135+
strand orientation (`orientation_col`, '-' will invert the region).
136+
out_cooler : str
137+
File path to save the reordered data
138+
new_chrom_col : str, optional
139+
Column name in the view_df specifying new chromosome name for each region,
140+
by default 'new_chrom'. If None, then the default chrom column names will be used.
141+
orientation_col : str, optional
142+
Column name in the view_df specifying strand orientation of each region,
143+
by default 'strand'. The values in this column can be "+" or "-".
144+
If None, then all will be assumed "+".
145+
assembly : str, optional
146+
The name of the assembly for the new cooler. If None, uses the same as in the
147+
original cooler.
148+
chunksize : int, optional
149+
The number of pixels loaded and processed per step of computation.
150+
mode : str, optional
151+
(w)rite or (a)ppend to the output cooler file. Default: w
152+
"""
153+
154+
view_df = view_df.copy()
155+
try:
156+
_ = is_compatible_viewframe(
157+
view_df[["chrom", "start", "end", "name"]],
158+
clr,
159+
check_sorting=False,
160+
raise_errors=True,
161+
)
162+
except Exception as e:
163+
raise ValueError("view_df is not a valid viewframe or incompatible") from e
164+
165+
if assembly is None:
166+
assembly = clr.info["genome-assembly"]
167+
168+
# Add repeated entries for new chromosome names if they were not requested/absent:
169+
if new_chrom_col is None:
170+
new_chrom_col = "new_chrom"
171+
if new_chrom_col in view_df.columns:
172+
logging.warn(
173+
"new_chrom_col is not provided, but new_chrom column exists."
174+
" Pre-existing new_chrom column will not be used."
175+
)
176+
while new_chrom_col in view_df.columns:
177+
new_chrom_col = (
178+
f"new_chrom_{np.random.randint(0, np.iinfo(np.int32).max)}"
179+
)
180+
if new_chrom_col not in view_df.columns:
181+
view_df.loc[:, new_chrom_col] = view_df["chrom"]
182+
183+
# Add repeated entries for strand orientation of chromosomes if they were not requested/absent:
184+
if orientation_col is None:
185+
orientation_col = "strand"
186+
if orientation_col in view_df.columns:
187+
logging.warn(
188+
"orientation_col is not provided, but strand column exists."
189+
" Pre-existing strand column will not be used."
190+
)
191+
while orientation_col in view_df.columns:
192+
orientation_col = f"strand_{np.random.randint(0, np.iinfo(np.int32).max)}"
193+
if orientation_col not in view_df.columns:
194+
view_df.loc[:, orientation_col] = "+"
195+
196+
if not np.all(
197+
view_df.groupby(new_chrom_col).apply(lambda x: np.all(np.diff(x.index) == 1))
198+
):
199+
raise ValueError("New chromosomes are not consecutive")
200+
bins_old = clr.bins()[:]
201+
# Creating new bin table
202+
bins_new, bin_mapping = rearrange_bins(
203+
bins_old, view_df, new_chrom_col=new_chrom_col, orientation_col=orientation_col
204+
)
205+
logging.info("Creating a new cooler")
206+
cooler.create_cooler(
207+
out_cooler,
208+
bins_new,
209+
_generate_adjusted_chunks(
210+
clr,
211+
bin_mapping,
212+
chunksize=chunksize,
213+
),
214+
assembly=assembly,
215+
mode=mode,
216+
mergebuf=int(1e9),
217+
)
218+
logging.info(f"Created a new cooler at {out_cooler}")

cooltools/cli/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,4 +57,5 @@ def _excepthook(exc_type, value, tb):
5757
sample,
5858
coverage,
5959
virtual4c,
60+
rearrange,
6061
)

cooltools/cli/insulation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def insulation(
118118
"""
119119
Calculate the diamond insulation scores and call insulating boundaries.
120120
121-
IN_PATH : The paths to a .cool file with a balanced Hi-C map.
121+
IN_PATH : The path to a .cool file with a balanced Hi-C map.
122122
123123
WINDOW : The window size for the insulation score calculations.
124124
Multiple space-separated values can be provided.

cooltools/cli/rearrange.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
import click
2+
import cooler
3+
import pandas as pd
4+
5+
from .. import api
6+
from . import cli
7+
from .util import sniff_for_header
8+
9+
10+
@cli.command()
11+
@click.argument("in_path", metavar="IN_PATH", type=str, nargs=1)
12+
@click.argument("out_path", metavar="OUT_PATH", type=str, nargs=1)
13+
@click.option(
14+
"--view",
15+
help="Path to a BED-like file which defines which regions of the chromosomes to use"
16+
" and in what order. Using --new-chrom-col and --orientation-col you can specify the"
17+
" new chromosome names and whether to invert each region (optional)",
18+
default=None,
19+
required=True,
20+
type=str,
21+
)
22+
@click.option(
23+
"--new-chrom-col",
24+
help="Column name in the view with new chromosome names."
25+
" If not provided and there is no column named 'new_chrom' in the view file, uses"
26+
" original chromosome names",
27+
default=None,
28+
type=str,
29+
)
30+
@click.option(
31+
"--orientation-col",
32+
help="Columns name in the view with orientations of each region (+ or -)."
33+
" If not providedand there is no column named 'strand' in the view file, assumes"
34+
" all are forward oriented",
35+
default=None,
36+
type=str,
37+
)
38+
@click.option(
39+
"--assembly",
40+
help="The name of the assembly for the new cooler. If None, uses the same as in the"
41+
" original cooler.",
42+
default=None,
43+
type=str,
44+
)
45+
@click.option(
46+
"--chunksize",
47+
help="The number of pixels loaded and processed per step of computation.",
48+
type=int,
49+
default=int(1e7),
50+
show_default=True,
51+
)
52+
@click.option(
53+
"--mode",
54+
help="(w)rite or (a)ppend to the output file (default: w)",
55+
default="w",
56+
type=click.Choice(["w", "a"], case_sensitive=False),
57+
)
58+
def rearrange(
59+
in_path, out_path, view, new_chrom_col, orientation_col, assembly, chunksize, mode
60+
):
61+
"""Rearrange data from a cooler according to a new genomic view
62+
63+
Parameters
64+
----------
65+
IN_PATH : str
66+
.cool file (or URI) with data to rearrange.
67+
OUT_PATH : str
68+
.cool file (or URI) to save the rearrange data.
69+
view : str
70+
Path to a BED-like file which defines which regions of the chromosomes to use
71+
and in what order. Has to be a valid viewframe (columns corresponding to region
72+
coordinates followed by the region name), with potential additional columns.
73+
Using --new-chrom-col and --orientation-col you can specify the new chromosome
74+
names and whether to invert each region (optional).
75+
If has no header with column names, assumes the `new-chrom-col` is the fifth
76+
column and `--orientation-col` is the sixth, if they exist.
77+
new_chrom_col : str
78+
Column name in the view with new chromosome names.
79+
If not provided and there is no column named 'new_chrom' in the view file, uses
80+
original chromosome names.
81+
orientation_col : str
82+
Columns name in the view with orientations of each region (+ or -). - means the
83+
region will be inverted.
84+
If not providedand there is no column named 'strand' in the view file, assumes
85+
all are forward oriented.
86+
assembly : str
87+
The name of the assembly for the new cooler. If None, uses the same as in the
88+
original cooler.
89+
chunksize : int
90+
The number of pixels loaded and processed per step of computation.
91+
mode : str
92+
(w)rite or (a)ppend to the output file (default: w)
93+
"""
94+
clr = cooler.Cooler(in_path)
95+
default_names = ["chrom", "start", "end", "name", "new_chrom", "strand"]
96+
buf, names = sniff_for_header(view)
97+
if names is not None:
98+
# Simply take column names from the file
99+
view_df = pd.read_table(buf, header=0, sep="\t")
100+
else:
101+
# Use default names
102+
# If some are missing, pandas creates them with all NaNs
103+
view_df = pd.read_csv(buf, names=default_names, sep="\t")
104+
names = view_df.columns
105+
# If additinal column names not provided, set them to defaults
106+
# If additional columns are not in the view, raise
107+
if new_chrom_col is None:
108+
new_chrom_col = "new_chrom"
109+
elif new_chrom_col not in view_df.columns:
110+
raise ValueError(f"New chrom col {new_chrom_col} not found in view columns")
111+
if orientation_col is None:
112+
orientation_col = "strand"
113+
elif orientation_col not in view_df.columns:
114+
raise ValueError(f"Orientation col {orientation_col} not found in view columns")
115+
116+
# Fill NaNs in additional columns: if they were created here, will be filled with
117+
# default values. Allows not specifying default values in the file, i.e. only
118+
# regions that need to be inverted need to have "-" in orientation_col
119+
view_df[new_chrom_col] = view_df[new_chrom_col].fillna(view_df["chrom"])
120+
view_df[orientation_col] = view_df[orientation_col].fillna("+")
121+
api.rearrange.rearrange_cooler(
122+
clr,
123+
view_df,
124+
out_path,
125+
new_chrom_col=new_chrom_col,
126+
orientation_col=orientation_col,
127+
assembly=assembly,
128+
chunksize=chunksize,
129+
mode=mode,
130+
)

0 commit comments

Comments
 (0)