Skip to content

Commit 260bf9e

Browse files
authored
Merge pull request #112 from mirnylab/develop
Chunksize for insulation and fix expected by region
2 parents 78fc921 + 5e5580a commit 260bf9e

File tree

3 files changed

+65
-59
lines changed

3 files changed

+65
-59
lines changed

cooltools/cli/diamond_insulation.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from . import cli
66
from .. import insulation
77

8+
89
@cli.command()
910
@click.argument(
1011
"in_path",
@@ -19,7 +20,7 @@
1920
@click.option(
2021
'--ignore-diags',
2122
help='The number of diagonals to ignore. By default, equals'
22-
' the number of diagonals ignored during IC balancing.',
23+
' the number of diagonals ignored during IC balancing.',
2324
type=int,
2425
default=None,
2526
show_default=True)
@@ -46,20 +47,25 @@
4647
help='Append columns with raw scores (sum_counts, sum_balanced, n_pixels) '
4748
'to the output table.',
4849
is_flag=True)
50+
@click.option(
51+
'--chunksize',
52+
help='',
53+
type=int,
54+
default=20000000,
55+
show_default=True)
4956
@click.option(
5057
'--verbose',
5158
help='Report real-time progress.',
5259
is_flag=True)
53-
54-
55-
def diamond_insulation(in_path, window, ignore_diags, min_frac_valid_pixels,
56-
min_dist_bad_bin, window_pixels, append_raw_scores, verbose):
60+
def diamond_insulation(in_path, window, ignore_diags, min_frac_valid_pixels,
61+
min_dist_bad_bin, window_pixels, append_raw_scores,
62+
chunksize, verbose):
5763
"""
5864
Calculate the diamond insulation scores and call insulating boundaries.
5965
6066
IN_PATH : The paths to a .cool file with a balanced Hi-C map.
6167
62-
WINDOW : The window size for the insulation score calculations.
68+
WINDOW : The window size for the insulation score calculations.
6369
Multiple space-separated values can be provided.
6470
By default, the window size must be provided in units of bp.
6571
When the flag --window-pixels is set, the window sizes must
@@ -69,22 +75,19 @@ def diamond_insulation(in_path, window, ignore_diags, min_frac_valid_pixels,
6975
clr = cooler.Cooler(in_path)
7076
if window_pixels:
7177
window = [win*clr.info['bin-size'] for win in window]
72-
73-
78+
7479
ins_table = insulation.calculate_insulation_score(
7580
clr,
7681
window_bp=window,
7782
ignore_diags=ignore_diags,
7883
append_raw_scores=append_raw_scores,
84+
chunksize=chunksize,
7985
verbose=verbose)
80-
81-
86+
8287
ins_table = insulation.find_boundaries(
8388
ins_table,
8489
min_frac_valid_pixels=min_frac_valid_pixels,
8590
min_dist_bad_bin=min_dist_bad_bin,
8691
)
8792

88-
8993
print(ins_table.to_csv(sep='\t', index=False, na_rep='nan'))
90-

cooltools/expected.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -558,16 +558,19 @@ def _diagsum_symm(clr, fields, transforms, supports, span):
558558
pixels = clr.pixels()[lo:hi]
559559
pixels = cooler.annotate(pixels, bins, replace=False)
560560

561-
pixels = pixels[pixels['chrom1'] == pixels['chrom2']].copy()
561+
pixels['support1'] = assign_supports(pixels, supports, suffix='1')
562+
pixels['support2'] = assign_supports(pixels, supports, suffix='2')
563+
pixels = pixels[pixels['support1'] == pixels['support2']].copy()
564+
562565
pixels['diag'] = pixels['bin2_id'] - pixels['bin1_id']
563566
for field, t in transforms.items():
564567
pixels[field] = t(pixels)
565568

566-
pixels['support'] = assign_supports(pixels, supports, suffix='1')
567-
568-
pixel_groups = dict(iter(pixels.groupby('support')))
569-
return {int(i): group.groupby('diag')[fields].sum()
570-
for i, group in pixel_groups.items()}
569+
pixelgroups = dict(iter(pixels.groupby('support1')))
570+
return {
571+
int(i): group.groupby('diag')[fields].sum()
572+
for i, group in pixelgroups.items()
573+
}
571574

572575

573576
def _diagsum_asymm(clr, fields, transforms, contact_type, supports1, supports2, span):
@@ -586,7 +589,7 @@ def _diagsum_asymm(clr, fields, transforms, contact_type, supports1, supports2,
586589
pixels[field] = t(pixels)
587590

588591
pixels['support1'] = assign_supports(pixels, supports1, suffix='1')
589-
pixels['support2'] = assign_supports(pixels, supports1, suffix='2')
592+
pixels['support2'] = assign_supports(pixels, supports2, suffix='2')
590593

591594
pixel_groups = dict(iter(pixels.groupby(['support1', 'support2'])))
592595
return {(int(i), int(j)): group.groupby('diag')[fields].sum()

cooltools/insulation.py

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
def get_n_pixels(bad_bin_mask, window=10, ignore_diags=2):
1414
"""
1515
Calculate the number of "good" pixels in a diamond at each bin.
16-
16+
1717
"""
1818
N = len(bad_bin_mask)
1919
n_pixels = np.zeros(N)
@@ -38,7 +38,7 @@ def get_n_pixels(bad_bin_mask, window=10, ignore_diags=2):
3838
return n_pixels
3939

4040

41-
def insul_diamond(pixel_query, bins, window=10, ignore_diags=2,
41+
def insul_diamond(pixel_query, bins, window=10, ignore_diags=2,
4242
norm_by_median=True):
4343
"""
4444
Calculates the insulation score of a Hi-C interaction matrix.
@@ -66,21 +66,20 @@ def insul_diamond(pixel_query, bins, window=10, ignore_diags=2,
6666
N = hi_bin_id - lo_bin_id
6767
sum_counts = np.zeros(N)
6868
sum_balanced = np.zeros(N)
69-
70-
n_pixels = get_n_pixels(bins.weight.isnull().values,
71-
window=window,
69+
70+
n_pixels = get_n_pixels(bins.weight.isnull().values,
71+
window=window,
7272
ignore_diags=ignore_diags)
73-
74-
73+
7574
for chunk_dict in pixel_query.read_chunked():
7675
chunk = pd.DataFrame(chunk_dict, columns=[
7776
'bin1_id', 'bin2_id', 'count'])
7877
diag_pixels = chunk[chunk.bin2_id - chunk.bin1_id <= (window - 1) * 2]
79-
78+
8079
diag_pixels = cooler.annotate(diag_pixels, bins[['weight']])
8180
diag_pixels['balanced'] = (
82-
diag_pixels['count']
83-
* diag_pixels['weight1']
81+
diag_pixels['count']
82+
* diag_pixels['weight1']
8483
* diag_pixels['weight2']
8584
)
8685
valid_pixel_mask = ~diag_pixels['balanced'].isnull().values
@@ -92,18 +91,18 @@ def insul_diamond(pixel_query, bins, window=10, ignore_diags=2,
9291
for j_shift in range(0, window):
9392
if i_shift+j_shift < ignore_diags:
9493
continue
95-
94+
9695
mask = ((i + i_shift == j - j_shift) &
9796
(i + i_shift < N) & (j - j_shift >= 0))
9897

9998
sum_counts += np.bincount(
10099
i[mask] + i_shift,
101-
diag_pixels['count'].values[mask],
100+
diag_pixels['count'].values[mask],
102101
minlength=N)
103-
102+
104103
sum_balanced += np.bincount(
105-
i[mask & valid_pixel_mask] + i_shift,
106-
diag_pixels['balanced'].values[mask & valid_pixel_mask],
104+
i[mask & valid_pixel_mask] + i_shift,
105+
diag_pixels['balanced'].values[mask & valid_pixel_mask],
107106
minlength=N)
108107

109108
with warnings.catch_warnings():
@@ -123,6 +122,7 @@ def calculate_insulation_score(
123122
ignore_diags=None,
124123
chromosomes=None,
125124
append_raw_scores=False,
125+
chunksize=20000000,
126126
verbose=False,
127127
):
128128
'''Calculate the diamond insulation scores and call insulating boundaries.
@@ -142,7 +142,7 @@ def calculate_insulation_score(
142142
to the output table.
143143
verbose : bool
144144
If True, report real-time progress.
145-
145+
146146
Returns
147147
-------
148148
ins_table : pandas.DataFrame
@@ -174,13 +174,13 @@ def calculate_insulation_score(
174174
clr.open('r'),
175175
shape=(nbins, nbins),
176176
field='count',
177-
chunksize=10000000)
177+
chunksize=chunksize)
178178

179179
ins_chrom_tables = []
180180
for chrom in chromosomes:
181181
if verbose:
182182
logging.info('Processing {}'.format(chrom))
183-
183+
184184
chrom_bins = clr.bins().fetch(chrom)
185185
ins_chrom = chrom_bins[['chrom', 'start', 'end']].copy()
186186
ins_chrom['is_bad_bin'] = chrom_bins['weight'].isnull()
@@ -204,7 +204,7 @@ def calculate_insulation_score(
204204

205205
ins_chrom['log2_insulation_score_{}'.format(window_bp[j])] = ins_track
206206
ins_chrom['n_valid_pixels_{}'.format(window_bp[j])] = n_pixels
207-
207+
208208
if append_raw_scores:
209209
ins_chrom['sum_counts_{}'.format(window_bp[j])] = sum_counts
210210
ins_chrom['sum_balanced_{}'.format(window_bp[j])] = sum_balanced
@@ -219,44 +219,44 @@ def find_boundaries(
219219
ins_table,
220220
min_frac_valid_pixels=0.66,
221221
min_dist_bad_bin=0,
222-
log2_ins_key = 'log2_insulation_score_{WINDOW}',
223-
n_valid_pixels_key = 'n_valid_pixels_{WINDOW}',
224-
is_bad_bin_key = 'is_bad_bin'
225-
222+
log2_ins_key='log2_insulation_score_{WINDOW}',
223+
n_valid_pixels_key='n_valid_pixels_{WINDOW}',
224+
is_bad_bin_key='is_bad_bin'
225+
226226
):
227227
'''Call insulating boundaries.
228-
Find all local minima of the log2(insulation score) and calculate their
228+
Find all local minima of the log2(insulation score) and calculate their
229229
chromosome-wide topographic prominence.
230-
230+
231231
Parameters
232232
----------
233233
ins_table : pandas.DataFrame
234234
A bin table with columns containing log2(insulation score),
235-
the number of valid pixels per diamond and (optionally) the mask
235+
the number of valid pixels per diamond and (optionally) the mask
236236
of bad bins.
237237
min_frac_valid_pixels : float
238-
The minimal fraction of valid pixels in a diamond to be used in
238+
The minimal fraction of valid pixels in a diamond to be used in
239239
boundary picking and prominence calculation.
240240
min_dist_bad_bin : int
241-
The minimal allowed distance to a bad bin.
241+
The minimal allowed distance to a bad bin.
242242
Ignore bins that have a bad bin closer than this distance.
243243
log2_ins_key, n_valid_pixels_key : str
244244
The names of the columns containing log2_insulation_score and
245245
the number of valid pixels per diamond. When a template
246-
containing `{WINDOW}` is provided, the calculation is repeated
246+
containing `{WINDOW}` is provided, the calculation is repeated
247247
for all pairs of columns matching the template.
248-
248+
249249
Returns
250250
-------
251251
ins_table : pandas.DataFrame
252252
A bin table with appended columns with boundary prominences.
253253
'''
254-
254+
255255
if min_dist_bad_bin:
256256
ins_table = pd.concat([
257257
df.assign(dist_bad_bin = numutils.dist_to_mask(df.is_bad_bin))
258258
for chrom,df in ins_table.groupby('chrom')])
259-
259+
260260
if '{WINDOW}' in log2_ins_key:
261261
windows = set()
262262
for col in ins_table.columns:
@@ -265,33 +265,33 @@ def find_boundaries(
265265
windows.add(int(m.groups()[0]))
266266
else:
267267
windows = set([None])
268-
268+
269269
min_valid_pixels = {
270270
win:ins_table[n_valid_pixels_key.format(WINDOW=win)].max()*min_frac_valid_pixels
271271
for win in windows}
272-
272+
273273
dfs = []
274274
for chrom, df in ins_table.groupby('chrom'):
275275
df = df.reset_index(drop=True)
276276
for win in windows:
277277
mask = (df[n_valid_pixels_key.format(WINDOW=win)].values >= min_valid_pixels[win])
278-
278+
279279
if min_dist_bad_bin:
280280
mask &= (df.dist_bad_bin.values >= min_dist_bad_bin)
281-
281+
282282
ins_track = df[log2_ins_key.format(WINDOW=win)].values[mask]
283283
poss, proms = peaks.find_peak_prominence(-ins_track)
284284
ins_prom_track = np.zeros_like(ins_track) * np.nan
285285
ins_prom_track[poss] = proms
286-
286+
287287
if win is not None:
288-
bs_key = 'boundary_strength_{win}'.format(win=win)
288+
bs_key = 'boundary_strength_{win}'.format(win=win)
289289
else:
290290
bs_key = 'boundary_strength'
291-
291+
292292
df[bs_key] = np.nan
293293
df.loc[mask, bs_key] = ins_prom_track
294-
294+
295295
dfs.append(df)
296296
return pd.concat(dfs)
297297

0 commit comments

Comments
 (0)