13
13
def get_n_pixels (bad_bin_mask , window = 10 , ignore_diags = 2 ):
14
14
"""
15
15
Calculate the number of "good" pixels in a diamond at each bin.
16
-
16
+
17
17
"""
18
18
N = len (bad_bin_mask )
19
19
n_pixels = np .zeros (N )
@@ -38,7 +38,7 @@ def get_n_pixels(bad_bin_mask, window=10, ignore_diags=2):
38
38
return n_pixels
39
39
40
40
41
- def insul_diamond (pixel_query , bins , window = 10 , ignore_diags = 2 ,
41
+ def insul_diamond (pixel_query , bins , window = 10 , ignore_diags = 2 ,
42
42
norm_by_median = True ):
43
43
"""
44
44
Calculates the insulation score of a Hi-C interaction matrix.
@@ -66,21 +66,20 @@ def insul_diamond(pixel_query, bins, window=10, ignore_diags=2,
66
66
N = hi_bin_id - lo_bin_id
67
67
sum_counts = np .zeros (N )
68
68
sum_balanced = np .zeros (N )
69
-
70
- n_pixels = get_n_pixels (bins .weight .isnull ().values ,
71
- window = window ,
69
+
70
+ n_pixels = get_n_pixels (bins .weight .isnull ().values ,
71
+ window = window ,
72
72
ignore_diags = ignore_diags )
73
-
74
-
73
+
75
74
for chunk_dict in pixel_query .read_chunked ():
76
75
chunk = pd .DataFrame (chunk_dict , columns = [
77
76
'bin1_id' , 'bin2_id' , 'count' ])
78
77
diag_pixels = chunk [chunk .bin2_id - chunk .bin1_id <= (window - 1 ) * 2 ]
79
-
78
+
80
79
diag_pixels = cooler .annotate (diag_pixels , bins [['weight' ]])
81
80
diag_pixels ['balanced' ] = (
82
- diag_pixels ['count' ]
83
- * diag_pixels ['weight1' ]
81
+ diag_pixels ['count' ]
82
+ * diag_pixels ['weight1' ]
84
83
* diag_pixels ['weight2' ]
85
84
)
86
85
valid_pixel_mask = ~ diag_pixels ['balanced' ].isnull ().values
@@ -92,18 +91,18 @@ def insul_diamond(pixel_query, bins, window=10, ignore_diags=2,
92
91
for j_shift in range (0 , window ):
93
92
if i_shift + j_shift < ignore_diags :
94
93
continue
95
-
94
+
96
95
mask = ((i + i_shift == j - j_shift ) &
97
96
(i + i_shift < N ) & (j - j_shift >= 0 ))
98
97
99
98
sum_counts += np .bincount (
100
99
i [mask ] + i_shift ,
101
- diag_pixels ['count' ].values [mask ],
100
+ diag_pixels ['count' ].values [mask ],
102
101
minlength = N )
103
-
102
+
104
103
sum_balanced += np .bincount (
105
- i [mask & valid_pixel_mask ] + i_shift ,
106
- diag_pixels ['balanced' ].values [mask & valid_pixel_mask ],
104
+ i [mask & valid_pixel_mask ] + i_shift ,
105
+ diag_pixels ['balanced' ].values [mask & valid_pixel_mask ],
107
106
minlength = N )
108
107
109
108
with warnings .catch_warnings ():
@@ -123,6 +122,7 @@ def calculate_insulation_score(
123
122
ignore_diags = None ,
124
123
chromosomes = None ,
125
124
append_raw_scores = False ,
125
+ chunksize = 20000000 ,
126
126
verbose = False ,
127
127
):
128
128
'''Calculate the diamond insulation scores and call insulating boundaries.
@@ -142,7 +142,7 @@ def calculate_insulation_score(
142
142
to the output table.
143
143
verbose : bool
144
144
If True, report real-time progress.
145
-
145
+
146
146
Returns
147
147
-------
148
148
ins_table : pandas.DataFrame
@@ -174,13 +174,13 @@ def calculate_insulation_score(
174
174
clr .open ('r' ),
175
175
shape = (nbins , nbins ),
176
176
field = 'count' ,
177
- chunksize = 10000000 )
177
+ chunksize = chunksize )
178
178
179
179
ins_chrom_tables = []
180
180
for chrom in chromosomes :
181
181
if verbose :
182
182
logging .info ('Processing {}' .format (chrom ))
183
-
183
+
184
184
chrom_bins = clr .bins ().fetch (chrom )
185
185
ins_chrom = chrom_bins [['chrom' , 'start' , 'end' ]].copy ()
186
186
ins_chrom ['is_bad_bin' ] = chrom_bins ['weight' ].isnull ()
@@ -204,7 +204,7 @@ def calculate_insulation_score(
204
204
205
205
ins_chrom ['log2_insulation_score_{}' .format (window_bp [j ])] = ins_track
206
206
ins_chrom ['n_valid_pixels_{}' .format (window_bp [j ])] = n_pixels
207
-
207
+
208
208
if append_raw_scores :
209
209
ins_chrom ['sum_counts_{}' .format (window_bp [j ])] = sum_counts
210
210
ins_chrom ['sum_balanced_{}' .format (window_bp [j ])] = sum_balanced
@@ -219,44 +219,44 @@ def find_boundaries(
219
219
ins_table ,
220
220
min_frac_valid_pixels = 0.66 ,
221
221
min_dist_bad_bin = 0 ,
222
- log2_ins_key = 'log2_insulation_score_{WINDOW}' ,
223
- n_valid_pixels_key = 'n_valid_pixels_{WINDOW}' ,
224
- is_bad_bin_key = 'is_bad_bin'
225
-
222
+ log2_ins_key = 'log2_insulation_score_{WINDOW}' ,
223
+ n_valid_pixels_key = 'n_valid_pixels_{WINDOW}' ,
224
+ is_bad_bin_key = 'is_bad_bin'
225
+
226
226
):
227
227
'''Call insulating boundaries.
228
- Find all local minima of the log2(insulation score) and calculate their
228
+ Find all local minima of the log2(insulation score) and calculate their
229
229
chromosome-wide topographic prominence.
230
-
230
+
231
231
Parameters
232
232
----------
233
233
ins_table : pandas.DataFrame
234
234
A bin table with columns containing log2(insulation score),
235
- the number of valid pixels per diamond and (optionally) the mask
235
+ the number of valid pixels per diamond and (optionally) the mask
236
236
of bad bins.
237
237
min_frac_valid_pixels : float
238
- The minimal fraction of valid pixels in a diamond to be used in
238
+ The minimal fraction of valid pixels in a diamond to be used in
239
239
boundary picking and prominence calculation.
240
240
min_dist_bad_bin : int
241
- The minimal allowed distance to a bad bin.
241
+ The minimal allowed distance to a bad bin.
242
242
Ignore bins that have a bad bin closer than this distance.
243
243
log2_ins_key, n_valid_pixels_key : str
244
244
The names of the columns containing log2_insulation_score and
245
245
the number of valid pixels per diamond. When a template
246
- containing `{WINDOW}` is provided, the calculation is repeated
246
+ containing `{WINDOW}` is provided, the calculation is repeated
247
247
for all pairs of columns matching the template.
248
-
248
+
249
249
Returns
250
250
-------
251
251
ins_table : pandas.DataFrame
252
252
A bin table with appended columns with boundary prominences.
253
253
'''
254
-
254
+
255
255
if min_dist_bad_bin :
256
256
ins_table = pd .concat ([
257
257
df .assign (dist_bad_bin = numutils .dist_to_mask (df .is_bad_bin ))
258
258
for chrom ,df in ins_table .groupby ('chrom' )])
259
-
259
+
260
260
if '{WINDOW}' in log2_ins_key :
261
261
windows = set ()
262
262
for col in ins_table .columns :
@@ -265,33 +265,33 @@ def find_boundaries(
265
265
windows .add (int (m .groups ()[0 ]))
266
266
else :
267
267
windows = set ([None ])
268
-
268
+
269
269
min_valid_pixels = {
270
270
win :ins_table [n_valid_pixels_key .format (WINDOW = win )].max ()* min_frac_valid_pixels
271
271
for win in windows }
272
-
272
+
273
273
dfs = []
274
274
for chrom , df in ins_table .groupby ('chrom' ):
275
275
df = df .reset_index (drop = True )
276
276
for win in windows :
277
277
mask = (df [n_valid_pixels_key .format (WINDOW = win )].values >= min_valid_pixels [win ])
278
-
278
+
279
279
if min_dist_bad_bin :
280
280
mask &= (df .dist_bad_bin .values >= min_dist_bad_bin )
281
-
281
+
282
282
ins_track = df [log2_ins_key .format (WINDOW = win )].values [mask ]
283
283
poss , proms = peaks .find_peak_prominence (- ins_track )
284
284
ins_prom_track = np .zeros_like (ins_track ) * np .nan
285
285
ins_prom_track [poss ] = proms
286
-
286
+
287
287
if win is not None :
288
- bs_key = 'boundary_strength_{win}' .format (win = win )
288
+ bs_key = 'boundary_strength_{win}' .format (win = win )
289
289
else :
290
290
bs_key = 'boundary_strength'
291
-
291
+
292
292
df [bs_key ] = np .nan
293
293
df .loc [mask , bs_key ] = ins_prom_track
294
-
294
+
295
295
dfs .append (df )
296
296
return pd .concat (dfs )
297
297
0 commit comments