From 907a35f7e798c747b352cb962d475705e3c45d72 Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 3 May 2024 15:16:35 +0200 Subject: [PATCH 1/4] Actually use chromsizes from the header --- pairtools/lib/scaling.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pairtools/lib/scaling.py b/pairtools/lib/scaling.py index 7e89b468..ffb85d5a 100644 --- a/pairtools/lib/scaling.py +++ b/pairtools/lib/scaling.py @@ -388,7 +388,9 @@ def compute_scaling( pairs_df = pairs elif isinstance(pairs, str) or hasattr(pairs, "buffer") or hasattr(pairs, "peek"): - pairs_df, _, _ = pairsio.read_pairs(pairs, nproc=nproc_in, chunksize=chunksize) + pairs_df, _, chromsizes = pairsio.read_pairs( + pairs, nproc=nproc_in, chunksize=chunksize + ) else: raise ValueError( "pairs must be either a path to a pairs file or a pd.DataFrame" From 9c5c77ee14da42dc5e76c875006e9b31786dc7d8 Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 3 May 2024 15:16:42 +0200 Subject: [PATCH 2/4] black --- pairtools/lib/scaling.py | 38 ++--- pairtools/lib/stats.py | 289 +++++++++++++++++++++++---------------- 2 files changed, 194 insertions(+), 133 deletions(-) diff --git a/pairtools/lib/scaling.py b/pairtools/lib/scaling.py index ffb85d5a..0f8ce18a 100644 --- a/pairtools/lib/scaling.py +++ b/pairtools/lib/scaling.py @@ -134,7 +134,11 @@ def make_empty_cross_region_table( def bins_pairs_by_distance( - pairs_df, dist_bins, regions=None, chromsizes=None, ignore_trans=False, + pairs_df, + dist_bins, + regions=None, + chromsizes=None, + ignore_trans=False, keep_unassigned=False, ): @@ -187,7 +191,6 @@ def bins_pairs_by_distance( pairs_df.chrom2.values, pairs_df.pos2.values, regions ).T - pairs_reduced_df = pd.DataFrame( { "chrom1": pairs_df.chrom1.values, @@ -207,10 +210,11 @@ def bins_pairs_by_distance( ) if not keep_unassigned: - pairs_reduced_df = (pairs_reduced_df - .query('(start1 >= 0) and (start2 >= 0)') + pairs_reduced_df = ( + pairs_reduced_df.query("(start1 >= 0) and (start2 >= 0)") # do not test for end1 and end2, as they can be -1 if regions and not specified - .reset_index(drop=True)) + .reset_index(drop=True) + ) pairs_reduced_df["min_dist"] = np.where( pairs_reduced_df["dist_bin_idx"] > 0, @@ -219,7 +223,7 @@ def bins_pairs_by_distance( ) pairs_reduced_df["max_dist"] = np.where( - pairs_reduced_df["dist_bin_idx"] < len(dist_bins)-1, + pairs_reduced_df["dist_bin_idx"] < len(dist_bins) - 1, dist_bins[pairs_reduced_df["dist_bin_idx"]], np.iinfo(np.int64).max, ) @@ -348,7 +352,7 @@ def compute_scaling( Parameters ---------- pairs : pd.DataFrame or str or file-like object - A table with pairs of genomic coordinates representing contacts. + A table with pairs of genomic coordinates representing contacts. It can be a pandas DataFrame, a path to a pairs file, or a file-like object. regions : bioframe viewframe or None, optional Genomic regions of interest. It can be anything that can serve as input to bioframe.from_any, @@ -379,9 +383,9 @@ def compute_scaling( """ dist_bins = geomspace( - dist_range[0], + dist_range[0], dist_range[1], - int(np.round(np.log10(dist_range[1]/dist_range[0])*n_dist_bins_decade)) + int(np.round(np.log10(dist_range[1] / dist_range[0]) * n_dist_bins_decade)), ) if isinstance(pairs, pd.DataFrame): @@ -406,7 +410,7 @@ def compute_scaling( regions=regions, chromsizes=chromsizes, ignore_trans=ignore_trans, - keep_unassigned=keep_unassigned + keep_unassigned=keep_unassigned, ) sc = sc_chunk if sc is None else sc.add(sc_chunk, fill_value=0) @@ -417,7 +421,6 @@ def compute_scaling( else trans_counts.add(trans_counts_chunk, fill_value=0) ) - # if not (isinstance(regions, pd.DataFrame) and # (set(regions.columns) == set(['chrom', 'start','end']))): # raise ValueError('regions must be provided as a dict or chrom-indexed Series of chromsizes or as a bedframe.') @@ -429,10 +432,9 @@ def compute_scaling( if not ignore_trans: trans_counts.reset_index(inplace=True) - trans_counts["n_bp2"] = ( - (trans_counts["end1"] - trans_counts["start1"]) * ( + trans_counts["n_bp2"] = (trans_counts["end1"] - trans_counts["start1"]) * ( trans_counts["end2"] - trans_counts["start2"] - )) + ) return sc, trans_counts @@ -452,12 +454,12 @@ def norm_scaling_factor(bins, cfreqs, norm_window): """ lo, hi = np.searchsorted(bins, norm_window) - return cfreqs[lo:hi+1].mean() + return cfreqs[lo : hi + 1].mean() def norm_scaling(bins, cfreqs, norm_window, log_input=False): """ - Normalize a contact-frequency-vs-distance curve, by setting the average contact frequency + Normalize a contact-frequency-vs-distance curve, by setting the average contact frequency in a given window to 1.0. Args: @@ -469,14 +471,14 @@ def norm_scaling(bins, cfreqs, norm_window, log_input=False): Returns: float or array-like: The normalized contact frequencies. """ - + norm = norm_scaling_factor(bins, cfreqs, norm_window) if log_input: return cfreqs - norm else: return cfreqs / norm - + def unity_norm_scaling(bins, cfreqs, norm_range=(1e4, 1e9)): bin_lens = np.diff(bins) bin_mids = np.sqrt(bins[1:] * bins[:-1]) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index e008314b..973eae06 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -18,10 +18,10 @@ def parse_number(s): elif s.replace(".", "", 1).isdigit(): return float(s) else: - return s + return s -def flat_dict_to_nested(input_dict, sep='/'): +def flat_dict_to_nested(input_dict, sep="/"): output_dict = {} for key, value in input_dict.items(): @@ -30,8 +30,10 @@ def flat_dict_to_nested(input_dict, sep='/'): elif type(key) == str: key_parts = key.split(sep) else: - raise ValueError(f"Key type can be either str or tuple. Found key {key} of type {type(key)}.") - + raise ValueError( + f"Key type can be either str or tuple. Found key {key} of type {type(key)}." + ) + current_dict = output_dict for key_part in key_parts[:-1]: current_dict = current_dict.setdefault(key_part, {}) @@ -40,7 +42,7 @@ def flat_dict_to_nested(input_dict, sep='/'): return output_dict -def nested_dict_to_flat(d, tuple_keys=False, sep='/'): +def nested_dict_to_flat(d, tuple_keys=False, sep="/"): """Flatten a nested dictionary to a flat dictionary. Parameters @@ -56,28 +58,31 @@ def nested_dict_to_flat(d, tuple_keys=False, sep='/'): dict A flat dictionary. """ - + if tuple_keys: - join_keys = lambda k1,k2: (k1,) + k2 + join_keys = lambda k1, k2: (k1,) + k2 else: - join_keys = lambda k1,k2: (k1+sep+k2) if k2 else k1 + join_keys = lambda k1, k2: (k1 + sep + k2) if k2 else k1 out = {} for k1, v1 in d.items(): if isinstance(v1, dict): - out.update({ - join_keys(k1,k2): v2 - for k2, v2 in nested_dict_to_flat(v1, tuple_keys, sep).items() - }) + out.update( + { + join_keys(k1, k2): v2 + for k2, v2 in nested_dict_to_flat(v1, tuple_keys, sep).items() + } + ) else: if tuple_keys: out[(k1,)] = v1 else: out[k1] = v1 - + return out + def is_nested_dict(d): """Check if a dictionary is nested. @@ -100,6 +105,7 @@ def is_nested_dict(d): return False + def is_tuple_keyed_dict(d): """Check if a dictionary is tuple-keyed. @@ -116,7 +122,7 @@ def is_tuple_keyed_dict(d): if not isinstance(d, dict): return False - for k,v in d.items(): + for k, v in d.items(): if not isinstance(k, tuple): return False if isinstance(v, dict): @@ -124,6 +130,7 @@ def is_tuple_keyed_dict(d): return True + def is_str_keyed_dict(d): """Check if a dictionary is string-keyed. @@ -140,7 +147,7 @@ def is_str_keyed_dict(d): if not isinstance(d, dict): return False - for k,v in d.keys(): + for k, v in d.keys(): if not isinstance(k, str): return False if isinstance(v, dict): @@ -149,7 +156,7 @@ def is_str_keyed_dict(d): return True -def swap_levels_nested_dict(nested_dict, level1, level2, sep='/'): +def swap_levels_nested_dict(nested_dict, level1, level2, sep="/"): """Swap the order of two levels in a nested dictionary. Parameters @@ -171,23 +178,26 @@ def swap_levels_nested_dict(nested_dict, level1, level2, sep='/'): for k1, v1 in nested_dict.items(): k1_list = list(k1) k1_list[level1], k1_list[level2] = k1_list[level2], k1_list[level1] - out[tuple(k1_list)] = v1 + out[tuple(k1_list)] = v1 return out - + elif is_nested_dict(nested_dict): out = nested_dict_to_flat(nested_dict, tuple_keys=True) out = swap_levels_nested_dict(out, level1, level2) out = flat_dict_to_nested(out) return out - + elif is_str_keyed_dict(nested_dict): out = nested_dict_to_flat(nested_dict, sep=sep) out = swap_levels_nested_dict(out, level1, level2) - out = {sep.join(k):v for k,v in out.items()} + out = {sep.join(k): v for k, v in out.items()} return out - + else: - raise ValueError("Input dictionary must be either nested, string-keyed or tuple-keyed") + raise ValueError( + "Input dictionary must be either nested, string-keyed or tuple-keyed" + ) + class PairCounter(Mapping): """ @@ -205,7 +215,7 @@ class PairCounter(Mapping): DIST_FREQ_REL_DIFF_THRESHOLD = 0.05 N_DIST_BINS_DECADE_DEFAULT = 8 MIN_LOG10_DIST_DEFAULT = 0 - MAX_LOG10_DIST_DEFAULT = 9 + MAX_LOG10_DIST_DEFAULT = 9 def __init__( self, @@ -234,7 +244,7 @@ def __init__( # genomic distance bining for the ++/--/-+/+- distribution log10_dist_bin_step = 1.0 / n_dist_bins_decade self._dist_bins = np.unique( - np.r_[ + np.r_[ 0, np.round( 10 @@ -261,7 +271,7 @@ def __init__( self._stat[key]["cis"] = 0 self._stat[key]["trans"] = 0 self._stat[key]["pair_types"] = {} - + # to be removed: # self._stat[key]["dedup"] = {} @@ -307,7 +317,6 @@ def __init__( ) self._summaries_calculated = False - def __getitem__(self, key, filter="no_filter"): if isinstance(key, str): # let's strip any unintentional '/' @@ -378,40 +387,45 @@ def __getitem__(self, key, filter="no_filter"): else: raise ValueError("{} is not a valid key".format(k)) - def __iter__(self): return iter(self._stat) - def __len__(self): return len(self._stat) - def find_dist_freq_convergence_distance(self, rel_threshold): - """Finds the largest distance at which the frequency of pairs of reads - with different strands deviates from their average by the specified + """Finds the largest distance at which the frequency of pairs of reads + with different strands deviates from their average by the specified relative threshold.""" - + out = {} all_strands = ["++", "--", "-+", "+-"] for filter in self.filters: out[filter] = {} - + dist_freqs_by_strands = { - strands: np.array(list(self._stat[filter]["dist_freq"][strands].values())) - for strands in all_strands} - + strands: np.array( + list(self._stat[filter]["dist_freq"][strands].values()) + ) + for strands in all_strands + } + # Calculate the average frequency of pairs with different strands - avg_freq_all_strands = np.mean(np.vstack(list(dist_freqs_by_strands.values())), axis=0) + avg_freq_all_strands = np.mean( + np.vstack(list(dist_freqs_by_strands.values())), axis=0 + ) # Calculate the largest distance at which the frequency of pairs of at least one strand combination deviates from the average by the given threshold - rel_deviations = {strands: np.nan_to_num( - np.abs(dist_freqs_by_strands[strands] - avg_freq_all_strands) - / avg_freq_all_strands) - for strands in all_strands} - - idx_maxs = {strand:0 for strand in all_strands} + rel_deviations = { + strands: np.nan_to_num( + np.abs(dist_freqs_by_strands[strands] - avg_freq_all_strands) + / avg_freq_all_strands + ) + for strands in all_strands + } + + idx_maxs = {strand: 0 for strand in all_strands} for strands in all_strands: bin_exceeds = rel_deviations[strands] > rel_threshold if np.any(bin_exceeds): @@ -419,70 +433,83 @@ def find_dist_freq_convergence_distance(self, rel_threshold): # Find the largest distance and the strand combination where frequency of pairs deviates from the average by the given threshold: convergence_bin_idx = 0 - convergence_strands = '??' - convergence_dist = '0' + convergence_strands = "??" + convergence_dist = "0" for strands in all_strands: - if (idx_maxs[strands] > convergence_bin_idx): + if idx_maxs[strands] > convergence_bin_idx: convergence_bin_idx = idx_maxs[strands] convergence_strands = strands if idx_maxs[strands] < len(self._dist_bins): - convergence_dist = self._dist_bins[convergence_bin_idx+1] + convergence_dist = self._dist_bins[convergence_bin_idx + 1] else: convergence_dist = np.iinfo(np.int64) - - + out[filter]["convergence_dist"] = convergence_dist out[filter]["strands_w_max_convergence_dist"] = convergence_strands - out[filter]['convergence_rel_diff_threshold'] = rel_threshold + out[filter]["convergence_rel_diff_threshold"] = rel_threshold - out[filter]['n_cis_pairs_below_convergence_dist'] = { - strands:dist_freqs_by_strands[strands][:convergence_bin_idx+1].sum() for strands in all_strands + out[filter]["n_cis_pairs_below_convergence_dist"] = { + strands: dist_freqs_by_strands[strands][: convergence_bin_idx + 1].sum() + for strands in all_strands for strands in all_strands } - out[filter]['n_cis_pairs_below_convergence_dist_all_strands'] = sum( - list(out[filter]['n_cis_pairs_below_convergence_dist'].values())) + out[filter]["n_cis_pairs_below_convergence_dist_all_strands"] = sum( + list(out[filter]["n_cis_pairs_below_convergence_dist"].values()) + ) n_cis_pairs_above_convergence_dist = { - strands:dist_freqs_by_strands[strands][convergence_bin_idx+1:].sum() for strands in all_strands + strands: dist_freqs_by_strands[strands][convergence_bin_idx + 1 :].sum() + for strands in all_strands for strands in all_strands } - - out[filter]['n_cis_pairs_above_convergence_dist_all_strands'] = sum( - list(n_cis_pairs_above_convergence_dist.values())) + + out[filter]["n_cis_pairs_above_convergence_dist_all_strands"] = sum( + list(n_cis_pairs_above_convergence_dist.values()) + ) norms = dict( - cis=self._stat[filter]['cis'], - total_mapped=self._stat[filter]['total_mapped'] + cis=self._stat[filter]["cis"], + total_mapped=self._stat[filter]["total_mapped"], ) - if 'total_nodups' in self._stat[filter]: - norms['total_nodups'] = self._stat[filter]['total_nodups'] + if "total_nodups" in self._stat[filter]: + norms["total_nodups"] = self._stat[filter]["total_nodups"] for key, norm_factor in norms.items(): - out[filter][f'frac_{key}_in_cis_below_convergence_dist'] = { - strands: n_cis_pairs / norm_factor - for strands, n_cis_pairs in out[filter]['n_cis_pairs_below_convergence_dist'].items() + out[filter][f"frac_{key}_in_cis_below_convergence_dist"] = { + strands: n_cis_pairs / norm_factor + for strands, n_cis_pairs in out[filter][ + "n_cis_pairs_below_convergence_dist" + ].items() } - out[filter][f'frac_{key}_in_cis_below_convergence_dist_all_strands'] = sum( - list(out[filter][f'frac_{key}_in_cis_below_convergence_dist'].values())) + out[filter][f"frac_{key}_in_cis_below_convergence_dist_all_strands"] = ( + sum( + list( + out[filter][ + f"frac_{key}_in_cis_below_convergence_dist" + ].values() + ) + ) + ) - out[filter][f'frac_{key}_in_cis_above_convergence_dist_all_strands'] = ( - sum(list(n_cis_pairs_above_convergence_dist.values())) / norm_factor ) + out[filter][f"frac_{key}_in_cis_above_convergence_dist_all_strands"] = ( + sum(list(n_cis_pairs_above_convergence_dist.values())) / norm_factor + ) return out - def calculate_summaries(self): """calculate summary statistics (fraction of cis pairs at different cutoffs, complexity estimate) based on accumulated counts. Results are saved into self._stat["filter_name"]['summary"] """ convergence_stats = self.find_dist_freq_convergence_distance( - self.DIST_FREQ_REL_DIFF_THRESHOLD) + self.DIST_FREQ_REL_DIFF_THRESHOLD + ) for filter_name in self.filters.keys(): for cis_count in ( @@ -495,23 +522,33 @@ def calculate_summaries(self): "cis_40kb+", ): self._stat[filter_name]["summary"][f"frac_{cis_count}"] = ( - (self._stat[filter_name][cis_count] / self._stat[filter_name]["total_nodups"]) + ( + self._stat[filter_name][cis_count] + / self._stat[filter_name]["total_nodups"] + ) if self._stat[filter_name]["total_nodups"] > 0 else 0 ) - self._stat[filter_name]["summary"]["dist_freq_convergence"] = convergence_stats[filter_name] + self._stat[filter_name]["summary"]["dist_freq_convergence"] = ( + convergence_stats[filter_name] + ) self._stat[filter_name]["summary"]["frac_dups"] = ( - (self._stat[filter_name]["total_dups"] / self._stat[filter_name]["total_mapped"]) + ( + self._stat[filter_name]["total_dups"] + / self._stat[filter_name]["total_mapped"] + ) if self._stat[filter_name]["total_mapped"] > 0 else 0 ) - self._stat[filter_name]["summary"][ - "complexity_naive" - ] = estimate_library_complexity( - self._stat[filter_name]["total_mapped"], self._stat[filter_name]["total_dups"], 0 + self._stat[filter_name]["summary"]["complexity_naive"] = ( + estimate_library_complexity( + self._stat[filter_name]["total_mapped"], + self._stat[filter_name]["total_dups"], + 0, + ) ) if filter_name == "no_filter" and self._save_bytile_dups: @@ -535,7 +572,6 @@ def calculate_summaries(self): self._summaries_calculated = True - @classmethod def from_file(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT): """create instance of PairCounter from file @@ -562,28 +598,31 @@ def from_file(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT): "{} is not a valid stats file".format(file_handle.name) ) raw_stat[key_val_pair[0]] = parse_number(key_val_pair[1]) - - ## TODO: check if raw_stat does not contain any unknown keys + ## TODO: check if raw_stat does not contain any unknown keys # Convert flat dict to nested dict - stat_from_file._stat[default_filter].update(flat_dict_to_nested(raw_stat, sep=cls._KEY_SEP)) + stat_from_file._stat[default_filter].update( + flat_dict_to_nested(raw_stat, sep=cls._KEY_SEP) + ) - stat_from_file._stat[default_filter]['chrom_freq'] = nested_dict_to_flat( - stat_from_file._stat[default_filter]['chrom_freq'], tuple_keys=True) + stat_from_file._stat[default_filter]["chrom_freq"] = nested_dict_to_flat( + stat_from_file._stat[default_filter]["chrom_freq"], tuple_keys=True + ) - bin_to_left_val = lambda bin: int(bin.rstrip('+') if ('+' in bin) else bin.split('-')[0]) + bin_to_left_val = lambda bin: int( + bin.rstrip("+") if ("+" in bin) else bin.split("-")[0] + ) - stat_from_file._stat[default_filter]['dist_freq'] = { + stat_from_file._stat[default_filter]["dist_freq"] = { bin_to_left_val(k): v - for k,v in stat_from_file._stat[default_filter]['dist_freq'].items() - } + for k, v in stat_from_file._stat[default_filter]["dist_freq"].items() + } - stat_from_file._stat[default_filter]['dist_freq'] = swap_levels_nested_dict( - stat_from_file._stat[default_filter]['dist_freq'], 0, 1 + stat_from_file._stat[default_filter]["dist_freq"] = swap_levels_nested_dict( + stat_from_file._stat[default_filter]["dist_freq"], 0, 1 ) - return stat_from_file @classmethod @@ -601,7 +640,9 @@ def from_yaml(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT): stat = yaml.safe_load(file_handle) stat_from_file = cls( n_dist_bins_decade=n_dist_bins_decade, - filters={key: val.get("filter_expression", "") for key, val in stat.items()} + filters={ + key: val.get("filter_expression", "") for key, val in stat.items() + }, ) for key, filter in stat.items(): @@ -614,7 +655,6 @@ def from_yaml(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT): stat_from_file._stat = stat return stat_from_file - def add_pair( self, chrom1, @@ -679,14 +719,12 @@ def add_pair( for dist_kb in [1, 2, 4, 10, 20, 40]: if dist >= dist_kb * 1000: self._stat[filter][f"cis_{dist_kb}kb+"] += 1 - else: self._stat[filter]["trans"] += 1 else: self._stat[filter]["total_single_sided_mapped"] += 1 - def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. @@ -834,7 +872,7 @@ def __add__(self, other, filter="no_filter"): sum_stat._stat[filter][k] = ( self._stat[filter][k] + other._stat[filter][k] ) - + # sum nested dicts/arrays in a context dependet manner: else: if k in ["pair_types", "dedup"]: @@ -848,7 +886,7 @@ def __add__(self, other, filter="no_filter"): sum_stat._stat[filter][k] = sum_dicts( self._stat[filter][k], other._stat[filter][k] ) - + elif k == "chrom_freq": # union list of keys (chr1,chr2) with potential duplicates: union_keys_with_dups = list(self._stat[filter][k].keys()) + list( @@ -863,7 +901,7 @@ def __add__(self, other, filter="no_filter"): sum_stat._stat[filter][k][union_key] = self._stat[filter][ k ].get(union_key, 0) + other._stat[filter][k].get(union_key, 0) - + elif k == "dist_freq": for dirs in sum_stat[k]: from functools import reduce @@ -879,7 +917,7 @@ def reducer(accumulator, element): {}, ) # sum_stat[k][dirs] = self._stat[filter][k][dirs] + other._stat[filter][k][dirs] - + elif k == "chromsizes": if k in self._stat[filter] and k in other._stat[filter]: if self._stat[filter][k] == other._stat[filter][k]: @@ -933,7 +971,7 @@ def flatten(self, filter="no_filter"): for i in range(len(self._dist_bins)): for dirs, freqs in v.items(): dist = self._dist_bins[i] - + # last bin is treated differently: "100000+" vs "1200-3000": if i < len(self._dist_bins) - 1: dist_next = self._dist_bins[i + 1] @@ -945,24 +983,36 @@ def flatten(self, filter="no_filter"): ["{}", "{}+", "{}"] ).format(k, dist, dirs) else: - raise ValueError("There is a mismatch between dist_freq bins in the instance") - + raise ValueError( + "There is a mismatch between dist_freq bins in the instance" + ) + # store key,value pair: - try: + try: flat_stat[formatted_key] = freqs[dist] except: # in some previous versions of stats, last bin was not reported, so we need to skip it now: - if (dist not in freqs) and (i == len(self._dist_bins) - 1): - flat_stat[formatted_key] = 0 + if (dist not in freqs) and ( + i == len(self._dist_bins) - 1 + ): + flat_stat[formatted_key] = 0 else: - raise ValueError(f"Error in {k} {dirs} {dist} {dist_next} {freqs}: source and destination bins do not match") - - elif (k in ["pair_types", "dedup", "chromsizes", 'summary']) and v: + raise ValueError( + f"Error in {k} {dirs} {dist} {dist_next} {freqs}: source and destination bins do not match" + ) + + elif (k in ["pair_types", "dedup", "chromsizes", "summary"]) and v: # 'pair_types' and 'dedup' are simple dicts inside, # treat them the exact same way: flat_stat.update( - {k+self._KEY_SEP+k2 : v2 for k2,v2 in nested_dict_to_flat(v, sep=self._KEY_SEP).items()}) - + { + k + self._KEY_SEP + k2: v2 + for k2, v2 in nested_dict_to_flat( + v, sep=self._KEY_SEP + ).items() + } + ) + elif (k == "chrom_freq") and v: for (chrom1, chrom2), freq in v.items(): formatted_key = self._KEY_SEP.join(["{}", "{}", "{}"]).format( @@ -985,8 +1035,8 @@ def format_yaml(self, filter="no_filter"): # Storing statistics for each filter for filter_name in self.filters.keys(): for k, v in self._stat[filter_name].items(): - if (k == "chrom_freq"): - v = {self._KEY_SEP.join(k2):v2 for k2, v2 in v.items()} + if k == "chrom_freq": + v = {self._KEY_SEP.join(k2): v2 for k2, v2 in v.items()} if v: formatted_stat[filter_name][k] = deepcopy(v) # return formatted dict @@ -1031,7 +1081,6 @@ def save(self, outstream, yaml=False, filter="no_filter"): for k, v in data.items(): outstream.write("{}{}{}\n".format(k, self._SEP, v)) - def save_bytile_dups(self, outstream): """save bytile duplication counts to a tab-delimited text file. Parameters @@ -1063,9 +1112,19 @@ def do_merge(output, files_to_merge, **kwargs): ) # use a factory method to instanciate PairCounter if kwargs.get("yaml", False): - stat = PairCounter.from_yaml(f, n_dist_bins_decade=kwargs.get('n_dist_bins_decade', PairCounter.N_DIST_BINS_DECADE_DEFAULT)) + stat = PairCounter.from_yaml( + f, + n_dist_bins_decade=kwargs.get( + "n_dist_bins_decade", PairCounter.N_DIST_BINS_DECADE_DEFAULT + ), + ) else: - stat = PairCounter.from_file(f, n_dist_bins_decade=kwargs.get('n_dist_bins_decade', PairCounter.N_DIST_BINS_DECADE_DEFAULT)) + stat = PairCounter.from_file( + f, + n_dist_bins_decade=kwargs.get( + "n_dist_bins_decade", PairCounter.N_DIST_BINS_DECADE_DEFAULT + ), + ) stats.append(stat) f.close() From bc0d82de95f7b3e905725db87a663bfca6e1ab88 Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 3 May 2024 16:27:22 +0200 Subject: [PATCH 3/4] Correct test --- pairtools/cli/scaling.py | 12 +++++++++--- tests/test_scaling.py | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/pairtools/cli/scaling.py b/pairtools/cli/scaling.py index 0b8512a0..35118550 100644 --- a/pairtools/cli/scaling.py +++ b/pairtools/cli/scaling.py @@ -53,7 +53,9 @@ help="Number of bins to split the distance range in log10-space, specified per a factor of 10 difference.", ) @common_io_options -def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs): +def scaling( + input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs +): """Calculate pairs scalings. INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics. @@ -63,10 +65,14 @@ def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, Output is .tsv file with scaling stats (both cis scalings and trans levels). """ - scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs) + scaling_py( + input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs + ) -def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs): +def scaling_py( + input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs +): if len(input_path) == 0: raise ValueError(f"No input paths: {input_path}") diff --git a/tests/test_scaling.py b/tests/test_scaling.py index 21046004..f9653056 100644 --- a/tests/test_scaling.py +++ b/tests/test_scaling.py @@ -23,5 +23,5 @@ def test_scaling(): output = pd.read_csv(io.StringIO(result), sep="\t", header=0) assert ( - output["n_pairs"].sum() == 9 + output["n_pairs"].sum() == 7 ) # double unmapped pairs are currently ignored by lib.scaling From 854b1c622820fed01f5f76b1b649b687e013aa48 Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 3 May 2024 17:12:49 +0200 Subject: [PATCH 4/4] better comment --- tests/test_scaling.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_scaling.py b/tests/test_scaling.py index f9653056..5bd41614 100644 --- a/tests/test_scaling.py +++ b/tests/test_scaling.py @@ -22,6 +22,4 @@ def test_scaling(): output = pd.read_csv(io.StringIO(result), sep="\t", header=0) - assert ( - output["n_pairs"].sum() == 7 - ) # double unmapped pairs are currently ignored by lib.scaling + assert output["n_pairs"].sum() == 7 # unmapped pairs are ignored by lib.scaling