|
| 1 | +import bioframe as bf |
| 2 | +import cooler |
| 3 | +import numpy as np |
| 4 | +import pandas as pd |
| 5 | + |
| 6 | +from ..lib.checks import is_compatible_viewframe |
| 7 | +import logging |
| 8 | + |
| 9 | +logging.basicConfig(level=logging.INFO) |
| 10 | + |
| 11 | + |
| 12 | +def _generate_adjusted_chunks( |
| 13 | + clr, binmapping, chunksize=1_000_000, orientation_col="strand", nproc=1 |
| 14 | +): |
| 15 | + """Generates chunks of pixels from the cooler and adjusts their bin IDs to follow the view.""" |
| 16 | + npixels = clr.pixels().shape[0] |
| 17 | + chunks = np.append(np.arange(0, npixels, chunksize), npixels) |
| 18 | + chunks = list(zip(chunks[:-1], chunks[1:])) |
| 19 | + |
| 20 | + # Running this loop in parallel slows the function down |
| 21 | + for i0, i1 in chunks: |
| 22 | + chunk = clr.pixels()[i0:i1] |
| 23 | + chunk["bin1_id"] = chunk["bin1_id"].map(binmapping) |
| 24 | + chunk["bin2_id"] = chunk["bin2_id"].map(binmapping) |
| 25 | + chunk = chunk[(chunk["bin1_id"] != -1) & (chunk["bin2_id"] != -1)] |
| 26 | + if chunk.shape[0] > 0: |
| 27 | + chunk[["bin1_id", "bin2_id"]] = np.sort( |
| 28 | + chunk[["bin1_id", "bin2_id"]].astype(int) |
| 29 | + ) |
| 30 | + yield chunk.reset_index(drop=True) |
| 31 | + logging.info(f"Processed {i1/npixels*100:.2f}% pixels") |
| 32 | + |
| 33 | + |
| 34 | +def _adjust_start_end(chromdf): |
| 35 | + chromdf["end"] = chromdf["length"].cumsum() |
| 36 | + chromdf["start"] = chromdf["end"] - chromdf["length"] |
| 37 | + return chromdf |
| 38 | + |
| 39 | + |
| 40 | +def _flip_bins(regdf): |
| 41 | + regdf = regdf.iloc[::-1].reset_index(drop=True) |
| 42 | + l = regdf["end"] - regdf["start"] |
| 43 | + regdf["start"] = regdf["end"].max() - regdf["end"] |
| 44 | + regdf["end"] = regdf["start"] + l |
| 45 | + return regdf |
| 46 | + |
| 47 | + |
| 48 | +def rearrange_bins( |
| 49 | + bins_old, view_df, new_chrom_col="new_chrom", orientation_col="strand" |
| 50 | +): |
| 51 | + """ |
| 52 | + Rearranges the input `bins_old` based on the information in the `view_df` DataFrame. |
| 53 | +
|
| 54 | + Parameters |
| 55 | + ---------- |
| 56 | + bins_old : bintable |
| 57 | + The original bintable to rearrange. |
| 58 | + view_df : viewframe |
| 59 | + Viewframe with new order of genomic regions. Can have an additional column for |
| 60 | + the new chromosome name (`new_chrom_col`), and an additional column for the |
| 61 | + strand orientation (`orientation_col`, '-' will invert the region). |
| 62 | + new_chrom_col : str, optional |
| 63 | + Column name in the view_df specifying new chromosome name for each region, |
| 64 | + by default 'new_chrom'. If None, then the default chrom column names will be used. |
| 65 | + orientation_col : str, optional |
| 66 | + Column name in the view_df specifying strand orientation of each region, |
| 67 | + by default 'strand'. The values in this column can be "+" or "-". |
| 68 | + If None, then all will be assumed "+". |
| 69 | +
|
| 70 | +
|
| 71 | + Returns |
| 72 | + ------- |
| 73 | + bins_new : bintable |
| 74 | + The rearranged bintagle with the new chromosome names and orientations. |
| 75 | + bin_mapping : dict |
| 76 | + Mapping of original bin IDs to new bin IDs |
| 77 | + """ |
| 78 | + chromdict = dict(zip(view_df["name"].to_numpy(), view_df[new_chrom_col].to_numpy())) |
| 79 | + flipdict = dict( |
| 80 | + zip(view_df["name"].to_numpy(), (view_df[orientation_col] == "-").to_numpy()) |
| 81 | + ) |
| 82 | + bins_old = bins_old.reset_index(names=["old_id"]) |
| 83 | + bins_subset = bf.assign_view(bins_old, view_df, drop_unassigned=False).dropna( |
| 84 | + subset=["view_region"] |
| 85 | + ) |
| 86 | + bins_inverted = ( |
| 87 | + bins_subset.groupby("view_region", group_keys=False) |
| 88 | + .apply(lambda x: _flip_bins(x) if flipdict[x.name] else x) |
| 89 | + .reset_index(drop=True) |
| 90 | + ) |
| 91 | + bins_new = bf.sort_bedframe( |
| 92 | + bins_inverted, |
| 93 | + view_df=view_df, |
| 94 | + df_view_col="view_region", |
| 95 | + ) |
| 96 | + bins_new["chrom"] = bins_new["view_region"].map(chromdict) |
| 97 | + bins_new["length"] = bins_new["end"] - bins_new["start"] |
| 98 | + bins_new = ( |
| 99 | + bins_new.groupby("chrom", group_keys=False) |
| 100 | + .apply(_adjust_start_end) |
| 101 | + .drop(columns=["length", "view_region"]) |
| 102 | + ) |
| 103 | + logging.info("Rearranged bins") |
| 104 | + bin_mapping = {old_id: -1 for old_id in bins_old["old_id"].astype(int)} |
| 105 | + bin_mapping.update( |
| 106 | + { |
| 107 | + old_id: new_id |
| 108 | + for old_id, new_id in zip(bins_new["old_id"].astype(int), bins_new.index) |
| 109 | + } |
| 110 | + ) |
| 111 | + logging.info("Created bin mapping") |
| 112 | + bins_new = bins_new.drop(columns=["old_id"]) |
| 113 | + return bins_new, bin_mapping |
| 114 | + |
| 115 | + |
| 116 | +def rearrange_cooler( |
| 117 | + clr, |
| 118 | + view_df, |
| 119 | + out_cooler, |
| 120 | + new_chrom_col="new_chrom", |
| 121 | + orientation_col="strand", |
| 122 | + assembly=None, |
| 123 | + chunksize=10_000_000, |
| 124 | + mode="w", |
| 125 | +): |
| 126 | + """Reorder cooler following a genomic view. |
| 127 | +
|
| 128 | + Parameters |
| 129 | + ---------- |
| 130 | + clr : cooler.Cooler |
| 131 | + Cooler object |
| 132 | + view_df : viewframe |
| 133 | + Viewframe with new order of genomic regions. Can have an additional column for |
| 134 | + the new chromosome name (`new_chrom_col`), and an additional column for the |
| 135 | + strand orientation (`orientation_col`, '-' will invert the region). |
| 136 | + out_cooler : str |
| 137 | + File path to save the reordered data |
| 138 | + new_chrom_col : str, optional |
| 139 | + Column name in the view_df specifying new chromosome name for each region, |
| 140 | + by default 'new_chrom'. If None, then the default chrom column names will be used. |
| 141 | + orientation_col : str, optional |
| 142 | + Column name in the view_df specifying strand orientation of each region, |
| 143 | + by default 'strand'. The values in this column can be "+" or "-". |
| 144 | + If None, then all will be assumed "+". |
| 145 | + assembly : str, optional |
| 146 | + The name of the assembly for the new cooler. If None, uses the same as in the |
| 147 | + original cooler. |
| 148 | + chunksize : int, optional |
| 149 | + The number of pixels loaded and processed per step of computation. |
| 150 | + mode : str, optional |
| 151 | + (w)rite or (a)ppend to the output cooler file. Default: w |
| 152 | + """ |
| 153 | + |
| 154 | + view_df = view_df.copy() |
| 155 | + try: |
| 156 | + _ = is_compatible_viewframe( |
| 157 | + view_df[["chrom", "start", "end", "name"]], |
| 158 | + clr, |
| 159 | + check_sorting=False, |
| 160 | + raise_errors=True, |
| 161 | + ) |
| 162 | + except Exception as e: |
| 163 | + raise ValueError("view_df is not a valid viewframe or incompatible") from e |
| 164 | + |
| 165 | + if assembly is None: |
| 166 | + assembly = clr.info["genome-assembly"] |
| 167 | + |
| 168 | + # Add repeated entries for new chromosome names if they were not requested/absent: |
| 169 | + if new_chrom_col is None: |
| 170 | + new_chrom_col = "new_chrom" |
| 171 | + if new_chrom_col in view_df.columns: |
| 172 | + logging.warn( |
| 173 | + "new_chrom_col is not provided, but new_chrom column exists." |
| 174 | + " Pre-existing new_chrom column will not be used." |
| 175 | + ) |
| 176 | + while new_chrom_col in view_df.columns: |
| 177 | + new_chrom_col = ( |
| 178 | + f"new_chrom_{np.random.randint(0, np.iinfo(np.int32).max)}" |
| 179 | + ) |
| 180 | + if new_chrom_col not in view_df.columns: |
| 181 | + view_df.loc[:, new_chrom_col] = view_df["chrom"] |
| 182 | + |
| 183 | + # Add repeated entries for strand orientation of chromosomes if they were not requested/absent: |
| 184 | + if orientation_col is None: |
| 185 | + orientation_col = "strand" |
| 186 | + if orientation_col in view_df.columns: |
| 187 | + logging.warn( |
| 188 | + "orientation_col is not provided, but strand column exists." |
| 189 | + " Pre-existing strand column will not be used." |
| 190 | + ) |
| 191 | + while orientation_col in view_df.columns: |
| 192 | + orientation_col = f"strand_{np.random.randint(0, np.iinfo(np.int32).max)}" |
| 193 | + if orientation_col not in view_df.columns: |
| 194 | + view_df.loc[:, orientation_col] = "+" |
| 195 | + |
| 196 | + if not np.all( |
| 197 | + view_df.groupby(new_chrom_col).apply(lambda x: np.all(np.diff(x.index) == 1)) |
| 198 | + ): |
| 199 | + raise ValueError("New chromosomes are not consecutive") |
| 200 | + bins_old = clr.bins()[:] |
| 201 | + # Creating new bin table |
| 202 | + bins_new, bin_mapping = rearrange_bins( |
| 203 | + bins_old, view_df, new_chrom_col=new_chrom_col, orientation_col=orientation_col |
| 204 | + ) |
| 205 | + logging.info("Creating a new cooler") |
| 206 | + cooler.create_cooler( |
| 207 | + out_cooler, |
| 208 | + bins_new, |
| 209 | + _generate_adjusted_chunks( |
| 210 | + clr, |
| 211 | + bin_mapping, |
| 212 | + chunksize=chunksize, |
| 213 | + ), |
| 214 | + assembly=assembly, |
| 215 | + mode=mode, |
| 216 | + mergebuf=int(1e9), |
| 217 | + ) |
| 218 | + logging.info(f"Created a new cooler at {out_cooler}") |
0 commit comments