|
2 | 2 | #
|
3 | 3 | # SPDX-License-Identifier: MIT
|
4 | 4 |
|
| 5 | +import itertools |
| 6 | +import time |
5 | 7 | from functools import partial
|
6 | 8 | from typing import List, Optional, Tuple
|
7 | 9 |
|
8 | 10 | import numpy as np
|
| 11 | +import numpy.typing as npt |
| 12 | +import pandas as pd |
| 13 | +import scipy.sparse as ss |
| 14 | +import structlog |
9 | 15 |
|
10 | 16 | from . import TDA
|
| 17 | +from .common import pretty_format_elapsed_time |
11 | 18 | from .regressions import _compute_wQISA_predictions
|
12 | 19 |
|
13 | 20 |
|
14 |
def _first_rise_offset(interval, sentinel):
    """
    Return 1 + the index of the first sample that is strictly smaller than its successor.

    ``sentinel`` acts as the successor of the last sample. It must be strictly larger than
    every value in ``interval`` so that a match always exists for a non-empty interval.
    """
    successors = np.append(interval[1:], [sentinel], axis=0)
    return np.flatnonzero(interval < successors)[0] + 1


def find_horizontal_domain(
    profile: npt.NDArray[float],
    coarse_h_domain: Tuple[int, int, int],
    max_width: int = 1_000_000_000,
) -> Tuple[int, int]:
    """
    Refine a coarse horizontal domain around a seed position of a 1D profile.

    Starting from the seed position ``MP``, the profile is scanned leftwards down to ``L_mP``
    and rightwards up to ``R_mP``. In each direction the scan stops at the first sample that is
    strictly smaller than the next sample further away from ``MP`` (or at the end of the scanned
    interval when no such sample exists). The resulting extent is capped at ``max_width`` on each
    side and clamped to ``[0, len(profile)]``.

    Parameters
    ----------
    profile
        1D array of values.
    coarse_h_domain
        triplet ``(MP, L_mP, R_mP)``: seed position, leftmost and rightmost positions to scan.
    max_width
        maximum extent allowed on each side of ``MP``.

    Returns
    -------
    Tuple[int, int]
        the left and right coordinates of the horizontal domain

    Raises
    ------
    IndexError
        if one of the scanned intervals is empty (e.g. ``L_mP > MP``).
    """

    MP, L_mP, R_mP = coarse_h_domain

    # Strictly larger than every sample, so each scan is guaranteed to terminate.
    # Computed once with np.max instead of calling the builtin max() on the array for each side.
    sentinel = np.max(profile) + 1

    # The left interval is flipped so that both scans walk away from MP.
    L_bound = np.minimum(_first_rise_offset(np.flip(profile[L_mP : MP + 1]), sentinel), max_width)
    R_bound = np.minimum(_first_rise_offset(profile[MP : R_mP + 1], sentinel), max_width)

    # Cast to plain ints: a float max_width (e.g. 1e9) would otherwise turn the bounds into floats.
    return int(max(MP - L_bound, 0)), int(min(MP + R_bound, len(profile)))
40 | 57 |
|
41 | 58 |
|
42 | 59 | def find_lower_v_domain(I, threshold_cut, max_height, min_persistence, it) -> Tuple[List, Optional[List]]:
|
@@ -114,38 +131,51 @@ def find_upper_v_domain(I, threshold_cut, max_height, min_persistence, it) -> Tu
|
114 | 131 | return [seed_site - candida_bound[0], seed_site], list(seed_site - np.array(loc_Maxima[:-1]))
|
115 | 132 |
|
116 | 133 |
|
117 |
def find_HIoIs(
    pseudodistribution: npt.NDArray[float],
    seed_sites: npt.NDArray[int],
    seed_site_bounds: npt.NDArray[int],
    max_width: int,
    map_=map,
    logger=None,
) -> pd.DataFrame:
    """
    Compute one horizontal domain per seed site and make consecutive domains non-overlapping.

    :param pseudodistribution: 1D array representing a uniformly-sampled scalar function
    :param seed_sites: maximum values in the pseudo-distribution (i.e., genomic coordinates hosting linear
                       patterns)
    :param seed_site_bounds: for the i-th entry of seed_sites:
                             (*) seed_site_bounds[i] is the left boundary
                             (*) seed_site_bounds[i+1] is the right boundary
    :param max_width: maximum width allowed
    :param map_: alternative implementation of the built-in map function. Can be used to e.g. run this step
                 in parallel by passing multiprocessing.Pool().map.
    :param logger: logger used to report the elapsed time at debug level. When None, the logger returned by
                   structlog.get_logger() is used.
    :return:
    a pd.DataFrame with one row per seed site and two columns, "left_bound" and "right_bound"
    """
    assert len(seed_site_bounds) == len(seed_sites) + 1

    t0 = time.time()
    if logger is None:
        logger = structlog.get_logger()

    num_sites = len(seed_sites)

    # One (seed site, left boundary, right boundary) triplet per seed site
    triplets = [(site, seed_site_bounds[idx], seed_site_bounds[idx + 1]) for idx, site in enumerate(seed_sites)]

    worker = partial(find_horizontal_domain, pseudodistribution, max_width=max_width)

    # Flatten the stream of (left, right) pairs and load it into a (num_sites, 2) integer array:
    # column 0 holds the left bounds, column 1 the right bounds.
    flat_bounds = itertools.chain.from_iterable(map_(worker, triplets))
    intervals = np.fromiter(flat_bounds, count=2 * num_sites, dtype=int).reshape(-1, 2)

    # Resolve overlaps: the left bound of each interval is raised to the right bound
    # of the preceding interval whenever it falls short of it.
    previous_rights = intervals[:-1, 1]
    following_lefts = intervals[1:, 0]
    intervals[1:, 0] = np.maximum(following_lefts, previous_rights)

    result = pd.DataFrame(data=intervals, columns=["left_bound", "right_bound"])

    logger.debug("find_HIoIs took %s", pretty_format_elapsed_time(t0))

    return result
149 | 179 |
|
150 | 180 |
|
151 | 181 | def find_VIoIs(
|
|
0 commit comments