From e827f2f3a26441c80f1a471760867d3e0a99f287 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:27:49 +0100 Subject: [PATCH] Optimize find_HIoIs() The optimization result in a 10% improvement in single-threaded performance --- src/stripepy/stripepy.py | 38 +++++++++++------- src/stripepy/utils/finders.py | 72 +++++++++++++++++++++++++---------- 2 files changed, 75 insertions(+), 35 deletions(-) diff --git a/src/stripepy/stripepy.py b/src/stripepy/stripepy.py index a17d5c9..fccd559 100644 --- a/src/stripepy/stripepy.py +++ b/src/stripepy/stripepy.py @@ -402,35 +402,45 @@ def step_3( LT_bounded_mPs = np.concatenate(LT_bounded_mPs, dtype=int) UT_bounded_mPs = np.concatenate(UT_bounded_mPs, dtype=int) - # List of pairs (pair = left and right boundaries): - # Choose the variable criterion between max_ascent and max_perc_descent - # ---> When variable criterion is set to max_ascent, set the variable max_ascent - # ---> When variable criterion is set to max_perc_descent, set the variable max_perc_descent + # DataFrame with the left and right boundaries for each seed site LT_HIoIs = finders.find_HIoIs( - LT_pseudo_distrib, LT_MPs, LT_bounded_mPs, int(max_width / (2 * resolution)) + 1, map=map + pseudodistribution=LT_pseudo_distrib, + seed_sites=LT_MPs, + seed_site_bounds=LT_bounded_mPs, + max_width=int(max_width / (2 * resolution)) + 1, + map_=map, + logger=logger, ) UT_HIoIs = finders.find_HIoIs( - UT_pseudo_distrib, UT_MPs, UT_bounded_mPs, int(max_width / (2 * resolution)) + 1, map=map + pseudodistribution=UT_pseudo_distrib, + seed_sites=UT_MPs, + seed_site_bounds=UT_bounded_mPs, + max_width=int(max_width / (2 * resolution)) + 1, + map_=map, + logger=logger, ) - # List of left or right boundaries: - LT_L_bounds, LT_R_bounds = map(list, zip(*LT_HIoIs)) - UT_L_bounds, UT_R_bounds = map(list, zip(*UT_HIoIs)) - logger.bind(step=(3, 1, 2)).info("updating candidate stripes with width information") stripes = result.get("stripes", "LT") - for num_cand_stripe, (LT_L_bound, LT_R_bound) in enumerate(zip(LT_L_bounds, LT_R_bounds)): - stripes[num_cand_stripe].set_horizontal_bounds(LT_L_bound, LT_R_bound) + LT_HIoIs.apply( + lambda seed: stripes[seed.name].set_horizontal_bounds(seed["left_bound"], seed["right_bound"]), + axis="columns", + ) stripes = result.get("stripes", "UT") - for num_cand_stripe, (UT_L_bound, UT_R_bound) in enumerate(zip(UT_L_bounds, UT_R_bounds)): - stripes[num_cand_stripe].set_horizontal_bounds(UT_L_bound, UT_R_bound) + UT_HIoIs.apply( + lambda seed: stripes[seed.name].set_horizontal_bounds(seed["left_bound"], seed["right_bound"]), + axis="columns", + ) logger.bind(step=(3, 1)).info("width estimation took %s", common.pretty_format_elapsed_time(start_time)) logger.bind(step=(3, 2)).info("height estimation") start_time = time.time() + LT_HIoIs = LT_HIoIs.to_numpy() # TODO remove + UT_HIoIs = UT_HIoIs.to_numpy() # TODO remove + logger.bind(step=(3, 2, 1)).info("estimating candidate stripe heights") LT_VIoIs, LT_peaks_ids = finders.find_VIoIs( L, diff --git a/src/stripepy/utils/finders.py b/src/stripepy/utils/finders.py index 37a72fd..d10f730 100644 --- a/src/stripepy/utils/finders.py +++ b/src/stripepy/utils/finders.py @@ -2,41 +2,58 @@ # # SPDX-License-Identifier: MIT +import itertools +import time from functools import partial from typing import List, Optional, Tuple import numpy as np +import numpy.typing as npt +import pandas as pd +import scipy.sparse as ss +import structlog from . import TDA +from .common import pretty_format_elapsed_time from .regressions import _compute_wQISA_predictions -def find_horizontal_domain(pd, coarse_h_domain, max_width=1e9): +def find_horizontal_domain( + profile: npt.NDArray[float], + coarse_h_domain: Tuple[int, int, int], + max_width: int = 1e9, +) -> Tuple[int, int]: + """ + Returns + ------- + Tuple[int, int] + the left and right coordinates of the horizontal domain + """ # Unpacking: MP, L_mP, R_mP = coarse_h_domain # Left and sides of candidate: - L_interval = np.flip(pd[L_mP : MP + 1]) - R_interval = pd[MP : R_mP + 1] + L_interval = np.flip(profile[L_mP : MP + 1]) + R_interval = profile[MP : R_mP + 1] # LEFT INTERVAL - L_interval_shifted = np.append(L_interval[1:], [max(pd) + 1], axis=0) + L_interval_shifted = np.append(L_interval[1:], [max(profile) + 1], axis=0) L_bound = np.where(L_interval - L_interval_shifted < 0)[0][0] + 1 # L_interval_restr = L_interval[:L_bound] # L_interval_shifted_restr = L_interval_shifted[:L_bound] # L_bound = np.argmax(L_interval_restr - L_interval_shifted_restr) + 1 - L_bound = min(L_bound, max_width) + L_bound = np.minimum(L_bound, max_width) # RIGHT INTERVAL - R_interval_shifted = np.append(R_interval[1:], [max(pd) + 1], axis=0) + R_interval_shifted = np.append(R_interval[1:], [max(profile) + 1], axis=0) R_bound = np.where(R_interval - R_interval_shifted < 0)[0][0] + 1 # R_interval_restr = R_interval[:R_bound] # R_interval_shifted_restr = R_interval_shifted[:R_bound] # R_bound = np.argmax(R_interval_restr - R_interval_shifted_restr) + 1 - R_bound = min(R_bound, max_width) + R_bound = np.minimum(R_bound, max_width) - return [max(MP - L_bound, 0), min(MP + R_bound, len(pd))] + return max(MP - L_bound, 0), min(MP + R_bound, len(profile)) def find_lower_v_domain(I, threshold_cut, max_height, min_persistence, it) -> Tuple[List, Optional[List]]: @@ -114,38 +131,51 @@ def find_upper_v_domain(I, threshold_cut, max_height, min_persistence, it) -> Tu return [seed_site - candida_bound[0], seed_site], list(seed_site - np.array(loc_Maxima[:-1])) -def find_HIoIs(pd, seed_sites, seed_site_bounds, max_width, map=map): +def find_HIoIs( + pseudodistribution: npt.NDArray[float], + seed_sites: npt.NDArray[int], + seed_site_bounds: npt.NDArray[int], + max_width: int, + map_=map, + logger=None, +) -> pd.DataFrame: """ - :param pd: acronym for pseudo-distribution, but can be any 1D array representing a uniformly-sample - scalar function works + :param pseudodistribution: 1D array representing a uniformly-sample scalar function works :param seed_sites: maximum values in the pseudo-distribution (i.e., genomic coordinates hosting linear patterns) :param seed_site_bounds: for the i-th entry of seed_sites: (*) seed_site_bounds[i] is the left boundary (*) seed_site_bounds[i+1] is the right boundary :param max_width: maximum width allowed - :param map: alternative implementation of the built-in map function. Can be used to e.g. run this step in parallel by passing multiprocessing.Pool().map. + :param map_: alternative implementation of the built-in map function. Can be used to e.g. run this step in parallel by passing multiprocessing.Pool().map. :return: - HIoIs list of lists, where each sublist is a pair consisting of the left and right boundaries + HIoIs a pd.DataFrame the list of left and right boundary for each seed site """ assert len(seed_site_bounds) == len(seed_sites) + 1 + t0 = time.time() + if logger is None: + logger = structlog.get_logger() + iterable_input = [ (seed_site, seed_site_bounds[num_MP], seed_site_bounds[num_MP + 1]) for num_MP, seed_site in enumerate(seed_sites) ] - HIoIs = list(map(partial(find_horizontal_domain, pd, max_width=max_width), iterable_input)) + tasks = map_(partial(find_horizontal_domain, pseudodistribution, max_width=max_width), iterable_input) + # This efficiently constructs a 2D numpy with shape (N, 2) from a list of 2-element tuples, where N is the number of seed sites. + # The first and second columns contains the left and right boundaries of the horizontal domains, respectively. + HIoIs = np.fromiter(itertools.chain.from_iterable(tasks), count=2 * len(seed_sites), dtype=int).reshape(-1, 2) + + # Handle possible overlapping intervals by ensuring that the + # left bound of interval i + 1 is always greater or equal than the right bound of interval i + HIoIs[1:, 0] = np.maximum(HIoIs[1:, 0], HIoIs[:-1, 1]) - # Handle possible overlapping intervals: - for i in range(len(HIoIs) - 1): - current_pair = HIoIs[i] - next_pair = HIoIs[i + 1] + df = pd.DataFrame(data=HIoIs, columns=["left_bound", "right_bound"]) - if current_pair[1] > next_pair[0]: # Check for intersection - next_pair[0] = current_pair[1] # Modify the second pair + logger.debug("find_HIoIs took %s", pretty_format_elapsed_time(time.time() - t0)) - return HIoIs + return df def find_VIoIs(