Skip to content

Commit

Permalink
Optimize find_HIoIs()
Browse files Browse the repository at this point in the history
The optimization result in a 10% improvement in single-threaded performance
  • Loading branch information
robomics committed Jan 9, 2025
1 parent 8e96618 commit e827f2f
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 35 deletions.
38 changes: 24 additions & 14 deletions src/stripepy/stripepy.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,35 +402,45 @@ def step_3(
LT_bounded_mPs = np.concatenate(LT_bounded_mPs, dtype=int)
UT_bounded_mPs = np.concatenate(UT_bounded_mPs, dtype=int)

# List of pairs (pair = left and right boundaries):
# Choose the variable criterion between max_ascent and max_perc_descent
# ---> When variable criterion is set to max_ascent, set the variable max_ascent
# ---> When variable criterion is set to max_perc_descent, set the variable max_perc_descent
# DataFrame with the left and right boundaries for each seed site
LT_HIoIs = finders.find_HIoIs(
LT_pseudo_distrib, LT_MPs, LT_bounded_mPs, int(max_width / (2 * resolution)) + 1, map=map
pseudodistribution=LT_pseudo_distrib,
seed_sites=LT_MPs,
seed_site_bounds=LT_bounded_mPs,
max_width=int(max_width / (2 * resolution)) + 1,
map_=map,
logger=logger,
)
UT_HIoIs = finders.find_HIoIs(
UT_pseudo_distrib, UT_MPs, UT_bounded_mPs, int(max_width / (2 * resolution)) + 1, map=map
pseudodistribution=UT_pseudo_distrib,
seed_sites=UT_MPs,
seed_site_bounds=UT_bounded_mPs,
max_width=int(max_width / (2 * resolution)) + 1,
map_=map,
logger=logger,
)

# List of left or right boundaries:
LT_L_bounds, LT_R_bounds = map(list, zip(*LT_HIoIs))
UT_L_bounds, UT_R_bounds = map(list, zip(*UT_HIoIs))

logger.bind(step=(3, 1, 2)).info("updating candidate stripes with width information")
stripes = result.get("stripes", "LT")
for num_cand_stripe, (LT_L_bound, LT_R_bound) in enumerate(zip(LT_L_bounds, LT_R_bounds)):
stripes[num_cand_stripe].set_horizontal_bounds(LT_L_bound, LT_R_bound)
LT_HIoIs.apply(
lambda seed: stripes[seed.name].set_horizontal_bounds(seed["left_bound"], seed["right_bound"]),
axis="columns",
)

stripes = result.get("stripes", "UT")
for num_cand_stripe, (UT_L_bound, UT_R_bound) in enumerate(zip(UT_L_bounds, UT_R_bounds)):
stripes[num_cand_stripe].set_horizontal_bounds(UT_L_bound, UT_R_bound)
UT_HIoIs.apply(
lambda seed: stripes[seed.name].set_horizontal_bounds(seed["left_bound"], seed["right_bound"]),
axis="columns",
)

logger.bind(step=(3, 1)).info("width estimation took %s", common.pretty_format_elapsed_time(start_time))

logger.bind(step=(3, 2)).info("height estimation")
start_time = time.time()

LT_HIoIs = LT_HIoIs.to_numpy() # TODO remove
UT_HIoIs = UT_HIoIs.to_numpy() # TODO remove

logger.bind(step=(3, 2, 1)).info("estimating candidate stripe heights")
LT_VIoIs, LT_peaks_ids = finders.find_VIoIs(
L,
Expand Down
72 changes: 51 additions & 21 deletions src/stripepy/utils/finders.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,58 @@
#
# SPDX-License-Identifier: MIT

import itertools
import time
from functools import partial
from typing import List, Optional, Tuple

import numpy as np
import numpy.typing as npt
import pandas as pd
import scipy.sparse as ss
import structlog

from . import TDA
from .common import pretty_format_elapsed_time
from .regressions import _compute_wQISA_predictions


def find_horizontal_domain(pd, coarse_h_domain, max_width=1e9):
def find_horizontal_domain(
profile: npt.NDArray[float],
coarse_h_domain: Tuple[int, int, int],
max_width: int = 1e9,
) -> Tuple[int, int]:
"""
Returns
-------
Tuple[int, int]
the left and right coordinates of the horizontal domain
"""

# Unpacking:
MP, L_mP, R_mP = coarse_h_domain

# Left and sides of candidate:
L_interval = np.flip(pd[L_mP : MP + 1])
R_interval = pd[MP : R_mP + 1]
L_interval = np.flip(profile[L_mP : MP + 1])
R_interval = profile[MP : R_mP + 1]

# LEFT INTERVAL
L_interval_shifted = np.append(L_interval[1:], [max(pd) + 1], axis=0)
L_interval_shifted = np.append(L_interval[1:], [max(profile) + 1], axis=0)
L_bound = np.where(L_interval - L_interval_shifted < 0)[0][0] + 1
# L_interval_restr = L_interval[:L_bound]
# L_interval_shifted_restr = L_interval_shifted[:L_bound]
# L_bound = np.argmax(L_interval_restr - L_interval_shifted_restr) + 1
L_bound = min(L_bound, max_width)
L_bound = np.minimum(L_bound, max_width)

# RIGHT INTERVAL
R_interval_shifted = np.append(R_interval[1:], [max(pd) + 1], axis=0)
R_interval_shifted = np.append(R_interval[1:], [max(profile) + 1], axis=0)
R_bound = np.where(R_interval - R_interval_shifted < 0)[0][0] + 1
# R_interval_restr = R_interval[:R_bound]
# R_interval_shifted_restr = R_interval_shifted[:R_bound]
# R_bound = np.argmax(R_interval_restr - R_interval_shifted_restr) + 1
R_bound = min(R_bound, max_width)
R_bound = np.minimum(R_bound, max_width)

return [max(MP - L_bound, 0), min(MP + R_bound, len(pd))]
return max(MP - L_bound, 0), min(MP + R_bound, len(profile))


def find_lower_v_domain(I, threshold_cut, max_height, min_persistence, it) -> Tuple[List, Optional[List]]:
Expand Down Expand Up @@ -114,38 +131,51 @@ def find_upper_v_domain(I, threshold_cut, max_height, min_persistence, it) -> Tu
return [seed_site - candida_bound[0], seed_site], list(seed_site - np.array(loc_Maxima[:-1]))


def find_HIoIs(pd, seed_sites, seed_site_bounds, max_width, map=map):
def find_HIoIs(
pseudodistribution: npt.NDArray[float],
seed_sites: npt.NDArray[int],
seed_site_bounds: npt.NDArray[int],
max_width: int,
map_=map,
logger=None,
) -> pd.DataFrame:
"""
:param pd: acronym for pseudo-distribution, but can be any 1D array representing a uniformly-sample
scalar function works
:param pseudodistribution: 1D array representing a uniformly-sample scalar function works
:param seed_sites: maximum values in the pseudo-distribution (i.e., genomic coordinates hosting linear
patterns)
:param seed_site_bounds: for the i-th entry of seed_sites:
(*) seed_site_bounds[i] is the left boundary
(*) seed_site_bounds[i+1] is the right boundary
:param max_width: maximum width allowed
:param map: alternative implementation of the built-in map function. Can be used to e.g. run this step in parallel by passing multiprocessing.Pool().map.
:param map_: alternative implementation of the built-in map function. Can be used to e.g. run this step in parallel by passing multiprocessing.Pool().map.
:return:
HIoIs list of lists, where each sublist is a pair consisting of the left and right boundaries
HIoIs a pd.DataFrame the list of left and right boundary for each seed site
"""
assert len(seed_site_bounds) == len(seed_sites) + 1

t0 = time.time()
if logger is None:
logger = structlog.get_logger()

iterable_input = [
(seed_site, seed_site_bounds[num_MP], seed_site_bounds[num_MP + 1])
for num_MP, seed_site in enumerate(seed_sites)
]

HIoIs = list(map(partial(find_horizontal_domain, pd, max_width=max_width), iterable_input))
tasks = map_(partial(find_horizontal_domain, pseudodistribution, max_width=max_width), iterable_input)
# This efficiently constructs a 2D numpy with shape (N, 2) from a list of 2-element tuples, where N is the number of seed sites.
# The first and second columns contains the left and right boundaries of the horizontal domains, respectively.
HIoIs = np.fromiter(itertools.chain.from_iterable(tasks), count=2 * len(seed_sites), dtype=int).reshape(-1, 2)

# Handle possible overlapping intervals by ensuring that the
# left bound of interval i + 1 is always greater or equal than the right bound of interval i
HIoIs[1:, 0] = np.maximum(HIoIs[1:, 0], HIoIs[:-1, 1])

# Handle possible overlapping intervals:
for i in range(len(HIoIs) - 1):
current_pair = HIoIs[i]
next_pair = HIoIs[i + 1]
df = pd.DataFrame(data=HIoIs, columns=["left_bound", "right_bound"])

if current_pair[1] > next_pair[0]: # Check for intersection
next_pair[0] = current_pair[1] # Modify the second pair
logger.debug("find_HIoIs took %s", pretty_format_elapsed_time(time.time() - t0))

return HIoIs
return df


def find_VIoIs(
Expand Down

0 comments on commit e827f2f

Please sign in to comment.