Skip to content

Commit

Permalink
Merge pull request #315 from kjsanger/fix/performance
Browse files Browse the repository at this point in the history
Fix performance issue where debug code was left in production
  • Loading branch information
kjsanger authored May 23, 2024
2 parents 7ec482b + fa2b4af commit ad97422
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 28 deletions.
1 change: 0 additions & 1 deletion src/npg_irods/cli/update_secondary_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@

import sqlalchemy
import structlog
from sqlalchemy.orm import Session

from npg_irods.cli.util import (
add_db_config_arguments,
Expand Down
51 changes: 24 additions & 27 deletions src/npg_irods/illumina.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum, unique
from functools import lru_cache
from pathlib import PurePath
from typing import Iterator, Optional, Type

Expand Down Expand Up @@ -352,6 +353,8 @@ def find_associated_components(item: DataObject | Collection) -> list[Component]
)
item_stem, item_suffix = split_name(item.name)

log.info("Finding components", path=item, stem=item_stem, suffix=item_suffix)

# The item itself holds the associated metadata (true for BAM and CRAM files)
if item_suffix.casefold() in [".bam", ".cram"]:
return [Component.from_avu(avu) for avu in item.metadata(SeqConcept.COMPONENT)]
Expand All @@ -363,43 +366,37 @@ def find_associated_components(item: DataObject | Collection) -> list[Component]
else:
coll = Collection(item.path.parent)

log.info("Looking in associated collection", path=coll)

if not coll.exists():
raise CollectionNotFound(
f"{errmsg} in this collection (path does not exist)", path=coll
)

bams, crams = [], []
for obj in coll.iter_contents():
if obj.rods_type != DataObject:
continue

stem, suffix = split_name(obj.name)
if stem != item_stem:
continue

# Alternatively we could use the "type" AVU to determine the type of data
if suffix.casefold() == ".bam":
bams.append(obj)
elif suffix.casefold() == ".cram":
crams.append(obj)

associated = crams if len(crams) > 0 else bams

if len(associated) == 0:
raise DataObjectNotFound(f"{errmsg} for {item} in {coll}", path=item)
if len(associated) > 1:
raise NonUniqueError(
f"{errmsg}. Multiple associated data objects for {item} "
f"found in {coll}: {associated}",
path=item,
observed=associated,
)
obj = _find_associated_am_file(coll, item_stem)

obj = associated.pop()
log.debug("LRU cache stats", cache_info=_find_associated_am_file.cache_info())

return [Component.from_avu(avu) for avu in obj.metadata(SeqConcept.COMPONENT)]


@lru_cache(maxsize=1024)
def _find_associated_am_file(coll: Collection, stem: str) -> DataObject:
    """Return the CRAM or BAM data object named after a stem in a collection.

    Two candidate paths are considered directly under `coll`: `<stem>.cram`
    and `<stem>.bam`. The CRAM candidate is preferred; the BAM candidate is
    returned only when the CRAM one does not exist.

    Successful lookups are memoised by `lru_cache` (up to 1024 entries) keyed
    on the `(coll, stem)` argument pair, so both arguments must be hashable
    and a repeated call returns the same cached object. A call that raises is
    not cached.

    Args:
        coll: The collection expected to contain the alignment file.
        stem: The file name without its suffix.

    Returns:
        The `<stem>.cram` data object if it exists, otherwise the
        `<stem>.bam` data object if that exists.

    Raises:
        DataObjectNotFound: If neither candidate exists.
    """
    # Both candidates are built up front, CRAM first, so that the existence
    # checks below run in order of preference.
    candidates = (
        DataObject(coll.path / f"{stem}.cram"),
        DataObject(coll.path / f"{stem}.bam"),
    )

    for candidate in candidates:
        if candidate.exists():
            return candidate

    raise DataObjectNotFound(
        "Failed to find an associated data object bearing "
        f"component metadata for stem {stem} in {coll}"
    )


def requires_full_metadata(obj: DataObject) -> bool:
"""Return True if the given data object requires full metadata.
Expand Down

0 comments on commit ad97422

Please sign in to comment.