From 623795d5403021afc85ca94b5af942d709afff0c Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Fri, 10 Jan 2025 14:03:11 -0500 Subject: [PATCH 1/5] added method that retrieves missing stats and files --- bbconf/_version.py | 2 +- bbconf/bbagent.py | 1 - bbconf/modules/bedfiles.py | 59 ++++++++++++++++++++++++++++++++++++-- docs/changelog.md | 6 ++++ 4 files changed, 64 insertions(+), 4 deletions(-) diff --git a/bbconf/_version.py b/bbconf/_version.py index 17c1a62..b2385cb 100644 --- a/bbconf/_version.py +++ b/bbconf/_version.py @@ -1 +1 @@ -__version__ = "0.10.2" +__version__ = "0.10.3" diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index 1ea6d26..b1cc620 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -1,5 +1,4 @@ import logging - from functools import cached_property from pathlib import Path from typing import List, Union diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 20f7bb6..17da1ef 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -6,18 +6,18 @@ import numpy as np from geniml.bbclient import BBClient from geniml.io import RegionSet +from geniml.search.backends import QdrantBackend from gtars.tokenizers import RegionSet as GRegionSet from pephubclient.exceptions import ResponseError from pydantic import BaseModel +from qdrant_client.http.models import PointStruct from qdrant_client.models import Distance, PointIdsList, VectorParams from sqlalchemy import and_, delete, func, or_, select from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session, aliased from tqdm import tqdm -from qdrant_client.http.models import PointStruct from bbconf.config_parser.bedbaseconfig import BedBaseConfig -from geniml.search.backends import QdrantBackend from bbconf.const import DEFAULT_LICENSE, PKG_NAME, ZARR_TOKENIZED_FOLDER from bbconf.db_utils import ( Bed, @@ -1645,6 +1645,61 @@ def get_missing_plots( return results + def get_missing_stats(self, limit: int = 1000, offset: int = 0) -> List[str]: + """ + Get list of bed files that are missing statistics + + :param limit: number of results to return + :param offset: offset to start from + + :return: list of bed file identifiers + """ + + with Session(self._sa_engine) as session: + query = ( + select(BedStats) + .where(BedStats.number_of_regions.is_(None)) + .limit(limit) + .offset(offset) + ) + + results = session.scalars(query) + + results = [result.id for result in results] + + return results + + def get_missing_files(self, limit: int = 1000, offset: int = 0) -> List[str]: + """ + Get list of bed files that are missing files (bigBed files) + + :param limit: number of results to return + :param offset: offset to start from + + :return: list of bed file identifiers + """ + + with Session(self._sa_engine) as session: + # Alias for subquery + t2_alias = aliased(Files) + + # Define the subquery + subquery = select(t2_alias).where(t2_alias.name == "bigbed_file").subquery() + + query = ( + select(Bed.id) + .outerjoin(subquery, Bed.id == subquery.c.bedfile_id) + .where(subquery.c.bedfile_id.is_(None)) + .limit(limit) + .offset(offset) + ) + + results = session.scalars(query) + + results = [result for result in results] + + return results + def get_unprocessed(self, limit: int = 1000, offset: int = 0) -> BedListResult: """ Get bed files that are not processed. diff --git a/docs/changelog.md b/docs/changelog.md index 2da7241..c89245c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +## [0.10.3] - 2025-XX-XX + +### Added: +- Get missing stats and files in bedfiles + + ## [0.10.2] - 2025-01-09 ### Changed: From 1d035373739349bbd51f68288f1dc3be3547fa94 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Fri, 10 Jan 2025 15:32:14 -0500 Subject: [PATCH 2/5] Added validate bedbase config --- bbconf/config_parser/utils.py | 65 +++++++++++++++++++++++++++++++++++ docs/changelog.md | 1 + manual_testing.py | 12 ++++++- 3 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 bbconf/config_parser/utils.py diff --git a/bbconf/config_parser/utils.py b/bbconf/config_parser/utils.py new file mode 100644 index 0000000..0707fe3 --- /dev/null +++ b/bbconf/config_parser/utils.py @@ -0,0 +1,65 @@ +import yacman +from pephubclient.helpers import MessageHandler as m +from pydantic_core._pydantic_core import ValidationError + +from bbconf.config_parser.models import ConfigFile +from bbconf.exceptions import BedBaseConfError +from bbconf.helpers import get_bedbase_cfg + + +def config_analyzer(config_path: str) -> bool: + """ + Read configuration file and insert default values if not set + + :param config_path: configuration file path + :return: None + :raises: raise_missing_key (if config key is missing) + """ + config_path = get_bedbase_cfg(config_path) + + print("Analyzing the configuration file {config_path}...") + + _config = yacman.YAMLConfigManager(filepath=config_path).exp + + config_dict = {} + for field_name, annotation in ConfigFile.model_fields.items(): + try: + config_dict[field_name] = annotation.annotation(**_config.get(field_name)) + except TypeError: + if annotation.is_required(): + print( + str( + BedBaseConfError( + f"`Config info: {field_name}` Field is not set in the configuration file or missing. " + ) + ) + ) + else: + print( + f"Config info: `{field_name}` Field is not set in the configuration file. Using default value." + ) + return False + try: + config_dict[field_name] = annotation.annotation() + except ValidationError as e: + print( + str( + BedBaseConfError( + f"Error in provided configuration file. Section: `{field_name}` missing values :: \n {e}" + ) + ) + ) + return False + except ValidationError as e: + print( + str( + BedBaseConfError( + f"Error in provided configuration file. Section: `{field_name}` missing values :: \n {e}" + ) + ) + ) + return False + + m.print_success("Configuration file is valid! ") + + return True diff --git a/docs/changelog.md b/docs/changelog.md index c89245c..9c931af 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -6,6 +6,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ### Added: - Get missing stats and files in bedfiles +- Config analyzer for quick analysis if the config is correct ## [0.10.2] - 2025-01-09 diff --git a/manual_testing.py b/manual_testing.py index ea814bd..2a732c0 100644 --- a/manual_testing.py +++ b/manual_testing.py @@ -201,6 +201,14 @@ def sql_search(): results +def config_t(): + from bbconf.config_parser.utils import config_analyzer + + is_valid = config_analyzer("/home/bnt4me/virginia/repos/bbconf/config.yaml") + + print(is_valid) + + if __name__ == "__main__": # zarr_s3() # add_s3() @@ -209,4 +217,6 @@ def sql_search(): # get_pep() # get_id_plots_missing() # neighbour_beds() - sql_search() + # sql_search() + + config_t() From 570287ea49123c8abe32b3d12583d618de760272 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Sat, 11 Jan 2025 01:27:30 -0500 Subject: [PATCH 3/5] Fixed bug in config validation --- bbconf/config_parser/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bbconf/config_parser/utils.py b/bbconf/config_parser/utils.py index 0707fe3..ded7b0d 100644 --- a/bbconf/config_parser/utils.py +++ b/bbconf/config_parser/utils.py @@ -38,7 +38,6 @@ def config_analyzer(config_path: str) -> bool: print( f"Config info: `{field_name}` Field is not set in the configuration file. Using default value." ) - return False try: config_dict[field_name] = annotation.annotation() except ValidationError as e: From c8a27335caa703930133ae54c8a71b429136768b Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 13 Jan 2025 16:06:19 -0500 Subject: [PATCH 4/5] Fixed bug in config validation 2 --- bbconf/config_parser/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bbconf/config_parser/utils.py b/bbconf/config_parser/utils.py index ded7b0d..0184eca 100644 --- a/bbconf/config_parser/utils.py +++ b/bbconf/config_parser/utils.py @@ -39,7 +39,7 @@ def config_analyzer(config_path: str) -> bool: f"Config info: `{field_name}` Field is not set in the configuration file. Using default value." ) try: - config_dict[field_name] = annotation.annotation() + config_dict[field_name] = None except ValidationError as e: print( str( From 4527edad7f1ff753210167d28766cd8a2af57c0d Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 16 Jan 2025 12:27:59 -0500 Subject: [PATCH 5/5] Fixed #76 --- bbconf/modules/bedfiles.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 17da1ef..54d9abc 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -674,8 +674,6 @@ def update( """ Update bed file to the database. - !! WARNING: this method is in development. Please, void of using it! - :param identifier: bed file identifier :param stats: bed file results {statistics, plots, files, metadata} :param metadata: bed file metadata (will be saved in pephub) @@ -725,9 +723,18 @@ def update( _LOGGER.info("upload_pephub set to false. Skipping pephub..") if upload_qdrant: - self.upload_file_qdrant( - identifier, files.bed_file.path, payload=metadata.model_dump() - ) + if classification.genome_alias == "hg38": + _LOGGER.info(f"Uploading bed file to qdrant.. [{identifier}]") + self.upload_file_qdrant( + identifier, + files.bed_file.path, + bed_metadata.model_dump(exclude_none=False), + ) + _LOGGER.info(f"File uploaded to qdrant. {identifier}") + else: + _LOGGER.warning( + f"Could not upload to qdrant. Genome: {classification.genome_alias} is not supported." + ) with Session(self._sa_engine) as session: bed_statement = select(Bed).where(and_(Bed.id == identifier))