From f48973791bbefba084d0258e6955c4de3a5ddc2b Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 28 Oct 2024 12:57:21 -0400 Subject: [PATCH 01/11] Fixed uploading of tss dist plot --- bbconf/_version.py | 2 +- bbconf/models/bed_models.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bbconf/_version.py b/bbconf/_version.py index 777f190..8088f75 100644 --- a/bbconf/_version.py +++ b/bbconf/_version.py @@ -1 +1 @@ -__version__ = "0.8.0" +__version__ = "0.8.1" diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py index 442ea3e..3edf0c9 100644 --- a/bbconf/models/bed_models.py +++ b/bbconf/models/bed_models.py @@ -17,6 +17,7 @@ class BedPlots(BaseModel): widths_histogram: FileModel = None neighbor_distances: FileModel = None open_chromatin: FileModel = None + tss_distance: FileModel = None model_config = ConfigDict(extra="ignore") From 58c384f8f825f9dabe559b6c2f5be71a8eef21af Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Tue, 29 Oct 2024 16:31:33 -0400 Subject: [PATCH 02/11] Fixed https://github.com/databio/bedhost/issues/148 --- bbconf/db_utils.py | 19 +++++++++++++++++++ bbconf/models/bedset_models.py | 5 +++++ bbconf/modules/bedsets.py | 8 ++++++++ 3 files changed, 32 insertions(+) diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index 054520b..a2e51ad 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -300,6 +300,25 @@ class BedSets(Base): files: Mapped[List["Files"]] = relationship("Files", back_populates="bedset") universe: Mapped["Universes"] = relationship("Universes", back_populates="bedset") + annotations: Mapped["BedSetMetadata"] = relationship( + back_populates="bedset", cascade="all, delete-orphan", lazy="joined" + ) + + +class BedSetMetadata(Base): + __tablename__ = "bedsets_metadata" + + id: Mapped[str] = mapped_column( + ForeignKey("bedsets.id", ondelete="CASCADE"), + primary_key=True, + index=True, + ) + + author: Mapped[str] = mapped_column(nullable=True, comment="Author of the bedset") + 
soruce: Mapped[str] = mapped_column(nullable=True, comment="Source of the bedset") + + bedset: Mapped["Bed"] = relationship("BedSets", back_populates="annotations") + class Universes(Base): __tablename__ = "universes" diff --git a/bbconf/models/bedset_models.py b/bbconf/models/bedset_models.py index 9102814..9d38234 100644 --- a/bbconf/models/bedset_models.py +++ b/bbconf/models/bedset_models.py @@ -1,4 +1,5 @@ from typing import List, Union +import datetime from pydantic import BaseModel, ConfigDict, model_validator @@ -21,10 +22,14 @@ class BedSetMetadata(BaseModel): id: str name: str md5sum: str + submission_date: datetime.datetime = None + last_update_date: datetime.datetime = None statistics: Union[BedSetStats, None] = None plots: Union[BedSetPlots, None] = None description: str = None bed_ids: List[str] = None + author: Union[str, None] = None + source: Union[str, None] = None class BedSetListResult(BaseModel): diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 3ea050e..7984788 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -77,6 +77,14 @@ def get(self, identifier: str, full: bool = False) -> BedSetMetadata: statistics=stats, plots=plots, bed_ids=list_of_bedfiles, + submission_date=bedset_obj.submission_date, + last_update_date=bedset_obj.last_update_date, + author=( + bedset_obj.annotations.author if bedset_obj.annotations else None + ), + source=( + bedset_obj.annotations.source if bedset_obj.annotations else None + ), ) return bedset_metadata From 39a008bc2e9967a40172cd1eb5224fadc349be83 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 30 Oct 2024 12:39:23 -0400 Subject: [PATCH 03/11] updated metadata for bedset --- bbconf/db_utils.py | 18 +----------------- bbconf/modules/bedsets.py | 8 ++------ 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index a2e51ad..a843e0b 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -300,24 +300,8 @@ 
class BedSets(Base): files: Mapped[List["Files"]] = relationship("Files", back_populates="bedset") universe: Mapped["Universes"] = relationship("Universes", back_populates="bedset") - annotations: Mapped["BedSetMetadata"] = relationship( - back_populates="bedset", cascade="all, delete-orphan", lazy="joined" - ) - - -class BedSetMetadata(Base): - __tablename__ = "bedsets_metadata" - - id: Mapped[str] = mapped_column( - ForeignKey("bedsets.id", ondelete="CASCADE"), - primary_key=True, - index=True, - ) - author: Mapped[str] = mapped_column(nullable=True, comment="Author of the bedset") - soruce: Mapped[str] = mapped_column(nullable=True, comment="Source of the bedset") - - bedset: Mapped["Bed"] = relationship("BedSets", back_populates="annotations") + source: Mapped[str] = mapped_column(nullable=True, comment="Source of the bedset") class Universes(Base): diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 7984788..c02d30f 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -79,12 +79,8 @@ def get(self, identifier: str, full: bool = False) -> BedSetMetadata: bed_ids=list_of_bedfiles, submission_date=bedset_obj.submission_date, last_update_date=bedset_obj.last_update_date, - author=( - bedset_obj.annotations.author if bedset_obj.annotations else None - ), - source=( - bedset_obj.annotations.source if bedset_obj.annotations else None - ), + author=bedset_obj.author, + source=bedset_obj.source, ) return bedset_metadata From 5af694de1107e2520783a1fc2fcaec17484adbe3 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 30 Oct 2024 13:15:37 -0400 Subject: [PATCH 04/11] added functionality for saving bedset annotation to the database --- bbconf/modules/bedsets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index c02d30f..0813c60 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -204,6 +204,8 @@ def get_bedset_pep(self, identifier: str) -> 
dict: "name": bedset.id, "description": bedset.description, "md5sum": bedset.md5sum, + "author": bedset.author, + "source": bedset.source, } return { @@ -219,6 +221,7 @@ def create( bedid_list: List[str], description: str = None, statistics: bool = False, + annotation: dict = None, plots: dict = None, upload_pephub: bool = False, upload_s3: bool = False, @@ -234,6 +237,7 @@ def create( :param description: bedset description :param bedid_list: list of bed file identifiers :param statistics: calculate statistics for bedset + :param annotation: bedset annotation (author, source) :param plots: dictionary with plots :param upload_pephub: upload bedset to pephub (create view in pephub) :param upload_s3: upload bedset to s3 @@ -253,6 +257,9 @@ def create( raise BedSetExistsError(identifier) self.delete(identifier) + if not isinstance(annotation, dict): + annotation = {} + if upload_pephub: try: self._create_pephub_view(identifier, description, bedid_list, no_fail) @@ -268,6 +275,8 @@ def create( bedset_means=stats.mean.model_dump() if stats else None, bedset_standard_deviation=stats.sd.model_dump() if stats else None, md5sum=compute_md5sum_bedset(bedid_list), + author=annotation.get("author"), + source=annotation.get("source"), ) if upload_s3: From 95471ce4cf2a526cc75dfc42b3d919cf8cbc93d1 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 30 Oct 2024 13:16:02 -0400 Subject: [PATCH 05/11] updated geniml requirements --- requirements/requirements-all.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index cd2e043..01f63ad 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,6 +1,6 @@ yacman >= 0.9.1 sqlalchemy >= 2.0.0 -geniml[ml] >= 0.5.1 +geniml[ml] >= 0.5.2 psycopg >= 3.1.15 colorlogs pydantic >= 2.9.0 From c3d2fa97b3e02cd187e7e3c69de1481869c7d6ff Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 31 Oct 2024 12:03:18 -0400 Subject: 
[PATCH 06/11] added annotation of bedfile in bedset_bedfiles function --- bbconf/modules/bedsets.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 0813c60..3f46bb9 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -9,7 +9,7 @@ from bbconf.const import PKG_NAME from bbconf.db_utils import Bed, BedFileBedSetRelation, BedSets, BedStats, Files from bbconf.exceptions import BedSetExistsError, BedSetNotFoundError -from bbconf.models.bed_models import BedStatsModel +from bbconf.models.bed_models import BedStatsModel, StandardMeta from bbconf.models.bedset_models import ( BedMetadataBasic, BedSetBedFiles, @@ -447,7 +447,16 @@ def get_bedset_bedfiles(self, identifier: str) -> BedSetBedFiles: with Session(self._db_engine.engine) as session: bedfiles_list = session.scalars(statement) results = [ - BedMetadataBasic(**bedfile_obj.__dict__) + BedMetadataBasic( + **bedfile_obj.__dict__, + annotation=StandardMeta( + **( + bedfile_obj.annotations.__dict__ + if bedfile_obj.annotations + else {} + ) + ), + ) for bedfile_obj in bedfiles_list ] From b7fd591a2fc963e2995a5731c57c33e6b19c4297 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Sun, 3 Nov 2024 21:38:12 -0500 Subject: [PATCH 07/11] updated reindex metadata --- bbconf/modules/bedfiles.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index bad4f4d..bb860f4 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -880,17 +880,19 @@ def reindex_qdrant(self) -> None: bed_region_set_obj = bb_client.load_bed(record_id) pbar.set_description(f"Processing file: {record_id}") - metadata = self._config.phc.sample.get( - namespace=self._config.config.phc.namespace, - name=self._config.config.phc.name, - tag=self._config.config.phc.tag, - sample_name=record_id, - ) + + # TODO: create different way to get metadata 
+ # metadata = self._config.phc.sample.get( + # namespace=self._config.config.phc.namespace, + # name=self._config.config.phc.name, + # tag=self._config.config.phc.tag, + # sample_name=record_id, + # ) self.upload_file_qdrant( bed_id=record_id, bed_file=bed_region_set_obj, - payload=BedPEPHubRestrict(**metadata).model_dump(), + # payload=BedPEPHubRestrict(**metadata).model_dump(), ) pbar.write(f"File: {record_id} uploaded to qdrant successfully.") pbar.update(1) From 762a0c004f3cb3a4c5bc88ddbbe7d10df2e2da11 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Sun, 3 Nov 2024 21:48:23 -0500 Subject: [PATCH 08/11] updated reindex metadata2 --- bbconf/modules/bedfiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index bb860f4..eeb362b 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -892,7 +892,7 @@ def reindex_qdrant(self) -> None: self.upload_file_qdrant( bed_id=record_id, bed_file=bed_region_set_obj, - # payload=BedPEPHubRestrict(**metadata).model_dump(), + payload={"bed_id": record_id}, ) pbar.write(f"File: {record_id} uploaded to qdrant successfully.") pbar.update(1) From 83faf920db7aefdc50457d2bb16f0b2904389a6a Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 4 Nov 2024 14:45:05 -0500 Subject: [PATCH 09/11] reindexing improvements --- bbconf/modules/bedfiles.py | 41 ++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index eeb362b..3cc2ee1 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -385,7 +385,12 @@ def get_ids_list( count = session.execute(count_statement).one() for result in bed_ids: - result_list.append(BedMetadataBasic(**result.__dict__)) + annotation = StandardMeta( + **result.annotations.__dict__ if result.annotations else {} + ) + result_list.append( + BedMetadataBasic(**result.__dict__, 
annotation=annotation) + ) return BedListResult( count=count[0], @@ -865,36 +870,28 @@ def reindex_qdrant(self) -> None: """ bb_client = BBClient() - statement = select(Bed.id).where(and_(Bed.genome_alias == QDRANT_GENOME)) + annotation_result = self.get_ids_list(limit=10, genome=QDRANT_GENOME) - with Session(self._db_engine.engine) as session: - bed_ids = session.execute(statement).all() + if not annotation_result.results: + _LOGGER.error("No bed files found.") + return None + results = annotation_result.results - bed_ids = [bed_result[0] for bed_result in bed_ids] - - with tqdm(total=len(bed_ids), position=0, leave=True) as pbar: - for record_id in bed_ids: + with tqdm(total=len(results), position=0, leave=True) as pbar: + for record in results: try: - bed_region_set_obj = GRegionSet(bb_client.seek(record_id)) + bed_region_set_obj = GRegionSet(bb_client.seek(record.id)) except FileNotFoundError: - bed_region_set_obj = bb_client.load_bed(record_id) - - pbar.set_description(f"Processing file: {record_id}") + bed_region_set_obj = bb_client.load_bed(record.id) - # TODO: create different way to get metadata - # metadata = self._config.phc.sample.get( - # namespace=self._config.config.phc.namespace, - # name=self._config.config.phc.name, - # tag=self._config.config.phc.tag, - # sample_name=record_id, - # ) + pbar.set_description(f"Processing file: {record.id}") self.upload_file_qdrant( - bed_id=record_id, + bed_id=record.id, bed_file=bed_region_set_obj, - payload={"bed_id": record_id}, + payload=record.annotation.model_dump() if record.annotation else {}, ) - pbar.write(f"File: {record_id} uploaded to qdrant successfully.") + pbar.write(f"File: {record.id} uploaded to qdrant successfully.") pbar.update(1) return None From 3b5ef50b6421eefd9f1367a74dbd47a58ad996ef Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 4 Nov 2024 14:50:55 -0500 Subject: [PATCH 10/11] updated reindex limit --- bbconf/modules/bedfiles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 3cc2ee1..38ce8e8 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -870,7 +870,7 @@ def reindex_qdrant(self) -> None: """ bb_client = BBClient() - annotation_result = self.get_ids_list(limit=10, genome=QDRANT_GENOME) + annotation_result = self.get_ids_list(limit=100000, genome=QDRANT_GENOME) if not annotation_result.results: _LOGGER.error("No bed files found.") From 75b84d2604d4ccd44bf417352be8253b3db9eb7f Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 6 Nov 2024 15:54:18 -0500 Subject: [PATCH 11/11] 1. Added get_genome_list method 2. Added get_missing_plots method 3. updated changelog 4. updated version and requirements --- bbconf/_version.py | 2 +- bbconf/bbagent.py | 11 ++++++++ bbconf/db_utils.py | 6 +++-- bbconf/models/bedset_models.py | 2 +- bbconf/modules/bedfiles.py | 42 +++++++++++++++++++++++++++++-- docs/changelog.md | 9 +++++++ manual_testing.py | 13 +++++++++- requirements/requirements-all.txt | 2 +- 8 files changed, 79 insertions(+), 8 deletions(-) diff --git a/bbconf/_version.py b/bbconf/_version.py index 8088f75..3e2f46a 100644 --- a/bbconf/_version.py +++ b/bbconf/_version.py @@ -1 +1 @@ -__version__ = "0.8.1" +__version__ = "0.9.0" diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index 029a645..1b0baab 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -62,6 +62,17 @@ def get_stats(self) -> StatsReturn: genomes_number=number_of_genomes, ) + def get_list_genomes(self) -> List[str]: + """ + Get list of genomes from the database + + :return: list of genomes + """ + statement = select(distinct(Bed.genome_alias)) + with Session(self.config.db_engine.engine) as session: + genomes = session.execute(statement).all() + return [result[0] for result in genomes] + @cached_property def list_of_licenses(self) -> List[str]: """ diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index a843e0b..d929ec8 100644 --- a/bbconf/db_utils.py +++ 
b/bbconf/db_utils.py @@ -339,7 +339,7 @@ class TokenizedBed(Base): nullable=False, ) universe_id: Mapped[str] = mapped_column( - ForeignKey("universes.id", ondelete="CASCADE"), + ForeignKey("universes.id", ondelete="CASCADE", passive_deletes=True), primary_key=True, index=True, nullable=False, @@ -350,7 +350,9 @@ class TokenizedBed(Base): bed: Mapped["Bed"] = relationship("Bed", back_populates="tokenized") universe: Mapped["Universes"] = relationship( - "Universes", back_populates="tokenized" + "Universes", + back_populates="tokenized", + passive_deletes=True, ) diff --git a/bbconf/models/bedset_models.py b/bbconf/models/bedset_models.py index 9d38234..73bcb9a 100644 --- a/bbconf/models/bedset_models.py +++ b/bbconf/models/bedset_models.py @@ -1,5 +1,5 @@ -from typing import List, Union import datetime +from typing import List, Union from pydantic import BaseModel, ConfigDict, model_validator diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 38ce8e8..f9a60a6 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -1,6 +1,6 @@ import os from logging import getLogger -from typing import Dict, Union +from typing import Dict, List, Union import numpy as np from geniml.bbclient import BBClient @@ -10,7 +10,7 @@ from pydantic import BaseModel from qdrant_client.models import Distance, PointIdsList, VectorParams from sqlalchemy import and_, delete, func, select -from sqlalchemy.orm import Session +from sqlalchemy.orm import Session, aliased from tqdm import tqdm from bbconf.config_parser.bedbaseconfig import BedBaseConfig @@ -1179,3 +1179,41 @@ def get_tokenized_link( bed_id=bed_id, universe_id=universe_id, ) + + def get_missing_plots( + self, plot_name: str, limit: int = 1000, offset: int = 0 + ) -> List[str]: + """ + Get list of bed files that are missing plot + + :param plot_name: plot name + :param limit: number of results to return + :param offset: offset to start from + + :return: list of bed file identifiers + """ + if 
plot_name not in list(BedPlots.model_fields.keys()): + raise BedBaseConfError( + f"Plot name: {plot_name} is not valid. Valid names: {list(BedPlots.model_fields.keys())}" + ) + + with Session(self._sa_engine) as session: + # Alias for subquery + t2_alias = aliased(Files) + + # Define the subquery + subquery = select(t2_alias).where(t2_alias.name == plot_name).subquery() + + query = ( + select(Bed.id) + .outerjoin(subquery, Bed.id == subquery.c.bedfile_id) + .where(subquery.c.bedfile_id.is_(None)) + .limit(limit) + .offset(offset) + ) + + results = session.scalars(query) + + results = [result for result in results] + + return results diff --git a/docs/changelog.md b/docs/changelog.md index bcf871c..1eba8d4 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,15 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +# [0.9.0] - 2024-11-06 +## Changed +- Fixed bug with uploading tss dist plot + +## Added +- Added annotations to bedsets (author, source) +- Added get_list_genomes method to the agent that lists all available genomes +- Added method that lists all missing plots for bedfiles (get_missing_plots) + # [0.8.0] - 2024-10-23 ## Changed - Updated text to bed search (now using bivec) diff --git a/manual_testing.py b/manual_testing.py index 876b22c..0386ada 100644 --- a/manual_testing.py +++ b/manual_testing.py @@ -175,9 +175,20 @@ def get_pep(): prj +def get_id_plots_missing(): + from bbconf import BedBaseAgent + + agent = BedBaseAgent(config="/home/bnt4me/virginia/repos/bedhost/config.yaml") + + results = agent.bed.get_missing_plots("gccontent", limit=5000) + print(results) + print(agent.get_list_genomes()) + + if __name__ == "__main__": # zarr_s3() # add_s3() # get_from_s3() # biocframe() - get_pep() + # get_pep() + get_id_plots_missing() diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 01f63ad..cd2e043 100644 --- 
a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,6 +1,6 @@ yacman >= 0.9.1 sqlalchemy >= 2.0.0 -geniml[ml] >= 0.5.2 +geniml[ml] >= 0.5.1 psycopg >= 3.1.15 colorlogs pydantic >= 2.9.0