Skip to content

Commit

Permalink
Merge pull request #77 from databio/dev
Browse files Browse the repository at this point in the history
Release v0.10.3
  • Loading branch information
khoroshevskyi authored Jan 16, 2025
2 parents d5f1750 + 4527eda commit 3b71183
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 10 deletions.
2 changes: 1 addition & 1 deletion bbconf/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.10.2"
__version__ = "0.10.3"
1 change: 0 additions & 1 deletion bbconf/bbagent.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging

from functools import cached_property
from pathlib import Path
from typing import List, Union
Expand Down
64 changes: 64 additions & 0 deletions bbconf/config_parser/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import yacman
from pephubclient.helpers import MessageHandler as m
from pydantic_core._pydantic_core import ValidationError

from bbconf.config_parser.models import ConfigFile
from bbconf.exceptions import BedBaseConfError
from bbconf.helpers import get_bedbase_cfg


def config_analyzer(config_path: str) -> bool:
    """
    Read the configuration file and check each section against the ConfigFile model.

    Every field declared on `ConfigFile` is built from the section of the same
    name in the configuration file. Problems are printed rather than raised.

    :param config_path: configuration file path (resolved through `get_bedbase_cfg`)
    :return: True if every section is valid; False if a required section is
        missing or a section fails validation
    """
    config_path = get_bedbase_cfg(config_path)

    print(f"Analyzing the configuration file {config_path}...")

    _config = yacman.YAMLConfigManager(filepath=config_path).exp

    is_valid = True
    for field_name, field_info in ConfigFile.model_fields.items():
        try:
            # Unpacking a non-mapping with `**` raises TypeError; presumably
            # `.get()` returns None for an absent section, which is how a
            # missing section ends up in the handler below.
            field_info.annotation(**_config.get(field_name))
        except TypeError:
            if field_info.is_required():
                print(
                    str(
                        BedBaseConfError(
                            f"`Config info: {field_name}` Field is not set in the configuration file or missing. "
                        )
                    )
                )
                # A missing required section makes the config invalid; keep
                # scanning so every missing section is reported in one run.
                is_valid = False
            else:
                print(
                    f"Config info: `{field_name}` Field is not set in the configuration file. Using default value."
                )
        except ValidationError as e:
            print(
                str(
                    BedBaseConfError(
                        f"Error in provided configuration file. Section: `{field_name}` missing values :: \n {e}"
                    )
                )
            )
            return False

    if not is_valid:
        return False

    m.print_success("Configuration file is valid! ")

    return True
76 changes: 69 additions & 7 deletions bbconf/modules/bedfiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@
import numpy as np
from geniml.bbclient import BBClient
from geniml.io import RegionSet
from geniml.search.backends import QdrantBackend
from gtars.tokenizers import RegionSet as GRegionSet
from pephubclient.exceptions import ResponseError
from pydantic import BaseModel
from qdrant_client.http.models import PointStruct
from qdrant_client.models import Distance, PointIdsList, VectorParams
from sqlalchemy import and_, delete, func, or_, select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session, aliased
from tqdm import tqdm
from qdrant_client.http.models import PointStruct

from bbconf.config_parser.bedbaseconfig import BedBaseConfig
from geniml.search.backends import QdrantBackend
from bbconf.const import DEFAULT_LICENSE, PKG_NAME, ZARR_TOKENIZED_FOLDER
from bbconf.db_utils import (
Bed,
Expand Down Expand Up @@ -674,8 +674,6 @@ def update(
"""
Update bed file to the database.
!! WARNING: this method is in development. Please, void of using it!
:param identifier: bed file identifier
:param stats: bed file results {statistics, plots, files, metadata}
:param metadata: bed file metadata (will be saved in pephub)
Expand Down Expand Up @@ -725,9 +723,18 @@ def update(
_LOGGER.info("upload_pephub set to false. Skipping pephub..")

if upload_qdrant:
self.upload_file_qdrant(
identifier, files.bed_file.path, payload=metadata.model_dump()
)
if classification.genome_alias == "hg38":
_LOGGER.info(f"Uploading bed file to qdrant.. [{identifier}]")
self.upload_file_qdrant(
identifier,
files.bed_file.path,
bed_metadata.model_dump(exclude_none=False),
)
_LOGGER.info(f"File uploaded to qdrant. {identifier}")
else:
_LOGGER.warning(
f"Could not upload to qdrant. Genome: {classification.genome_alias} is not supported."
)

with Session(self._sa_engine) as session:
bed_statement = select(Bed).where(and_(Bed.id == identifier))
Expand Down Expand Up @@ -1645,6 +1652,61 @@ def get_missing_plots(

return results

def get_missing_stats(self, limit: int = 1000, offset: int = 0) -> List[str]:
    """
    List identifiers of bed files that have no statistics yet.

    A record is treated as missing statistics when its
    `BedStats.number_of_regions` value is NULL.
    NOTE: the query has no ORDER BY, so the order of pages is left to the database.

    :param limit: maximum number of identifiers to return
    :param offset: number of matching records to skip
    :return: list of bed file identifiers
    """
    statement = select(BedStats).where(BedStats.number_of_regions.is_(None))
    statement = statement.limit(limit).offset(offset)

    with Session(self._sa_engine) as session:
        # Read the ids while the session is still open.
        return [record.id for record in session.scalars(statement)]

def get_missing_files(self, limit: int = 1000, offset: int = 0) -> List[str]:
    """
    List identifiers of bed files that have no bigBed file attached.

    NOTE: the query has no ORDER BY, so the order of pages is left to the database.

    :param limit: maximum number of identifiers to return
    :param offset: number of matching records to skip
    :return: list of bed file identifiers
    """
    # Subquery holding only the `Files` rows named "bigbed_file"
    files_alias = aliased(Files)
    bigbed_rows = (
        select(files_alias).where(files_alias.name == "bigbed_file").subquery()
    )

    # Anti-join: outer join beds to their bigbed rows, keep beds with no match
    statement = (
        select(Bed.id)
        .outerjoin(bigbed_rows, Bed.id == bigbed_rows.c.bedfile_id)
        .where(bigbed_rows.c.bedfile_id.is_(None))
        .limit(limit)
        .offset(offset)
    )

    with Session(self._sa_engine) as session:
        return list(session.scalars(statement))

def get_unprocessed(self, limit: int = 1000, offset: int = 0) -> BedListResult:
"""
Get bed files that are not processed.
Expand Down
7 changes: 7 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.

## [0.10.3] - 2025-01-16

### Added:
- Methods to list bed files that are missing statistics or files (`get_missing_stats`, `get_missing_files`)
- Config analyzer for quickly checking whether a configuration file is valid


## [0.10.2] - 2025-01-09

### Changed:
Expand Down
12 changes: 11 additions & 1 deletion manual_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,14 @@ def sql_search():
results


def config_t():
    """Manually run `config_analyzer` on a local config file and print its verdict."""
    from bbconf.config_parser.utils import config_analyzer

    print(config_analyzer("/home/bnt4me/virginia/repos/bbconf/config.yaml"))


if __name__ == "__main__":
# zarr_s3()
# add_s3()
Expand All @@ -209,4 +217,6 @@ def sql_search():
# get_pep()
# get_id_plots_missing()
# neighbour_beds()
sql_search()
# sql_search()

config_t()

0 comments on commit 3b71183

Please sign in to comment.