Skip to content

Commit

Permalink
Fix tests, add an example of an accession that should be ignored by ingest: INSDC001.1
Browse files Browse the repository at this point in the history
  • Loading branch information
anna-parker committed Sep 23, 2024
1 parent 2d4b13c commit ca0d0d9
Show file tree
Hide file tree
Showing 6 changed files with 236 additions and 125 deletions.
25 changes: 24 additions & 1 deletion ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,30 @@ rule rename_columns:
)


# Runs scripts/get_loculus_submissions.py against the generated config and
# writes two TSV files. Both files are inputs of the
# `filter_out_loculus_submissions` rule, which hands them to its script as
# --exclude-insdc-accessions / --exclude-biosample-accessions.
rule get_loculus_submissions:
    input:
        script="scripts/get_loculus_submissions.py",
        config="results/config.yaml",
    output:
        # INSDC (GenBank) accessions to exclude
        genbankAccessions="results/exclude_insdc_accessions.tsv",
        # BioSample accessions to exclude
        biosampleAccessions="results/exclude_biosample_accessions.tsv",
    params:
        log_level=LOG_LEVEL,
    shell:
        """
        python {input.script} \
        --output-insdc-accessions {output.genbankAccessions} \
        --output-biosample-accessions {output.biosampleAccessions} \
        --log-level {params.log_level} \
        --config-file {input.config} \
        """


rule filter_out_loculus_submissions:
input:
ncbi_dataset_tsv="results/metadata_post_rename.tsv",
exclude_biosample_accessions="results/exclude_biosample_accessions.tsv",
exclude_insdc_accessions="results/exclude_insdc_accessions.tsv",
script="scripts/filter_out_loculus_submissions.py",
config="results/config.yaml",
output:
Expand All @@ -108,9 +129,11 @@ rule filter_out_loculus_submissions:
"""
python {input.script} \
--input-metadata-tsv {input.ncbi_dataset_tsv} \
--output-metadata-tsv {output.metadata_tsv} \
--exclude-insdc-accessions {input.exclude_insdc_accessions} \
--exclude-biosample-accessions {input.exclude_biosample_accessions} \
--log-level {params.log_level} \
--config-file {input.config} \
--output-metadata-tsv {output.metadata_tsv} \
"""


Expand Down
150 changes: 26 additions & 124 deletions ingest/scripts/filter_out_loculus_submissions.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,9 @@
import logging
import os
from dataclasses import dataclass
from datetime import datetime
from enum import Enum

import click
import pandas as pd
import yaml
from psycopg2.extras import RealDictCursor
from psycopg2.pool import SimpleConnectionPool

logger = logging.getLogger(__name__)
logging.basicConfig(
Expand All @@ -27,106 +22,6 @@ class Config:
db_host: str


def db_init(
    db_password_default: str, db_username_default: str, db_host_default: str
) -> SimpleConnectionPool:
    """Build the connection pool for the ``loculus`` database.

    Each credential is taken from its environment variable (``DB_PASSWORD``,
    ``DB_USERNAME``, ``DB_HOST``). When a variable is unset or empty, the
    corresponding ``*_default`` argument is used instead.

    Args:
        db_password_default: Fallback password.
        db_username_default: Fallback user name.
        db_host_default: Fallback host.

    Returns:
        A ``SimpleConnectionPool`` limited to a single connection, with
        ``search_path`` set to ``ena-submission``.
    """
    # `or` falls back on both None (unset) and "" (set but empty).
    password = os.getenv("DB_PASSWORD") or db_password_default
    username = os.getenv("DB_USERNAME") or db_username_default
    host = os.getenv("DB_HOST") or db_host_default

    return SimpleConnectionPool(
        minconn=1,
        maxconn=1,  # Only allow one connection per organism
        dbname="loculus",
        user=username,
        host=host,
        password=password,
        options="-c search_path=ena-submission",
    )


class Status(Enum):
    """States used by the ``status`` field of the table-entry dataclasses.

    NOTE(review): the member names presumably match the values stored in the
    database ``STATUS`` column (the queries in this file compare against
    ``'SUBMITTED'``) -- confirm against the schema.
    """

    READY = 0
    SUBMITTING = 1
    SUBMITTED = 2
    HAS_ERRORS = 3
    WAITING = 4 # Only for assembly creation

    def __str__(self) -> str:
        """Return the bare member name, e.g. ``"READY"``."""
        return self.name


@dataclass
class SampleTableEntry:
    """Record type with one attribute per sample-table field.

    NOTE(review): presumably mirrors a row of ``sample_table``; it is not
    instantiated anywhere in the visible code -- confirm it is still needed.
    """

    # Required identifiers; every other field has a default.
    accession: str
    version: int
    errors: str | None = None
    warnings: str | None = None
    # New entries start out as READY.
    status: Status = Status.READY
    started_at: datetime | None = None
    finished_at: datetime | None = None
    result: str | None = None


@dataclass
class AssemblyTableEntry:
    """Record type with one attribute per assembly-table field.

    NOTE(review): presumably mirrors a row of ``assembly_table``; it is not
    instantiated anywhere in the visible code -- confirm it is still needed.
    """

    # Required identifiers; every other field has a default.
    accession: str
    version: int
    errors: str | None = None
    warnings: str | None = None
    # New entries start out as READY.
    status: Status = Status.READY
    started_at: datetime | None = None
    finished_at: datetime | None = None
    result: str | None = None


def get_bio_sample_accessions(db_conn_pool: SimpleConnectionPool) -> dict[str, str]:
    """Fetch the BioSample accession of every submitted sample.

    Reads ``accession`` and ``result`` from all ``sample_table`` rows whose
    status is ``SUBMITTED``.

    Args:
        db_conn_pool: Pool to borrow the connection from; the connection is
            handed back to the pool even if the query raises.

    Returns:
        Mapping from each row's ``accession`` to the ``biosample_accession``
        value stored in that row's ``result``.
    """
    query = "SELECT accession, result FROM sample_table WHERE STATUS = 'SUBMITTED'"

    connection = db_conn_pool.getconn()
    try:
        with connection, connection.cursor(cursor_factory=RealDictCursor) as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
    finally:
        db_conn_pool.putconn(connection)

    biosample_by_accession = {}
    for row in rows:
        # `result` is a jsonb column, so it is indexed like a dict.
        biosample_by_accession[row["accession"]] = row["result"]["biosample_accession"]
    return biosample_by_accession


def get_insdc_accessions(db_conn_pool: SimpleConnectionPool) -> dict[str, list[str]]:
    """Fetch the INSDC accessions of every submitted assembly.

    Reads ``accession`` and ``result`` from all ``assembly_table`` rows whose
    status is ``SUBMITTED``. From each row's ``result`` (a jsonb column),
    every value stored under a key starting with ``insdc_accession_full`` is
    kept.

    Args:
        db_conn_pool: Pool to borrow the connection from; the connection is
            handed back to the pool even if the query raises.

    Returns:
        Mapping from each row's ``accession`` to the list of matching values
        found in that row's ``result``. The list is empty when no key matches.
    """
    con = db_conn_pool.getconn()
    try:
        with con, con.cursor(cursor_factory=RealDictCursor) as cur:
            # Result is a jsonb column
            query = "SELECT accession, result FROM assembly_table WHERE STATUS = 'SUBMITTED'"

            cur.execute(query)

            results = cur.fetchall()
    finally:
        db_conn_pool.putconn(con)

    return {
        result["accession"]: [
            value
            for key, value in result["result"].items()
            if key.startswith("insdc_accession_full")
        ]
        for result in results
    }


@click.command()
@click.option(
"--log-level",
Expand All @@ -148,7 +43,24 @@ def get_insdc_accessions(db_conn_pool: SimpleConnectionPool) -> dict[str, str]:
required=True,
type=click.Path(),
)
def filter_out_loculus_submissions(log_level, config_file, input_metadata_tsv, output_metadata_tsv):
@click.option(
"--exclude-insdc-accessions",
required=True,
type=click.Path(),
)
@click.option(
"--exclude-biosample-accessions",
required=True,
type=click.Path(),
)
def filter_out_loculus_submissions(
log_level,
config_file,
input_metadata_tsv,
output_metadata_tsv,
exclude_insdc_accessions,
exclude_biosample_accessions,
):
logger.setLevel(log_level)
logging.getLogger("requests").setLevel(logging.INFO)

Expand All @@ -157,27 +69,17 @@ def filter_out_loculus_submissions(log_level, config_file, input_metadata_tsv, o
relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__}
config = Config(**relevant_config)
logger.info(f"Config: {config}")

db_config = db_init(config.db_password, config.db_username, config.db_host)
insdc_accessions_submitted_by_loculus = get_insdc_accessions(db_config)
all_insdc_accessions_submitted_by_loculus: set = {
item for sublist in insdc_accessions_submitted_by_loculus.values() for item in sublist
}
all_insdc_accessions_submitted_by_loculus.add("MZ424862.1")
logger.debug(f"Assembly accessions to filter out: {all_insdc_accessions_submitted_by_loculus}")
biosample_accessions_submitted_by_loculus = get_bio_sample_accessions(db_config)
logger.debug(
f"Biosample accessions to filter out: {biosample_accessions_submitted_by_loculus.values()}"
)

df = pd.read_csv(input_metadata_tsv, sep="\t", dtype=str, keep_default_na=False)
original_count = len(df)
with open(exclude_insdc_accessions, encoding="utf-8") as f:
loculus_insdc_accessions = [line.strip() for line in f]

with open(exclude_biosample_accessions, encoding="utf-8") as f:
loculus_biosample_accessions = [line.strip() for line in f]

filtered_df = df[~df["genbankAccession"].isin(all_insdc_accessions_submitted_by_loculus)]
filtered_df = filtered_df[
~filtered_df["biosampleAccession"].isin(biosample_accessions_submitted_by_loculus.values())
]
logger.info(f"Filtered out #: {(original_count - len(filtered_df))} sequences.")
filtered_df = df[~df["genbankAccession"].isin(loculus_insdc_accessions)]
filtered_df = filtered_df[~filtered_df["biosampleAccession"].isin(loculus_biosample_accessions)]
logger.info(f"Filtered out {(original_count - len(filtered_df))} sequences.")
filtered_df.to_csv(output_metadata_tsv, sep="\t", index=False)


Expand Down
Loading

0 comments on commit ca0d0d9

Please sign in to comment.