Skip to content

Commit

Permalink
* Map submission results to external metadata fields and upload results…
Browse files Browse the repository at this point in the history
to Loculus.
* Fix external metadata upload issue in backend, small fixes to upload function.
  • Loading branch information
anna-parker committed Aug 13, 2024
1 parent 750fc67 commit bc7da3e
Show file tree
Hide file tree
Showing 9 changed files with 185 additions and 46 deletions.
27 changes: 20 additions & 7 deletions backend/src/main/resources/db/migration/V1__init.sql
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,25 @@ create or replace aggregate jsonb_merge_agg(jsonb)
initcond = '{}'
);

-- Concatenate two jsonb values with the built-in || operator.
-- For two objects, keys present in both take the value from b.
create or replace function jsonb_concat(a jsonb, b jsonb)
returns jsonb
language sql
immutable
parallel safe
as $$
    select a || b
$$;
-- Merge two JSON objects, giving precedence to the non-null values of a.
--
-- For every key of b: if a has no such key, or a's value is JSON null, b's
-- value is used; otherwise a's value is kept. Keys that exist only in a are
-- carried over unchanged.
--
-- Returns NULL when either argument is SQL NULL (declared strict, which
-- matches what the expression produced before). An empty b ('{}') now returns
-- a unchanged: jsonb_object_agg yields NULL over zero rows, which previously
-- turned the whole result into NULL.
--
-- NOTE(review): PostgreSQL itself appears to ship pg_catalog.jsonb_concat(jsonb, jsonb)
-- (the function behind ||). With the default search_path pg_catalog is searched
-- first, so an unqualified jsonb_concat(...) call may resolve to the built-in
-- instead of this function - confirm, and schema-qualify callers if so.
create or replace function jsonb_concat(a jsonb, b jsonb)
returns jsonb
language sql
immutable
strict
parallel safe
as $$
    select a || coalesce(
        jsonb_object_agg(
            elem.key,
            case
                -- missing key (SQL NULL) or explicit JSON null in a: fall back to b
                when (a -> elem.key) is null or (a -> elem.key) = 'null'::jsonb then elem.value
                else a -> elem.key
            end
        ),
        -- no keys in b: merge with an empty object instead of NULL
        '{}'::jsonb
    )
    from jsonb_each(b) as elem(key, value)
$$;

create view all_external_metadata as
select
Expand All @@ -94,7 +107,7 @@ select
-- taking the second object's value when there are duplicate keys.
case
when all_external_metadata.external_metadata is null then jsonb_build_object('metadata', (sequence_entries_preprocessed_data.processed_data->'metadata'))
else jsonb_build_object('metadata', all_external_metadata.external_metadata || (sequence_entries_preprocessed_data.processed_data->'metadata'))
else jsonb_build_object('metadata', jsonb_concat((sequence_entries_preprocessed_data.processed_data->'metadata'), all_external_metadata.external_metadata))
end as joint_metadata
from
sequence_entries_preprocessed_data
Expand Down
43 changes: 15 additions & 28 deletions ena-submission/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,34 +18,6 @@ with open("results/config.yaml", "w") as f:
f.write(yaml.dump(config))

LOG_LEVEL = config.get("log_level", "INFO")
# Organism names: the keys of the `organisms` mapping in the loaded config.
ORGANISMS = config['organisms'].keys()


# Aggregate target: requests one results/submitted_<organism>.json per entry in ORGANISMS.
rule submit_all_external_metadata:
input:
expand("results/submitted_{organism}.json", organism=ORGANISMS)

# Run scripts/call_loculus.py in submit-external-metadata mode for one organism.
# The `-s` test means the script is only invoked when the ndjson metadata file
# exists and is non-empty; {output.submitted} is handed to it as --output-file.
# NOTE(review): nothing in this shell block creates {output.submitted} when the
# metadata file is empty - presumably Snakemake then fails on a missing output; confirm.
rule submit_external_metadata:
input:
script="scripts/call_loculus.py",
metadata="results/external_metadata_{organism}.ndjson",
config="results/config.yaml",
output:
submitted="results/submitted_{organism}.json"
params:
log_level=LOG_LEVEL,
shell:
"""
if [ -s {input.metadata} ]; then
python {input.script} \
--mode submit-external-metadata \
--organism {wildcards.organism} \
--metadata {input.metadata} \
--config-file {input.config} \
--output-file {output.submitted} \
--log-level {params.log_level}
fi
"""


rule get_ena_submission_list:
Expand Down Expand Up @@ -134,6 +106,21 @@ rule create_assembly:
sample_created=touch("results/assembly_created"),
params:
log_level=LOG_LEVEL,
shell:
"""
python {input.script} \
--config-file {input.config} \
--log-level {params.log_level} \
"""

rule upload_to_loculus:
input:
script="scripts/upload_external_metadata_to_loculus.py",
config="results/config.yaml",
output:
sample_created=touch("results/uploaded_external_metadata"),
params:
log_level=LOG_LEVEL,
shell:
"""
python {input.script} \
Expand Down
6 changes: 2 additions & 4 deletions ena-submission/scripts/call_loculus.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def make_request(


def submit_external_metadata(
metadata_file,
external_metadata: dict[str, str],
config: Config,
organism: str,
):
Expand All @@ -125,9 +125,7 @@ def submit_external_metadata(
"Content-Type": "application/x-ndjson",
}

with open(metadata_file) as file:
pre_ndjson = [x.strip() for x in file.readlines()]
data = " ".join(pre_ndjson)
data = json.dumps(external_metadata)

response = make_request(HTTPMethod.POST, url, config, data=data, headers=headers, params=params)

Expand Down
4 changes: 1 addition & 3 deletions ena-submission/scripts/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,9 +473,7 @@ def assembly_table_update(db_config, config, retry_number=3, time_threshold=5):
logger.debug("Checking state in ENA")
for row in waiting:
seq_key = {"accession": row["accession"], "version": row["version"]}
check_results: CreationResults = check_ena(
ena_config, row["result"]["erz_assembly_accession"]
)
check_results: CreationResults = check_ena(ena_config, row["result"]["erz_accession"])
_last_ena_check = time
if not check_results.results:
continue
Expand Down
2 changes: 1 addition & 1 deletion ena-submission/scripts/ena_submission_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def create_ena_assembly(config, manifest_file: str, center_name=None):
errors.append(error_message)
return CreationResults(results=None, errors=errors, warnings=warnings)
assembly_results = {
"erz_assembly_accession": erz_accession,
"erz_accession": erz_accession,
}
return CreationResults(results=assembly_results, errors=errors, warnings=warnings)

Expand Down
6 changes: 3 additions & 3 deletions ena-submission/scripts/get_ena_submission_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
@dataclass
class Config:
organisms: list[dict[str, str]]
organism: str
backend_url: str
keycloak_token_url: str
keycloak_client_id: str
Expand Down Expand Up @@ -60,9 +59,10 @@ def get_data_for_submission(config, entries, db_config, organism):
if in_submission_table(accession, version, db_config):
continue
if sum(fields) > 0:
logging.warn(
logging.warning(
f"Found sequence: {key} with ena-specific-metadata fields and not submitted by us ",
f"or {config.ingest_pipeline_submitter}. Potential user error: discarding sequence.",
f"or {config.ingest_pipeline_submitter}. ",
"Potential user error: discarding sequence.",
)
continue
item["organism"] = organism
Expand Down
134 changes: 134 additions & 0 deletions ena-submission/scripts/upload_external_metadata_to_loculus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# This script collects the results of the ENA submission and uploads the results to Loculus

import logging
from dataclasses import dataclass

import click
import yaml
from call_loculus import submit_external_metadata
from submission_db_helper import (
StatusAll,
find_conditions_in_db,
get_db_config,
update_db_where_conditions,
)

# Module-level logger; its level is overridden from --log-level once the command starts.
logger = logging.getLogger(__name__)

# Root logging setup: INFO by default, time-of-day timestamps, and the emitting
# file name / line number in every record.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%H:%M:%S",
    format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
    encoding="utf-8",
)


@dataclass
class Config:
    """Subset of the YAML config file that this script reads.

    The command fills every field from the key of the same name in the config
    file; a key that is absent from the file is set to an empty list.
    """

    # NOTE(review): annotated as a list of dicts; confirm against the actual
    # shape of `organisms` in the config file.
    organisms: list[dict[str, str]]
    organism: str
    # Connection/authentication settings; the whole Config object is handed to
    # submit_external_metadata. Presumably used for the Loculus backend and
    # Keycloak login - confirm in call_loculus.
    backend_url: str
    keycloak_token_url: str
    keycloak_client_id: str
    username: str
    password: str
    ena_specific_metadata: list[str]
    # Credentials and host passed to get_db_config for the submission database.
    db_username: str
    db_password: str
    db_host: str


def _fetch_single_result(db_config, table_name, conditions):
    """Return the ``result`` column of the single row matching ``conditions``.

    Args:
        db_config: database handle/config as returned by ``get_db_config``.
        table_name: name of the table to query.
        conditions: column/value mapping passed to ``find_conditions_in_db``.

    Raises:
        RuntimeError: if the query does not return exactly one row.
    """
    rows = find_conditions_in_db(db_config, table_name=table_name, conditions=conditions)
    if len(rows) != 1:
        msg = f"Expected exactly one row in {table_name} for {conditions}, found {len(rows)}"
        raise RuntimeError(msg)
    return rows[0]["result"]


@click.command()
@click.option(
    "--log-level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
@click.option(
    "--config-file",
    required=True,
    type=click.Path(exists=True),
)
def upload_external_metadata(log_level, config_file):
    """Continuously upload finished ENA submission results to Loculus.

    Repeatedly selects all rows of ``submission_table`` whose ``status_all`` is
    ``SUBMITTED_ALL``, collects the accessions stored in the project, sample and
    assembly tables for each of them, sends them to Loculus as external metadata
    and, on success, sets ``status_all`` to ``SENT_TO_LOCULUS``.

    Args:
        log_level: level applied to this module's logger.
        config_file: path to the YAML config file the ``Config`` fields are read from.

    Raises:
        RuntimeError: if the project, sample or assembly table does not hold
            exactly one row for an entry (this stops the loop).
        KeyError: if a stored result lacks an expected accession key.
    """
    # Function-scope import: only the polling loop below needs it.
    import time

    poll_interval_seconds = 2

    logger.setLevel(log_level)
    logging.getLogger("requests").setLevel(logging.INFO)

    with open(config_file) as file:
        full_config = yaml.safe_load(file)
    relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__}
    config = Config(**relevant_config)
    # Never write credentials to the log: mask every password-like field.
    loggable_config = {
        key: "***" if "password" in key else value for key, value in vars(config).items()
    }
    logger.info(f"Config: {loggable_config}")
    db_config = get_db_config(config.db_password, config.db_username, config.db_host)

    while True:
        # Entries whose ENA submission is complete but not yet reported to Loculus
        conditions = {"status_all": StatusAll.SUBMITTED_ALL}
        submitted_all = find_conditions_in_db(
            db_config, table_name="submission_table", conditions=conditions
        )
        for entry in submitted_all:
            accession = entry["accession"]
            organism = entry["organism"]
            group_key = {"group_id": entry["group_id"], "organism": organism}
            seq_key = {"accession": accession, "version": entry["version"]}

            external_metadata = {}
            # Project table is keyed by (group_id, organism)
            project = _fetch_single_result(db_config, "project_table", group_key)
            external_metadata["bioproject_accession"] = project["bioproject_accession"]
            # Sample and assembly tables are keyed by (accession, version)
            sample = _fetch_single_result(db_config, "sample_table", seq_key)
            external_metadata["sra_run_accession"] = sample["sra_run_accession"]
            external_metadata["biosample_accession"] = sample["biosample_accession"]
            assembly = _fetch_single_result(db_config, "assembly_table", seq_key)
            external_metadata["gca_accession"] = assembly["gca_accession"]

            data = {
                "accession": accession,
                "version": entry["version"],
                "externalMetadata": external_metadata,
            }

            try:
                submit_external_metadata(
                    data,
                    config,
                    organism,
                )

                update_values = {"status_all": StatusAll.SENT_TO_LOCULUS}
                update_db_where_conditions(
                    db_config,
                    table_name="submission_table",
                    conditions=seq_key,
                    update_values=update_values,
                )
                logger.info(f"Successfully updated external metadata for {accession}")
            except Exception:
                # Best effort per entry: keep the traceback and move on to the next
                # one; the status is unchanged, so the entry is retried next round.
                logger.exception(f"ExternalMetadata update failed for {accession}")

        # Pause between polling rounds so the loop does not hammer the database.
        time.sleep(poll_interval_seconds)


if __name__ == "__main__":
    upload_external_metadata()
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ spec:
- results/project_created
- results/sample_created
- results/assembly_created
- results/uploaded_external_metadata
volumeMounts:
- name: loculus-ena-submission-config-volume
mountPath: /package/config/config.yaml
Expand Down
8 changes: 8 additions & 0 deletions kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,14 @@ defaultOrganismConfig: &defaultOrganismConfig
header: "INSDC"
ingest: bioprojects
noInput: true
- name: gca_accession
displayName: GCA accession
customDisplay:
type: link
url: "https://www.ncbi.nlm.nih.gov/datasets/genome/__value__"
header: "INSDC"
noInput: true
oneHeader: true
- name: biosample_accession
customDisplay:
type: link
Expand Down

0 comments on commit bc7da3e

Please sign in to comment.