-- Merge two JSON objects, preferring non-null values from `a`.
--
-- For every key of `b`: keep `a`'s value unless `a` lacks the key or holds a
-- JSON null for it, in which case `b`'s value is used. Keys that exist only in
-- `a` are kept unchanged (including JSON nulls).
--
-- Returns SQL NULL when either argument is SQL NULL (same as the previous
-- `select $1 || $2` implementation). Raises if `b` is not a JSON object,
-- because jsonb_each() only accepts objects.
--
-- NOTE(review): pg_catalog also appears to ship a jsonb_concat(jsonb, jsonb)
-- (the function backing the `||` operator). Depending on search_path an
-- unqualified call may resolve to that built-in instead of this function --
-- confirm, and schema-qualify the callers if necessary.
create or replace function jsonb_concat(a jsonb, b jsonb)
returns jsonb
language plpgsql
immutable
parallel safe
as $$
declare
    merged_from_b jsonb;
begin
    -- Make the NULL propagation explicit instead of relying on `||`.
    if a is null or b is null then
        return null;
    end if;

    select jsonb_object_agg(
        elem.key,
        case
            when (a -> elem.key) is null or (a -> elem.key) = 'null'::jsonb then elem.value
            else a -> elem.key
        end
    )
    into merged_from_b
    from jsonb_each(b) as elem(key, value);

    -- jsonb_object_agg() yields SQL NULL for zero input rows (b = '{}').
    -- Without the coalesce, `a || NULL` would discard all of `a`.
    return a || coalesce(merged_from_b, '{}'::jsonb);
end;
$$;
case when all_external_metadata.external_metadata is null then jsonb_build_object('metadata', (sequence_entries_preprocessed_data.processed_data->'metadata')) - else jsonb_build_object('metadata', all_external_metadata.external_metadata || (sequence_entries_preprocessed_data.processed_data->'metadata')) + else jsonb_build_object('metadata', jsonb_concat((sequence_entries_preprocessed_data.processed_data->'metadata'), all_external_metadata.external_metadata)) end as joint_metadata from sequence_entries_preprocessed_data diff --git a/ena-submission/Snakefile b/ena-submission/Snakefile index bfc87afb49..9c490213c4 100644 --- a/ena-submission/Snakefile +++ b/ena-submission/Snakefile @@ -18,34 +18,6 @@ with open("results/config.yaml", "w") as f: f.write(yaml.dump(config)) LOG_LEVEL = config.get("log_level", "INFO") -ORGANISMS = config['organisms'].keys() - - -rule submit_all_external_metadata: - input: - expand("results/submitted_{organism}.json", organism=ORGANISMS) - -rule submit_external_metadata: - input: - script="scripts/call_loculus.py", - metadata="results/external_metadata_{organism}.ndjson", - config="results/config.yaml", - output: - submitted="results/submitted_{organism}.json" - params: - log_level=LOG_LEVEL, - shell: - """ - if [ -s {input.metadata} ]; then - python {input.script} \ - --mode submit-external-metadata \ - --organism {wildcards.organism} \ - --metadata {input.metadata} \ - --config-file {input.config} \ - --output-file {output.submitted} \ - --log-level {params.log_level} - fi - """ rule get_ena_submission_list: @@ -134,6 +106,21 @@ rule create_assembly: sample_created=touch("results/assembly_created"), params: log_level=LOG_LEVEL, + shell: + """ + python {input.script} \ + --config-file {input.config} \ + --log-level {params.log_level} \ + """ + +rule upload_to_loculus: + input: + script="scripts/upload_external_metadata_to_loculus.py", + config="results/config.yaml", + output: + sample_created=touch("results/uploaded_external_metadata"), + 
params: + log_level=LOG_LEVEL, shell: """ python {input.script} \ diff --git a/ena-submission/scripts/call_loculus.py b/ena-submission/scripts/call_loculus.py index eb5cc42585..9db73d84a1 100644 --- a/ena-submission/scripts/call_loculus.py +++ b/ena-submission/scripts/call_loculus.py @@ -106,7 +106,7 @@ def make_request( def submit_external_metadata( - metadata_file, + external_metadata: dict[str, str], config: Config, organism: str, ): @@ -125,9 +125,7 @@ def submit_external_metadata( "Content-Type": "application/x-ndjson", } - with open(metadata_file) as file: - pre_ndjson = [x.strip() for x in file.readlines()] - data = " ".join(pre_ndjson) + data = json.dumps(external_metadata) response = make_request(HTTPMethod.POST, url, config, data=data, headers=headers, params=params) diff --git a/ena-submission/scripts/create_assembly.py b/ena-submission/scripts/create_assembly.py index b2d941a7b6..f0b2fd9b1e 100644 --- a/ena-submission/scripts/create_assembly.py +++ b/ena-submission/scripts/create_assembly.py @@ -473,9 +473,7 @@ def assembly_table_update(db_config, config, retry_number=3, time_threshold=5): logger.debug("Checking state in ENA") for row in waiting: seq_key = {"accession": row["accession"], "version": row["version"]} - check_results: CreationResults = check_ena( - ena_config, row["result"]["erz_assembly_accession"] - ) + check_results: CreationResults = check_ena(ena_config, row["result"]["erz_accession"]) _last_ena_check = time if not check_results.results: continue diff --git a/ena-submission/scripts/ena_submission_helper.py b/ena-submission/scripts/ena_submission_helper.py index cbd1f30802..19a4f9f69e 100644 --- a/ena-submission/scripts/ena_submission_helper.py +++ b/ena-submission/scripts/ena_submission_helper.py @@ -281,7 +281,7 @@ def create_ena_assembly(config, manifest_file: str, center_name=None): errors.append(error_message) return CreationResults(results=None, errors=errors, warnings=warnings) assembly_results = { - "erz_assembly_accession": 
# This script collects the results of the ENA submission and uploads the results to Loculus

import logging
import time
from dataclasses import dataclass, field

import click
import yaml
from call_loculus import submit_external_metadata
from submission_db_helper import (
    StatusAll,
    find_conditions_in_db,
    get_db_config,
    update_db_where_conditions,
)

logger = logging.getLogger(__name__)
logging.basicConfig(
    encoding="utf-8",
    level=logging.INFO,
    format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ",
    datefmt="%H:%M:%S",
)

# Pause between two polls of the submission table so the endless loop below
# does not hammer the database when there is nothing to upload.
_TIME_BETWEEN_ITERATIONS_SECONDS = 2


@dataclass
class Config:
    """Subset of the pipeline configuration that this script reads."""

    organisms: list[dict[str, str]]
    organism: str
    backend_url: str
    keycloak_token_url: str
    keycloak_client_id: str
    username: str
    # repr=False keeps credentials out of the "Config: ..." log line.
    password: str = field(repr=False)
    ena_specific_metadata: list[str]
    db_username: str
    db_password: str = field(repr=False)
    db_host: str


def _single_result(db_config, table_name: str, conditions: dict) -> dict:
    """Return the ``result`` column of the single row matching ``conditions``.

    Raises:
        RuntimeError: if the table does not hold exactly one matching row.
    """
    rows = find_conditions_in_db(db_config, table_name=table_name, conditions=conditions)
    if len(rows) != 1:
        msg = f"Expected exactly one row in {table_name} for {conditions}, found {len(rows)}"
        raise RuntimeError(msg)
    return rows[0]["result"]


def _collect_external_metadata(db_config, entry) -> dict:
    """Build the external-metadata payload for one submission_table entry.

    Reads the project (by group_id and organism), sample and assembly (both by
    accession and version) tables.

    Raises:
        RuntimeError: if any of the three tables lacks a unique matching row.
        KeyError: if a stored result is missing an expected accession field.
    """
    group_key = {"group_id": entry["group_id"], "organism": entry["organism"]}
    seq_key = {"accession": entry["accession"], "version": entry["version"]}

    project = _single_result(db_config, "project_table", group_key)
    sample = _single_result(db_config, "sample_table", seq_key)
    assembly = _single_result(db_config, "assembly_table", seq_key)

    return {
        "accession": entry["accession"],
        "version": entry["version"],
        "externalMetadata": {
            "bioproject_accession": project["bioproject_accession"],
            "sra_run_accession": sample["sra_run_accession"],
            "biosample_accession": sample["biosample_accession"],
            "gca_accession": assembly["gca_accession"],
        },
    }


@click.command()
@click.option(
    "--log-level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
@click.option(
    "--config-file",
    required=True,
    type=click.Path(exists=True),
)
def upload_external_metadata(log_level, config_file):
    """Continuously upload ENA accessions of fully submitted entries to Loculus.

    Polls submission_table for entries in state SUBMITTED_ALL, submits their
    external metadata and marks them SENT_TO_LOCULUS on success. A failed
    upload is logged and retried on the next poll. The loop never returns.
    """
    logger.setLevel(log_level)
    logging.getLogger("requests").setLevel(logging.INFO)

    with open(config_file, encoding="utf-8") as file:
        full_config = yaml.safe_load(file)
        relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__}
        config = Config(**relevant_config)
    logger.info(f"Config: {config}")
    db_config = get_db_config(config.db_password, config.db_username, config.db_host)

    while True:
        # Get external metadata
        conditions = {"status_all": StatusAll.SUBMITTED_ALL}
        submitted_all = find_conditions_in_db(
            db_config, table_name="submission_table", conditions=conditions
        )
        for entry in submitted_all:
            accession = entry["accession"]
            organism = entry["organism"]
            seq_key = {"accession": accession, "version": entry["version"]}

            # Inconsistent database state is not caught here: as before, it
            # aborts the script instead of being retried forever.
            data = _collect_external_metadata(db_config, entry)

            try:
                submit_external_metadata(
                    data,
                    config,
                    organism,
                )

                update_values = {"status_all": StatusAll.SENT_TO_LOCULUS}
                update_db_where_conditions(
                    db_config,
                    table_name="submission_table",
                    conditions=seq_key,
                    update_values=update_values,
                )
            except Exception:
                # `Exception` (not a bare except) lets Ctrl-C / SystemExit stop
                # the loop; logger.exception records the traceback.
                logger.exception("ExternalMetadata update failed for %s", accession)
            else:
                logger.info("Successfully updated external metadata for %s", accession)

        time.sleep(_TIME_BETWEEN_ITERATIONS_SECONDS)


if __name__ == "__main__":
    upload_external_metadata()
gca_accession + displayName: GCA accession + customDisplay: + type: link + url: "https://www.ncbi.nlm.nih.gov/datasets/genome/__value__" + header: "INSDC" + noInput: true + oneHeader: true - name: biosample_accession customDisplay: type: link