From d400725c794ad0c0049d4472371a7f4d3534f91c Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Wed, 18 Sep 2024 18:38:29 +0200 Subject: [PATCH] feat(ena-submission): Add code to submit sequences to ENA and return results to Loculus (#2417) * Add code to create project object for submission, submit to ena and keep state in database. * Create sample.xml objects to send to ENA using PHAG4E's metadata field mapping. Keep state in sample_table. Send slack notifications if submission fails. * Create assembly, add webin-cli to docker container, add call to API to check status of assembly submission. * Map submission results to external metadata fields and upload results to Loculus. * Fix external metadata upload issue in backend, add additional flyway version. * add gcaAccession field to values.yaml * use connection pooling for db requests and explicitly rollback if execute fails * Prevent SQL injection with table_name and column name validation * Update docs, add details of ena-submission to README (taking from all previous PRs). * Do not store all data from get-released-data call * Send slack notification when step fails. 
--------- Co-authored-by: Cornelius Roemer --- .github/workflows/e2e-k3d.yml | 2 +- .github/workflows/ena-submission-tests.yaml | 37 ++ backend/README.md | 2 + .../db/migration/V1.3__update_view.sql | 46 ++ ena-submission/Dockerfile | 16 +- ena-submission/ENA_submission.md | 31 +- ena-submission/README.md | 155 ++++- ena-submission/Snakefile | 100 +++- ena-submission/config/config.yaml | 50 +- ena-submission/config/defaults.yaml | 69 ++- ena-submission/environment.yml | 1 + .../flyway/sql/V1.1__add_center_name.sql | 2 + ena-submission/scripts/call_loculus.py | 96 ++-- ena-submission/scripts/create_assembly.py | 533 ++++++++++++++++++ ena-submission/scripts/create_project.py | 388 +++++++++++++ ena-submission/scripts/create_sample.py | 438 ++++++++++++++ .../scripts/ena_submission_helper.py | 430 ++++++++++++++ ena-submission/scripts/ena_types.py | 247 ++++++++ .../scripts/get_ena_submission_list.py | 55 +- ena-submission/scripts/notifications.py | 34 +- .../scripts/submission_db_helper.py | 460 ++++++++++++--- ena-submission/scripts/test_ena_submission.py | 271 +++++++++ .../scripts/trigger_submission_to_ena.py | 46 +- .../upload_external_metadata_to_loculus.py | 225 ++++++++ ...=> approved_ena_submission_list_test.json} | 9 +- ena-submission/test/test_project_response.xml | 8 + ena-submission/test/test_sample_request.xml | 72 +++ ena-submission/test/test_sample_response.xml | 10 + ena-submission/test/text_project_request.xml | 22 + .../templates/ena-submission-deployment.yaml | 16 +- kubernetes/loculus/values.yaml | 19 +- kubernetes/loculus/values_preview_server.yaml | 7 +- 32 files changed, 3665 insertions(+), 232 deletions(-) create mode 100644 .github/workflows/ena-submission-tests.yaml create mode 100644 backend/src/main/resources/db/migration/V1.3__update_view.sql create mode 100644 ena-submission/flyway/sql/V1.1__add_center_name.sql create mode 100644 ena-submission/scripts/create_assembly.py create mode 100644 ena-submission/scripts/create_project.py create 
mode 100644 ena-submission/scripts/create_sample.py create mode 100644 ena-submission/scripts/ena_submission_helper.py create mode 100644 ena-submission/scripts/ena_types.py create mode 100644 ena-submission/scripts/test_ena_submission.py create mode 100644 ena-submission/scripts/upload_external_metadata_to_loculus.py rename ena-submission/test/{ena_submission_list.json => approved_ena_submission_list_test.json} (99%) create mode 100644 ena-submission/test/test_project_response.xml create mode 100644 ena-submission/test/test_sample_request.xml create mode 100644 ena-submission/test/test_sample_response.xml create mode 100644 ena-submission/test/text_project_request.xml diff --git a/.github/workflows/e2e-k3d.yml b/.github/workflows/e2e-k3d.yml index c217e691b..704cea4f1 100644 --- a/.github/workflows/e2e-k3d.yml +++ b/.github/workflows/e2e-k3d.yml @@ -34,7 +34,7 @@ jobs: env: ALL_BROWSERS: ${{ github.ref == 'refs/heads/main' || github.event.inputs.all_browsers && 'true' || 'false' }} sha: ${{ github.event.pull_request.head.sha || github.sha }} - wait_timeout: ${{ github.ref == 'refs/heads/main' && 900 || 180 }} + wait_timeout: ${{ github.ref == 'refs/heads/main' && 900 || 240 }} steps: - name: Shorten sha run: echo "sha=${sha::7}" >> $GITHUB_ENV diff --git a/.github/workflows/ena-submission-tests.yaml b/.github/workflows/ena-submission-tests.yaml new file mode 100644 index 000000000..3cb5c8114 --- /dev/null +++ b/.github/workflows/ena-submission-tests.yaml @@ -0,0 +1,37 @@ +name: ena-submission-tests +on: + # test + pull_request: + paths: + - "ena-submission/**" + - ".github/workflows/ena-submission-tests.yml" + push: + branches: + - main + workflow_dispatch: +concurrency: + group: ci-${{ github.ref == 'refs/heads/main' && github.run_id || github.ref }}-ena-submission-tests + cancel-in-progress: true +jobs: + unitTests: + name: Unit Tests + runs-on: codebuild-loculus-ci-${{ github.run_id }}-${{ github.run_attempt }} + timeout-minutes: 15 + steps: + - uses: 
actions/checkout@v4 + - name: Set up micromamba + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: ena-submission/environment.yml + micromamba-version: 'latest' + init-shell: >- + bash + powershell + cache-environment: true + post-cleanup: 'all' + - name: Run tests + run: | + micromamba activate loculus-ena-submission + python3 scripts/test_ena_submission.py + shell: micromamba-shell {0} + working-directory: ena-submission \ No newline at end of file diff --git a/backend/README.md b/backend/README.md index 1bce19eb0..bcbef607c 100644 --- a/backend/README.md +++ b/backend/README.md @@ -71,6 +71,8 @@ The service listens, by default, to **port 8079**: 'metadata')) + else jsonb_build_object('metadata', (sequence_entries_preprocessed_data.processed_data->'metadata') || all_external_metadata.external_metadata) + end as joint_metadata +from + sequence_entries_preprocessed_data + left join all_external_metadata on + all_external_metadata.accession = sequence_entries_preprocessed_data.accession + and all_external_metadata.version = sequence_entries_preprocessed_data.version + and sequence_entries_preprocessed_data.pipeline_version = (select version from current_processing_pipeline); + +create view sequence_entries_view as +select + se.*, + sepd.started_processing_at, + sepd.finished_processing_at, + sepd.processed_data as processed_data, + sepd.processed_data || em.joint_metadata as joint_metadata, + sepd.errors, + sepd.warnings, + case + when se.released_at is not null then 'APPROVED_FOR_RELEASE' + when se.is_revocation then 'AWAITING_APPROVAL' + when sepd.processing_status = 'IN_PROCESSING' then 'IN_PROCESSING' + when sepd.processing_status = 'HAS_ERRORS' then 'HAS_ERRORS' + when sepd.processing_status = 'FINISHED' then 'AWAITING_APPROVAL' + else 'RECEIVED' + end as status +from + sequence_entries se + left join sequence_entries_preprocessed_data sepd on + se.accession = sepd.accession + and se.version = sepd.version + and sepd.pipeline_version = (select 
version from current_processing_pipeline) + left join external_metadata_view em on + se.accession = em.accession + and se.version = em.version; \ No newline at end of file diff --git a/ena-submission/Dockerfile b/ena-submission/Dockerfile index 550931d9c..158c528d2 100644 --- a/ena-submission/Dockerfile +++ b/ena-submission/Dockerfile @@ -1,5 +1,14 @@ FROM mambaorg/micromamba:1.5.8 +# Install dependencies needed for webin-cli +USER root +RUN apt-get update && apt-get install -y \ + default-jre \ + wget \ + && rm -rf /var/lib/apt/lists/* +RUN mkdir -p /package && chown -R $MAMBA_USER:$MAMBA_USER /package +USER $MAMBA_USER + COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/env.yaml COPY --chown=$MAMBA_USER:$MAMBA_USER .mambarc /tmp/.mambarc @@ -10,6 +19,11 @@ RUN micromamba config set extract_threads 1 \ # Set the environment variable to activate the conda environment ARG MAMBA_DOCKERFILE_ACTIVATE=1 -COPY --chown=$MAMBA_USER:$MAMBA_USER . /package +ENV WEBIN_CLI_VERSION 7.3.1 +USER root +RUN wget -q "https://github.com/enasequence/webin-cli/releases/download/${WEBIN_CLI_VERSION}/webin-cli-${WEBIN_CLI_VERSION}.jar" -O /package/webin-cli.jar +USER $MAMBA_USER + +COPY --chown=$MAMBA_USER:$MAMBA_USER . /package WORKDIR /package \ No newline at end of file diff --git a/ena-submission/ENA_submission.md b/ena-submission/ENA_submission.md index fae2d240d..d12501922 100644 --- a/ena-submission/ENA_submission.md +++ b/ena-submission/ENA_submission.md @@ -35,7 +35,7 @@ We require the following components: - Analysis: An analysis contains secondary analysis results derived from sequence reads (e.g. a genome assembly). -At the time of writing (October 2023), in contrast to ENA, Pathoplexus has no hierarchy of study/sample/sequence: every sequence is its own study and sample. 
Therefore we need to figure out how to map sequences to projects, each submitter could have exactly _one_ study pre organism (this is the approach we are currently taking), or each sequence could be associated with its own study. +In contrast to ENA, Pathoplexus has no hierarchy of study/sample/sequence. Therefore we have decided to create _one_ study per Loculus submission group and organism. For each Loculus sample we will create one sample (with metadata) and one sequence (with the sequence). ### Mapping sequences and studies @@ -277,7 +277,34 @@ The following could be implement as post-MVP features: -password YYYYYY ``` -5. Save accession numbers (these will be returned by the webin-cli) +5. Save ERZ accession numbers (these will be returned by the webin-cli) +6. Wait to receive GCA accession numbers (returned later after assignment by NCBI). This can be retrieved via https://wwwdev.ebi.ac.uk/ena/submit/report/swagger-ui/index.html + +``` +curl -X 'GET' \ + 'https://www.ebi.ac.uk/ena/submit/report/analysis-process/{erz_accession}?format=json&max-results=100' \ + -H 'accept: */*' \ + -H 'Authorization: Basic KEY' +``` + +When processing is finished the response should look like: + +``` +[ + { + "report": { + "id": "{erz_accession}", + "analysisType": "SEQUENCE_ASSEMBLY", + "acc": "chromosomes:OZ076380-OZ076381,genome:GCA_964187725.1", + "processingStatus": "COMPLETED", + "processingStart": "14-06-2024 05:07:40", + "processingEnd": "14-06-2024 05:08:19", + "processingError": null + }, + "links": [] + } +] +``` ## Promises made to ENA diff --git a/ena-submission/README.md b/ena-submission/README.md index 1ebc22c22..aaf488a6c 100644 --- a/ena-submission/README.md +++ b/ena-submission/README.md @@ -1,11 +1,94 @@ -## ENA Submission +# ENA Submission -### Developing Locally +## Snakemake Rules -The ENA submission pod creates a new schema in the loculus DB, this is managed by flyway. This means to develop locally you will have to start the postgres DB locally e.g. 
by using the ../deploy.py script or using +### get_ena_submission_list + +This rule runs daily in a cron job, it calls the loculus backend (`get-released-data`), obtains a new list of sequences that are ready for submission to ENA and sends this list as a compressed json file to our slack channel. Sequences are ready for submission IF: + +- data in state APPROVED_FOR_RELEASE: +- data must be state "OPEN" for use +- data must not already exist in ENA or be in the submission process, this means: + - data was not submitted by the `config.ingest_pipeline_submitter` + - data is not in the `ena-submission.submission_table` + - as an extra check we discard all sequences with `ena-specific-metadata` fields + +### all + +This rule runs in the ena-submission pod, it runs the following rules in parallel: + +#### trigger_submission_to_ena + +Download file in `github_url` every 30s. If data is not in submission table already (and not a revision) upload data to `ena-submission.submission_table`. + +#### create_project + +In a loop: + +- Get sequences in `submission_table` in state READY_TO_SUBMIT + - if (there exists an entry in the project_table for the corresponding (group_id, organism)): + - if (entry is in status SUBMITTED): update `submission_table` to SUBMITTED_PROJECT. + - else: update submission_table to SUBMITTING_PROJECT. + - else: create project entry in `project_table` for (group_id, organism). +- Get sequences in `submission_table` in state SUBMITTING_PROJECT + - if (corresponding `project_table` entry is in state SUBMITTED): update entries to state SUBMITTED_PROJECT. +- Get sequences in `project_table` in state READY, prepare submission object, set status to SUBMITTING + - if (submission succeeds): set status to SUBMITTED and fill in results: the result of a successful submission is `bioproject_accession` and an ena-internal `ena_submission_accession`. 
+ - else: set status to HAS_ERRORS and fill in errors +- Get sequences in `project_table` in state HAS_ERRORS for over 15min and sequences in status SUBMITTING for over 15min: send slack notification + +#### create_sample + +Maps loculus metadata to ena metadata using template: https://www.ebi.ac.uk/ena/browser/view/ERC000033 + +In a loop + +- Get sequences in `submission_table` in state SUBMITTED_PROJECT + - if (there exists an entry in the `sample_table` for the corresponding (accession, version)): + - if (entry is in status SUBMITTED): update `submission_table` to SUBMITTED_SAMPLE. + - else: update submission_table to SUBMITTING_SAMPLE. + - else: create sample entry in `sample_table` for (accession, version). +- Get sequences in `submission_table` in state SUBMITTING_SAMPLE + - if (corresponding `sample_table` entry is in state SUBMITTED): update entries to state SUBMITTED_SAMPLE. +- Get sequences in `sample_table` in state READY, prepare submission object, set status to SUBMITTING + - if (submission succeeds): set status to SUBMITTED and fill in results, the results of a successful submission are an `sra_run_accession` (starting with ERS) , a `biosample_accession` (starting with SAM) and an ena-internal `ena_submission_accession`. + - else: set status to HAS_ERRORS and fill in errors +- Get sequences in `sample_table` in state HAS_ERRORS for over 15min and sequences in status SUBMITTING for over 15min: send a slack notification + +#### create_assembly + +In a loop: + +- Get sequences in `submission_table` in state SUBMITTED_SAMPLE + - if (there exists an entry in the `assembly_table` for the corresponding (accession, version)): + - if (entry is in status SUBMITTED): update `assembly_table` to SUBMITTED_ASSEMBLY. + - else: update `assembly_table` to SUBMITTING_ASSEMBLY. + - else: create assembly entry in `assembly_table` for (accession, version). 
+- Get sequences in `submission_table` in state SUBMITTING_SAMPLE + - if (corresponding `assembly_table` entry is in state SUBMITTED): update entries to state SUBMITTED_ASSEMBLY. +- Get sequences in `assembly_table` in state READY, prepare files: we need chromosome_list, fasta files and a manifest file, set status to WAITING + - if (submission succeeds): set status to WAITING and fill in results: ena-internal `erz_accession` + - else: set status to HAS_ERRORS and fill in errors +- Get sequences in `assembly_table` in state WAITING, every 5minutes (to not overload ENA) check if ENA has processed the assemblies and assigned them `gca_accession`. If so update the table to status SUBMITTED and fill in results +- Get sequences in `assembly_table` in state HAS_ERRORS for over 15min and sequences in status SUBMITTING for over 15min, or in state WAITING for over 48hours: send slack notification + +#### upload_to_loculus + +- Get sequences in `submission_table` state SUBMITTED_ALL. +- Get the results of all the submissions (from all other tables) +- Create a POST request to the submit-external-metadata with the results in the expected format. + - if (successful): set sequences to state SENT_TO_LOCULUS + - else: set sequences to state HAS_ERRORS_EXT_METADATA_UPLOAD +- Get sequences in `submission_table` in state HAS_ERRORS_EXT_METADATA_UPLOAD for over 15min and sequences in status SUBMITTED_ALL for over 15min: send slack notification + +## Developing Locally + +### Database + +The ENA submission service creates a new schema in the Loculus Postgres DB, managed by flyway. To develop locally you will have to start the postgres DB locally e.g. 
by using the `../deploy.py` script or using ```sh - docker run -d \ +docker run -d \ --name loculus_postgres \ -e POSTGRES_DB=loculus \ -e POSTGRES_USER=postgres \ @@ -14,34 +97,82 @@ The ENA submission pod creates a new schema in the loculus DB, this is managed b postgres:latest ``` -In our kubernetes pod we run flyway in a docker container, however when running locally it is best to [download the flyway CLI](https://documentation.red-gate.com/fd/command-line-184127404.html). +### Install and run flyway -You can then run flyway using the +In our kubernetes pod we run flyway in a docker container, however when running locally it is best to [download the flyway CLI](https://documentation.red-gate.com/fd/command-line-184127404.html) (or `brew install flyway` on macOS). -``` - flyway -user=postgres -password=unsecure -url=jdbc:postgresql://127.0.0.1:5432/loculus -schemas=ena-submission -locations=filesystem:./sql migrate +You can then create the schema using the following command: + +```sh +flyway -user=postgres -password=unsecure -url=jdbc:postgresql://127.0.0.1:5432/loculus -schemas=ena-submission -locations=filesystem:./flyway/sql migrate ``` If you want to test the docker image locally. It can be built and run using the commands: -``` +```sh docker build -t ena-submission-flyway . docker run -it -e FLYWAY_URL=jdbc:postgresql://127.0.0.1:5432/loculus -e FLYWAY_USER=postgres -e FLYWAY_PASSWORD=unsecure ena-submission-flyway flyway migrate ``` +### Setting up micromamba environment + +
+ + Setting up micromamba + The rest of the ena-submission pod uses micromamba: -```bash +```sh brew install micromamba micromamba shell init --shell zsh --root-prefix=~/micromamba source ~/.zshrc ``` +
+ Then activate the loculus-ena-submission environment -```bash -micromamba create -f environment.yml --platform osx-64 --rc-file .mambarc +```sh +micromamba create -f environment.yml --rc-file .mambarc micromamba activate loculus-ena-submission ``` +### Using ENA's webin-cli + +In order to submit assemblies you will also need to install ENA's `webin-cli.jar`. Their [webpage](https://ena-docs.readthedocs.io/en/latest/submit/general-guide/webin-cli.html) offers more instructions. This pipeline has been tested with `WEBIN_CLI_VERSION=7.3.1`. + +```sh +wget -q "https://github.com/enasequence/webin-cli/releases/download/${WEBIN_CLI_VERSION}/webin-cli-${WEBIN_CLI_VERSION}.jar" -O /package/webin-cli.jar +``` + +### Running snakemake + Then run snakemake using `snakemake` or `snakemake {rule}`. + +## Testing + +### Run tests + +```sh +micromamba activate loculus-ena-submission +python3 scripts/test_ena_submission.py +``` + +### Testing submission locally + +ENA-submission currently is only triggered after manual approval. + +The `get_ena_submission_list` runs as a cron-job. It queries Loculus for new sequences to submit to ENA (these are sequences that are in state OPEN, were not submitted by the INSDC_INGEST_USER, do not include ena external_metadata fields and are not yet in the submission_table of the ena-submission schema). If it finds new sequences it sends a notification to slack with all sequences. + +It is then the reviewer's turn to review these sequences. [TODO: define review criteria] If these sequences meet our criteria they should be uploaded to [pathoplexus/ena-submission](https://github.com/pathoplexus/ena-submission/blob/main/approved/approved_ena_submission_list.json) (currently we read data from the [test folder](https://github.com/pathoplexus/ena-submission/blob/main/test/approved_ena_submission_list.json) - but this will be changed to the `approved` folder in production). 
The `trigger_submission_to_ena` rule is constantly checking this folder for new sequences and adding them to the submission_table if they are not already there. Note we cannot yet handle revisions so these should not be added to the approved list [TODO: do not allow submission of revised sequences in `trigger_submission_to_ena`]- revisions will still have to be performed manually. + +If you would like to test `trigger_submission_to_ena` while running locally you can also use the `trigger_submission_to_ena_from_file` rule, this will read in data from `results/approved_ena_submission_list.json` (see the test folder for an example). You can also upload data to the [test folder](https://github.com/pathoplexus/ena-submission/blob/main/test/approved_ena_submission_list.json) - note that if you add fake data with a non-existent group-id the project creation will fail, additionally the `upload_to_loculus` rule will fail if these sequences do not actually exist in your loculus instance. + +All other rules query the `submission_table` for projects/samples and assemblies to submit. Once successful they add accessions to the `results` column in dictionary format. Finally, once the entire process has succeeded the new external metadata will be uploaded to Loculus. + +Note that ENA's dev server does not always finish processing and you might not receive a gcaAccession for your dev submissions. If you would like to test the full submission cycle on the ENA dev instance it makes sense to manually alter the gcaAccession in the database using `ERZ24784470`. You can connect to a preview instance via port forwarding to these changes on local database tool such as pgAdmin: + +1. Apply the preview `~/.kube/config` +2. Find the database POD using `kubectl get pods -A | grep database` +3. Connect via port-forwarding `kubectl port-forward $POD -n $NAMESPACE 5432:5432` +4. 
If necessary find password using `kubectl get secret` diff --git a/ena-submission/Snakefile b/ena-submission/Snakefile index 88371dbc6..26536690e 100644 --- a/ena-submission/Snakefile +++ b/ena-submission/Snakefile @@ -18,33 +18,15 @@ with open("results/config.yaml", "w") as f: f.write(yaml.dump(config)) LOG_LEVEL = config.get("log_level", "INFO") -ORGANISMS = config['organisms'].keys() -rule submit_all_external_metadata: - input: - expand("results/submitted_{organism}.json", organism=ORGANISMS) - -rule submit_external_metadata: + +rule all: input: - script="scripts/call_loculus.py", - metadata="results/external_metadata_{organism}.ndjson", - config="results/config.yaml", - output: - submitted="results/submitted_{organism}.json" - params: - log_level=LOG_LEVEL, - shell: - """ - if [ -s {input.metadata} ]; then - python {input.script} \ - --mode submit-external-metadata \ - --organism {wildcards.organism} \ - --metadata {input.metadata} \ - --config-file {input.config} \ - --output-file {output.submitted} \ - --log-level {params.log_level} - fi - """ + triggered="results/triggered", + project_created="results/project_created", + sample_created="results/sample_created", + assembly_created="results/assembly_created", + uploaded_external_metadata="results/uploaded_external_metadata", rule get_ena_submission_list: @@ -63,6 +45,7 @@ rule get_ena_submission_list: --log-level {params.log_level} \ """ + rule trigger_submission_to_ena: input: script="scripts/trigger_submission_to_ena.py", @@ -78,6 +61,7 @@ rule trigger_submission_to_ena: --log-level {params.log_level} \ """ + rule trigger_submission_to_ena_from_file: # for testing input: script="scripts/trigger_submission_to_ena.py", @@ -93,4 +77,68 @@ rule trigger_submission_to_ena_from_file: # for testing --config-file {input.config} \ --input-file {input.input_file} \ --log-level {params.log_level} \ - """ \ No newline at end of file + """ + + +rule create_project: + input: + script="scripts/create_project.py", + 
config="results/config.yaml", + output: + project_created=touch("results/project_created"), + params: + log_level=LOG_LEVEL, + shell: + """ + python {input.script} \ + --config-file {input.config} \ + --log-level {params.log_level} \ + """ + + +rule create_sample: + input: + script="scripts/create_sample.py", + config="results/config.yaml", + output: + sample_created=touch("results/sample_created"), + params: + log_level=LOG_LEVEL, + shell: + """ + python {input.script} \ + --config-file {input.config} \ + --log-level {params.log_level} \ + """ + + +rule create_assembly: + input: + script="scripts/create_assembly.py", + config="results/config.yaml", + output: + sample_created=touch("results/assembly_created"), + params: + log_level=LOG_LEVEL, + shell: + """ + python {input.script} \ + --config-file {input.config} \ + --log-level {params.log_level} \ + """ + + +rule upload_to_loculus: + input: + script="scripts/upload_external_metadata_to_loculus.py", + config="results/config.yaml", + output: + sample_created=touch("results/uploaded_external_metadata"), + params: + log_level=LOG_LEVEL, + shell: + """ + python {input.script} \ + --config-file {input.config} \ + --log-level {params.log_level} \ + """ diff --git a/ena-submission/config/config.yaml b/ena-submission/config/config.yaml index 98bc4331e..a06c97c7f 100644 --- a/ena-submission/config/config.yaml +++ b/ena-submission/config/config.yaml @@ -13,6 +13,7 @@ organisms: - M - S taxon_id: 3052518 + scientific_name: "Orthonairovirus haemorrhagiae" organismName: "Crimean-Congo Hemorrhagic Fever Virus" externalMetadata: - externalMetadataUpdater: ena @@ -75,10 +76,53 @@ organisms: - externalMetadataUpdater: ena name: sraRunAccession type: string - ebola-sudan: + ebola-zaire: ingest: - taxon_id: 3052460 - organismName: "Ebola Sudan" + taxon_id: 186538 + scientific_name: "Orthoebolavirus zairense" + organismName: "Ebola Zaire" + externalMetadata: + - externalMetadataUpdater: ena + name: ncbiReleaseDate + type: date + - 
externalMetadataUpdater: ena + name: ncbiUpdateDate + type: date + - externalMetadataUpdater: ena + name: ncbiSubmitterCountry + type: string + - externalMetadataUpdater: ena + name: insdcAccessionBase + type: string + - externalMetadataUpdater: ena + name: insdcVersion + type: int + - externalMetadataUpdater: ena + name: insdcAccessionFull + type: string + - externalMetadataUpdater: ena + name: bioprojectAccession + type: string + - externalMetadataUpdater: ena + name: biosampleAccession + type: string + - externalMetadataUpdater: ena + name: ncbiSourceDb + type: string + - externalMetadataUpdater: ena + name: ncbiVirusName + type: string + - externalMetadataUpdater: ena + name: ncbiVirusTaxId + type: int + - externalMetadataUpdater: ena + name: sraRunAccession + type: string + west-nile: + ingest: + scientific_name: "West Nile virus" + taxon_id: 3048448 + organismName: "West Nile Virus" externalMetadata: - externalMetadataUpdater: ena name: ncbiReleaseDate diff --git a/ena-submission/config/defaults.yaml b/ena-submission/config/defaults.yaml index 6058b5d50..5c1f9eb1c 100644 --- a/ena-submission/config/defaults.yaml +++ b/ena-submission/config/defaults.yaml @@ -2,6 +2,69 @@ username: external_metadata_updater password: external_metadata_updater keycloak_client_id: backend-client ingest_pipeline_submitter: insdc_ingest_user -github_username: fake_username -github_pat: fake_pat -github_url: https://api.github.com/repos/pathoplexus/ena-submission/contents/test/approved_ena_submission_list.json?ref=main +db_name: Loculus +unique_project_suffix: Loculus +ena_submission_username: fake-user +ena_submission_password: fake-password +ena_submission_url: https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit # TODO(https://github.com/loculus-project/loculus/issues/2425): update in production +github_url: https://raw.githubusercontent.com/pathoplexus/ena-submission/main/test/approved_ena_submission_list.json # TODO(https://github.com/loculus-project/loculus/issues/2425): update 
in production +ena_reports_service_url: https://wwwdev.ebi.ac.uk/ena/submit/report # TODO(https://github.com/loculus-project/loculus/issues/2425): update in production +#ena_checklist: ERC000033 - do not use until all fields are mapped to ENA accepted options +metadata_mapping: + 'subject exposure': + loculus_fields: [exposureEvent] + 'type exposure': + loculus_fields: [exposureEvent] + hospitalisation: + loculus_fields: [hostHealthState] + function: match + args: [Hospital] + 'illness symptoms': + loculus_fields: [signsAndSymptoms] + 'collection date': + loculus_fields: [sampleCollectionDate] + 'geographic location (country and/or sea)': + loculus_fields: [geoLocCountry] + 'geographic location (region and locality)': + loculus_fields: [geoLocAdmin1] + 'sample capture status': + loculus_fields: [purposeOfSampling] + 'host disease outcome': + loculus_fields: [hostHealthOutcome] + 'host common name': + loculus_fields: [hostNameCommon] + 'host age': + loculus_fields: [hostAge] + 'host health state': + loculus_fields: [hostHealthState] + 'host sex': + loculus_fields: [hostGender] + 'host scientific name': + loculus_fields: [hostNameScientific] + 'isolate': + loculus_fields: [specimenCollectorSampleId] + 'collecting institution': + loculus_fields: [sequencedByOrganization, authorAffiliations] + 'receipt date': + loculus_fields: [sampleReceivedDate] + 'isolation source host-associated': + loculus_fields: [anatomicalMaterial, anatomicalPart, bodyProduct] + 'isolation source non-host-associated': + loculus_fields: [environmentalSite, environmentalMaterial] + 'authors': + loculus_fields: [authors] + 'geographic location (latitude)': + loculus_fields: [geoLocLatitude] + units: DD + 'geographic location (longitude)': + loculus_fields: [geoLocLongitude] + units: DD +metadata_mapping_mandatory_field_defaults: + isolate: "not provided" + "collecting institution": "not provided" + "collector name": "not provided" + "host scientific name": "not provided" + "host sex": "not 
provided" + "host health state": "not provided" + "host subject id": "not provided" + "host common name": "not provided" diff --git a/ena-submission/environment.yml b/ena-submission/environment.yml index 5105f5cff..e439d9da0 100644 --- a/ena-submission/environment.yml +++ b/ena-submission/environment.yml @@ -15,3 +15,4 @@ dependencies: - unzip - psycopg2 - slack_sdk + - xmltodict diff --git a/ena-submission/flyway/sql/V1.1__add_center_name.sql b/ena-submission/flyway/sql/V1.1__add_center_name.sql new file mode 100644 index 000000000..cfc6feadf --- /dev/null +++ b/ena-submission/flyway/sql/V1.1__add_center_name.sql @@ -0,0 +1,2 @@ +ALTER TABLE submission_table ADD center_name text; +ALTER TABLE project_table ADD center_name text; \ No newline at end of file diff --git a/ena-submission/scripts/call_loculus.py b/ena-submission/scripts/call_loculus.py index e9c4dcc09..df0dda936 100644 --- a/ena-submission/scripts/call_loculus.py +++ b/ena-submission/scripts/call_loculus.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from http import HTTPMethod from pathlib import Path -from typing import Any, List +from typing import Any import click import jsonlines @@ -28,7 +28,7 @@ class Config: username: str password: str group_name: str - ena_specific_metadata: List[str] + ena_specific_metadata: list[str] def backend_url(config: Config) -> str: @@ -41,9 +41,7 @@ def organism_url(config: Config, organism: str) -> str: def get_jwt(config: Config) -> str: - """ - Get a JWT token for the given username and password - """ + """Get a JWT token for the given username and password""" external_metadata_updater_password = os.getenv("EXTERNAL_METADATA_UPDATER_PASSWORD") if not external_metadata_updater_password: @@ -59,60 +57,55 @@ def get_jwt(config: Config) -> str: keycloak_token_url = config.keycloak_token_url - response = requests.post(keycloak_token_url, data=data, headers=headers) + response = requests.post(keycloak_token_url, data=data, headers=headers, timeout=60) 
response.raise_for_status() jwt_keycloak = response.json() - jwt = jwt_keycloak["access_token"] - return jwt + return jwt_keycloak["access_token"] -def make_request( +def make_request( # noqa: PLR0913, PLR0917 method: HTTPMethod, url: str, config: Config, - headers: dict[str, str] | None = None, + headers: dict[str, str] = {}, params: dict[str, Any] | None = None, files: dict[str, Any] | None = None, json_body: dict[str, Any] | None = None, data: str | None = None, ) -> requests.Response: - """ - Generic request function to handle repetitive tasks like fetching JWT and setting headers. - """ + """Generic request function to handle repetitive tasks like fetching JWT and setting headers.""" jwt = get_jwt(config) - if headers: - headers["Authorization"] = f"Bearer {jwt}" - else: - headers = {"Authorization": f"Bearer {jwt}"} + headers["Authorization"] = f"Bearer {jwt}" match method: case HTTPMethod.GET: - response = requests.get(url, headers=headers, params=params) + response = requests.get(url, headers=headers, params=params, timeout=60) case HTTPMethod.POST: if files: headers.pop("Content-Type") # Remove content-type for multipart/form-data - response = requests.post(url, headers=headers, files=files, data=params) + response = requests.post(url, headers=headers, files=files, data=params, timeout=60) else: response = requests.post( - url, headers=headers, json=json_body, params=params, data=data + url, headers=headers, json=json_body, params=params, data=data, timeout=60 ) case _: - raise ValueError(f"Unsupported HTTP method: {method}") + msg = f"Unsupported HTTP method: {method}" + raise ValueError(msg) if not response.ok: - response.raise_for_status() + msg = f"Error: {response.status_code} - {response.text}" + raise requests.exceptions.HTTPError(msg) + return response def submit_external_metadata( - metadata_file, + external_metadata: dict[str, str], config: Config, organism: str, -): - """ - Submit metadata to Loculus. 
- """ +) -> requests.Response: + """Submit metadata to Loculus.""" endpoint: str = "submit-external-metadata" url = f"{organism_url(config, organism)}/{endpoint}" @@ -125,19 +118,42 @@ def submit_external_metadata( "Content-Type": "application/x-ndjson", } - with open(metadata_file) as file: - pre_ndjson = [x.strip() for x in file.readlines()] - data = " ".join(pre_ndjson) + data = json.dumps(external_metadata) response = make_request(HTTPMethod.POST, url, config, data=data, headers=headers, params=params) if not response.ok: - response.raise_for_status() + msg = f"Error: {response.status_code} - {response.text}" + raise requests.exceptions.HTTPError(msg) return response -def get_released_data(config: Config, organism: str) -> dict[str, Any]: +def get_group_info(config: Config, group_id: int) -> dict[str, Any]: + """Get group info given id""" + + # TODO: only get a list of released accessionVersions and compare with submission DB. + url = f"{backend_url(config)}/groups/{group_id}" + + headers = {"Content-Type": "application/json"} + + response = make_request(HTTPMethod.GET, url, config, headers=headers) + + entries: list[dict[str, Any]] = [] + try: + entries = list(jsonlines.Reader(response.iter_lines()).iter()) + except jsonlines.Error as err: + response_summary = response.text + if len(response_summary) > 100: + response_summary = response_summary[:50] + "\n[..]\n" + response_summary[-50:] + logger.error(f"Error decoding JSON from /groups/{group_id}: {response_summary}") + raise ValueError from err + + return entries + + +# TODO: Better return type, Any is too broad +def fetch_released_entries(config: Config, organism: str) -> dict[str, Any]: """Get sequences that are ready for release""" # TODO: only get a list of released accessionVersions and compare with submission DB. 
@@ -146,9 +162,6 @@ def get_released_data(config: Config, organism: str) -> dict[str, Any]:
    headers = {"Content-Type": "application/json"}

    response = make_request(HTTPMethod.GET, url, config, headers=headers)
-    if not response.ok:
-        logger.error(response.json())
-        response.raise_for_status()

    entries: list[dict[str, Any]] = []
    try:
@@ -160,7 +173,14 @@ def get_released_data(config: Config, organism: str) -> dict[str, Any]:
        logger.error(f"Error decoding JSON from /get-released-data: {response_summary}")
        raise ValueError() from err

-    data_dict: dict[str, Any] = {rec["metadata"]["accessionVersion"]: rec for rec in entries}
+    # Only keep unalignedNucleotideSequences and metadata
+    data_dict: dict[str, Any] = {
+        rec["metadata"]["accessionVersion"]: {
+            "metadata": rec["metadata"],
+            "unalignedNucleotideSequences": rec["unalignedNucleotideSequences"],
+        }
+        for rec in entries
+    }
    return data_dict


@@ -227,7 +247,7 @@ def record_factory(*args, **kwargs):

    logging.setLogRecordFactory(record_factory)

-    with open(config_file) as file:
+    with open(config_file, encoding="utf-8") as file:
        full_config = yaml.safe_load(file)
    relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__}
    config = Config(**relevant_config)
@@ -242,7 +262,7 @@ def record_factory(*args, **kwargs):

    if mode == "get-released-data":
        logger.info("Getting released sequences")
-        response = get_released_data(config, organism, remove_if_has_metadata)
+        response = fetch_released_entries(config, organism)
        if response:
            Path(output_file).write_text(json.dumps(response), encoding="utf-8")
    else:
diff --git a/ena-submission/scripts/create_assembly.py b/ena-submission/scripts/create_assembly.py
new file mode 100644
index 000000000..b2d37323e
--- /dev/null
+++ b/ena-submission/scripts/create_assembly.py
@@ -0,0 +1,533 @@
+import json
+import logging
+import time
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+
+import click
+import pytz
+import 
yaml +from ena_submission_helper import ( + CreationResults, + check_ena, + create_chromosome_list, + create_ena_assembly, + create_fasta, + create_manifest, + get_ena_config, +) +from ena_types import ( + AssemblyChromosomeListFile, + AssemblyChromosomeListFileObject, + AssemblyManifest, + AssemblyType, + ChromosomeType, + MoleculeType, +) +from notifications import SlackConfig, send_slack_notification, slack_conn_init +from psycopg2.pool import SimpleConnectionPool +from submission_db_helper import ( + AssemblyTableEntry, + Status, + StatusAll, + add_to_assembly_table, + db_init, + find_conditions_in_db, + find_errors_in_db, + find_waiting_in_db, + update_db_where_conditions, +) + +logger = logging.getLogger(__name__) +logging.basicConfig( + encoding="utf-8", + level=logging.INFO, + format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ", + datefmt="%H:%M:%S", +) + + +@dataclass +class Config: + organisms: list[dict[str, str]] + backend_url: str + keycloak_token_url: str + keycloak_client_id: str + username: str + password: str + db_username: str + db_password: str + db_host: str + db_name: str + unique_project_suffix: str + ena_submission_url: str + ena_submission_password: str + ena_submission_username: str + ena_reports_service_url: str + slack_hook: str + slack_token: str + slack_channel_id: str + + +def create_chromosome_list_object( + unaligned_sequences: dict[str, str], seq_key: dict[str, str] +) -> str: + # Use https://www.ebi.ac.uk/ena/browser/view/GCA_900094155.1?show=chromosomes as a template + # Use https://www.ebi.ac.uk/ena/browser/view/GCA_000854165.1?show=chromosomes for multi-segment + + chromosome_type = ChromosomeType.SEGMENTED + + entries: list[AssemblyChromosomeListFileObject] = [] + + if len(unaligned_sequences.keys()) > 1: + for segment_name, item in unaligned_sequences.items(): + if item: # Only list sequenced segments + entry = AssemblyChromosomeListFileObject( + 
object_name=f"{seq_key["accession"]}.{seq_key["version"]}_{segment_name}", + chromosome_name=segment_name, + chromosome_type=chromosome_type, + ) + entries.append(entry) + else: + entry = AssemblyChromosomeListFileObject( + object_name=f"{seq_key["accession"]}.{seq_key["version"]}", + chromosome_name="main", + chromosome_type=chromosome_type, + ) + entries.append(entry) + + return AssemblyChromosomeListFile(chromosomes=entries) + + +def create_manifest_object( + config: Config, + sample_table_entry: dict[str, str], + project_table_entry: dict[str, str], + submission_table_entry: dict[str, str], + seq_key: dict[str, str], + group_key: dict[str, str], + test=False, +) -> AssemblyManifest: + sample_accession = sample_table_entry["result"]["ena_sample_accession"] + study_accession = project_table_entry["result"]["bioproject_accession"] + + metadata = submission_table_entry["metadata"] + unaligned_nucleotide_sequences = submission_table_entry["unaligned_nucleotide_sequences"] + organism_metadata = config.organisms[group_key["organism"]]["ingest"] + chromosome_list_object = create_chromosome_list_object(unaligned_nucleotide_sequences, seq_key) + chromosome_list_file = create_chromosome_list(list_object=chromosome_list_object) + fasta_file = create_fasta( + unaligned_sequences=unaligned_nucleotide_sequences, + chromosome_list=chromosome_list_object, + ) + program = ( + metadata["sequencingInstrument"] if metadata.get("sequencingInstrument") else "Unknown" + ) + platform = metadata["sequencingProtocol"] if metadata.get("sequencingProtocol") else "Unknown" + try: + coverage = ( + ( + int(metadata["depthOfCoverage"]) + if int(metadata["depthOfCoverage"]) == float(metadata["depthOfCoverage"]) + else float(metadata["depthOfCoverage"]) + ) + if metadata.get("depthOfCoverage") + else 1 + ) + except ValueError: + coverage = 1 + try: + moleculetype = ( + MoleculeType(metadata["moleculeType"]) + if organism_metadata.get("moleculeType") + else None + ) + except ValueError: + 
moleculetype = None + description = ( + f"Original sequence submitted to {config.db_name} with accession: " + f"{seq_key["accession"]}, version: {seq_key["version"]}" + ) + assembly_name = ( + seq_key["accession"] + + f"{datetime.now(tz=pytz.utc)}".replace(" ", "_").replace("+", "_").replace(":", "_") + if test # This is the alias that needs to be unique + else seq_key["accession"] + ) + + return AssemblyManifest( + study=study_accession, + sample=sample_accession, + assemblyname=assembly_name, + assembly_type=AssemblyType.ISOLATE, + coverage=coverage, + program=program, + platform=platform, + fasta=fasta_file, + chromosome_list=chromosome_list_file, + description=description, + moleculetype=moleculetype, + ) + + +def submission_table_start(db_config: SimpleConnectionPool): + """ + 1. Find all entries in submission_table in state SUBMITTED_SAMPLE + 2. If (exists an entry in the assembly_table for (accession, version)): + a. If (in state SUBMITTED) update state in submission_table to SUBMITTED_ALL + b. Else update state to SUBMITTING_ASSEMBLY + 3. Else create corresponding entry in assembly_table + """ + conditions = {"status_all": StatusAll.SUBMITTED_SAMPLE} + ready_to_submit = find_conditions_in_db( + db_config, table_name="submission_table", conditions=conditions + ) + if len(ready_to_submit) > 0: + logging.debug( + f"Found {len(ready_to_submit)} entries in submission_table in status SUBMITTED_SAMPLE" + ) + for row in ready_to_submit: + seq_key = {"accession": row["accession"], "version": row["version"]} + + # 1. 
check if there exists an entry in the assembly_table for seq_key + corresponding_assembly = find_conditions_in_db( + db_config, table_name="assembly_table", conditions=seq_key + ) + if len(corresponding_assembly) == 1: + if corresponding_assembly[0]["status"] == str(Status.SUBMITTED): + update_values = {"status_all": StatusAll.SUBMITTED_ALL} + update_db_where_conditions( + db_config, + table_name="submission_table", + conditions=seq_key, + update_values=update_values, + ) + else: + update_values = {"status_all": StatusAll.SUBMITTING_ASSEMBLY} + update_db_where_conditions( + db_config, + table_name="submission_table", + conditions=seq_key, + update_values=update_values, + ) + else: + # If not: create assembly_entry, change status to SUBMITTING_ASSEMBLY + assembly_table_entry = AssemblyTableEntry(**seq_key) + succeeded = add_to_assembly_table(db_config, assembly_table_entry) + if succeeded: + update_values = {"status_all": StatusAll.SUBMITTING_ASSEMBLY} + update_db_where_conditions( + db_config, + table_name="submission_table", + conditions=seq_key, + update_values=update_values, + ) + + +def submission_table_update(db_config: SimpleConnectionPool): + """ + 1. Find all entries in submission_table in state SUBMITTING_ASSEMBLY + 2. If (exists an entry in the assembly_table for (accession, version)): + a. If (in state SUBMITTED) update state in submission_table to SUBMITTED_ALL + 3. 
Else throw Error + """ + conditions = {"status_all": StatusAll.SUBMITTING_ASSEMBLY} + submitting_assembly = find_conditions_in_db( + db_config, table_name="submission_table", conditions=conditions + ) + if len(submitting_assembly) > 0: + logger.debug( + f"Found {len(submitting_assembly)} entries in submission_table in" + " status SUBMITTING_ASSEMBLY" + ) + for row in submitting_assembly: + seq_key = {"accession": row["accession"], "version": row["version"]} + + corresponding_assembly = find_conditions_in_db( + db_config, table_name="assembly_table", conditions=seq_key + ) + if len(corresponding_assembly) == 1 and corresponding_assembly[0]["status"] == str( + Status.SUBMITTED + ): + update_values = {"status_all": StatusAll.SUBMITTED_ALL} + update_db_where_conditions( + db_config, + table_name="submission_table", + conditions=seq_key, + update_values=update_values, + ) + if len(corresponding_assembly) == 0: + error_msg = ( + "Entry in submission_table in status SUBMITTING_ASSEMBLY", + " with no corresponding assembly", + ) + raise RuntimeError(error_msg) + + +def assembly_table_create(db_config: SimpleConnectionPool, config: Config, retry_number: int = 3): + """ + 1. Find all entries in assembly_table in state READY + 2. Create temporary files: chromosome_list_file, fasta_file, manifest_file + 3. Update assembly_table to state SUBMITTING (only proceed if update succeeds) + 4. If (create_ena_assembly succeeds): update state to SUBMITTED with results + 3. 
Else update state to HAS_ERRORS with error messages + """ + ena_config = get_ena_config( + config.ena_submission_username, + config.ena_submission_password, + config.ena_submission_url, + config.ena_reports_service_url, + ) + conditions = {"status": Status.READY} + ready_to_submit_assembly = find_conditions_in_db( + db_config, table_name="assembly_table", conditions=conditions + ) + if len(ready_to_submit_assembly) > 0: + logger.debug( + f"Found {len(ready_to_submit_assembly)} entries in assembly_table in status READY" + ) + for row in ready_to_submit_assembly: + seq_key = {"accession": row["accession"], "version": row["version"]} + sample_data_in_submission_table = find_conditions_in_db( + db_config, table_name="submission_table", conditions=seq_key + ) + if len(sample_data_in_submission_table) == 0: + error_msg = f"Entry {row["accession"]} not found in submitting_table" + raise RuntimeError(error_msg) + group_key = { + "group_id": sample_data_in_submission_table[0]["group_id"], + "organism": sample_data_in_submission_table[0]["organism"], + } + center_name = sample_data_in_submission_table[0]["center_name"] + + results_in_sample_table = find_conditions_in_db( + db_config, table_name="sample_table", conditions=seq_key + ) + if len(results_in_sample_table) == 0: + error_msg = f"Entry {row["accession"]} not found in sample_table" + raise RuntimeError(error_msg) + + results_in_project_table = find_conditions_in_db( + db_config, table_name="project_table", conditions=group_key + ) + if len(results_in_project_table) == 0: + error_msg = f"Entry {row["accession"]} not found in project_table" + raise RuntimeError(error_msg) + + manifest_object = create_manifest_object( + config, + results_in_sample_table[0], + results_in_project_table[0], + sample_data_in_submission_table[0], + seq_key, + group_key, + test=True, # TODO(https://github.com/loculus-project/loculus/issues/2425): remove in production + ) + manifest_file = create_manifest(manifest_object) + + update_values = 
{"status": Status.SUBMITTING} + number_rows_updated = update_db_where_conditions( + db_config, + table_name="assembly_table", + conditions=seq_key, + update_values=update_values, + ) + if number_rows_updated != 1: + # state not correctly updated - do not start submission + logger.warning( + "assembly_table: Status update from READY to SUBMITTING failed " + "- not starting submission." + ) + continue + logger.info(f"Starting assembly creation for accession {row["accession"]}") + assembly_creation_results: CreationResults = create_ena_assembly( + ena_config, manifest_file, center_name=center_name + ) + if assembly_creation_results.results: + update_values = { + "status": Status.WAITING, + "result": json.dumps(assembly_creation_results.results), + } + number_rows_updated = 0 + tries = 0 + while number_rows_updated != 1 and tries < retry_number: + if tries > 0: + logger.warning( + f"Assembly created but DB update failed - reentry DB update #{tries}." + ) + number_rows_updated = update_db_where_conditions( + db_config, + table_name="assembly_table", + conditions=seq_key, + update_values=update_values, + ) + tries += 1 + if number_rows_updated == 1: + logger.info( + f"Assembly submission for accession {row["accession"]} succeeded! - waiting for ENA accession" + ) + else: + update_values = { + "status": Status.HAS_ERRORS, + "errors": json.dumps(assembly_creation_results.errors), + } + number_rows_updated = 0 + tries = 0 + while number_rows_updated != 1 and tries < retry_number: + if tries > 0: + logger.warning( + f"Assembly creation failed and DB update failed - reentry DB update #{tries}." + ) + number_rows_updated = update_db_where_conditions( + db_config, + table_name="assembly_table", + conditions=seq_key, + update_values=update_values, + ) + tries += 1 + + +_last_ena_check: datetime | None = None + + +def assembly_table_update( + db_config: SimpleConnectionPool, config: Config, retry_number: int = 3, time_threshold: int = 5 +): + """ + - time_threshold (minutes) + 1. 
Find all entries in assembly_table in state WAITING + 2. If over time_threshold since last check, check if accession exists in ENA + 3. If (exists): update state to SUBMITTED with results + """ + global _last_ena_check # noqa: PLW0602 + ena_config = get_ena_config( + config.ena_submission_username, + config.ena_submission_password, + config.ena_submission_url, + config.ena_reports_service_url, + ) + conditions = {"status": Status.WAITING} + waiting = find_conditions_in_db(db_config, table_name="assembly_table", conditions=conditions) + if len(waiting) > 0: + logger.debug(f"Found {len(waiting)} entries in assembly_table in status READY") + # Check if ENA has assigned an accession, don't do this too frequently + time = datetime.now(tz=pytz.utc) + if not _last_ena_check or time - timedelta(minutes=time_threshold) > _last_ena_check: + logger.debug("Checking state in ENA") + for row in waiting: + seq_key = {"accession": row["accession"], "version": row["version"]} + check_results: CreationResults = check_ena(ena_config, row["result"]["erz_accession"]) + _last_ena_check = time + if not check_results.results: + continue + + update_values = { + "status": Status.SUBMITTED, + "result": json.dumps(check_results.results), + "finished_at": datetime.now(tz=pytz.utc), + } + number_rows_updated = 0 + tries = 0 + while number_rows_updated != 1 and tries < retry_number: + if tries > 0: + logger.warning( + f"Assembly in ENA but DB update failed - reentry DB update #{tries}." + ) + number_rows_updated = update_db_where_conditions( + db_config, + table_name="assembly_table", + conditions=seq_key, + update_values=update_values, + ) + tries += 1 + if number_rows_updated == 1: + logger.info( + f"Assembly submission for accession {row["accession"]} succeeded and accession returned!" 
+    )
+
+
+def assembly_table_handle_errors(
+    db_config: SimpleConnectionPool,
+    config: Config,
+    slack_config: SlackConfig,
+    time_threshold: int = 15,
+    time_threshold_waiting: int = 48,
+    slack_time_threshold: int = 12,
+):
+    """
+    - time_threshold: (minutes)
+    - time_threshold_waiting: (hours)
+    - slack_time_threshold: (hours)
+    1. Find all entries in assembly_table in state HAS_ERRORS or SUBMITTING over time_threshold
+    2. If time since last slack_notification is over slack_time_threshold send notification
+    """
+    entries_with_errors = find_errors_in_db(
+        db_config, "assembly_table", time_threshold=time_threshold
+    )
+    if len(entries_with_errors) > 0:
+        error_msg = (
+            f"{config.backend_url}: ENA Submission pipeline found {len(entries_with_errors)} entries"
+            f" in assembly_table in status HAS_ERRORS or SUBMITTING for over {time_threshold}m"
+        )
+        send_slack_notification(
+            error_msg,
+            slack_config,
+            time=datetime.now(tz=pytz.utc),
+            time_threshold=slack_time_threshold,
+        )
+    # TODO: Query ENA to check if assembly has in fact been created
+    # If created update assembly_table
+    # If not retry 3 times, then raise for manual intervention
+    entries_waiting = find_waiting_in_db(
+        db_config, "assembly_table", time_threshold=time_threshold_waiting
+    )
+    if len(entries_waiting) > 0:
+        error_msg = (
+            f"ENA Submission pipeline found {len(entries_waiting)} entries in assembly_table in"
+            f" status WAITING for over {time_threshold_waiting}h"
+        )
+        send_slack_notification(
+            error_msg, slack_config, time=datetime.now(tz=pytz.utc), time_threshold=slack_time_threshold
+        )
+
+
+@click.command()
+@click.option(
+    "--log-level",
+    default="INFO",
+    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
+)
+@click.option(
+    "--config-file",
+    required=True,
+    type=click.Path(exists=True),
+)
+def create_assembly(log_level, config_file):
+    logger.setLevel(log_level)
+    logging.getLogger("requests").setLevel(logging.INFO)
+
+    with open(config_file) as file:
+        
full_config = yaml.safe_load(file) + relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__} + config = Config(**relevant_config) + logger.info(f"Config: {config}") + + db_config = db_init(config.db_password, config.db_username, config.db_host) + slack_config = slack_conn_init( + slack_hook_default=config.slack_hook, + slack_token_default=config.slack_token, + slack_channel_id_default=config.slack_channel_id, + ) + + while True: + submission_table_start(db_config) + submission_table_update(db_config) + + assembly_table_create(db_config, config, retry_number=3) + assembly_table_update(db_config, config) + assembly_table_handle_errors(db_config, config, slack_config) + time.sleep(2) + + +if __name__ == "__main__": + create_assembly() diff --git a/ena-submission/scripts/create_project.py b/ena-submission/scripts/create_project.py new file mode 100644 index 000000000..564dbf3f7 --- /dev/null +++ b/ena-submission/scripts/create_project.py @@ -0,0 +1,388 @@ +import json +import logging +import time +from dataclasses import dataclass +from datetime import datetime + +import click +import pytz +import yaml +from call_loculus import get_group_info +from ena_submission_helper import CreationResults, create_ena_project, get_ena_config +from ena_types import ( + OrganismType, + ProjectLink, + ProjectLinks, + ProjectSet, + ProjectType, + SubmissionProject, + XmlAttribute, + XrefType, +) +from notifications import SlackConfig, send_slack_notification, slack_conn_init +from psycopg2.pool import SimpleConnectionPool +from submission_db_helper import ( + ProjectTableEntry, + Status, + StatusAll, + add_to_project_table, + db_init, + find_conditions_in_db, + find_errors_in_db, + update_db_where_conditions, +) + +logger = logging.getLogger(__name__) +logging.basicConfig( + encoding="utf-8", + level=logging.INFO, + format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ", + datefmt="%H:%M:%S", +) + + +@dataclass +class Config: + 
organisms: dict[str, dict[str, str]]
+    backend_url: str
+    keycloak_token_url: str
+    keycloak_client_id: str
+    username: str
+    password: str
+    db_username: str
+    db_password: str
+    db_host: str
+    db_name: str
+    unique_project_suffix: str
+    ena_submission_url: str
+    ena_submission_password: str
+    ena_submission_username: str
+    ena_reports_service_url: str
+    slack_hook: str
+    slack_token: str
+    slack_channel_id: str
+
+
+def construct_project_set_object(
+    group_info: dict[str, str],
+    config: Config,
+    entry: dict[str, str],
+    test=False,
+) -> ProjectSet:
+    """
+    Construct project set object, using:
+    - entry in project_table
+    - group_info of corresponding group_id
+    - config information, such as ingest metadata for that organism
+
+    If test=True add a timestamp to the alias suffix to allow for multiple
+    submissions of the same project for testing.
+    (ENA blocks multiple submissions with the same alias)
+    """
+    metadata_dict = config.organisms[entry["organism"]]["ingest"]
+    if test:
+        alias = XmlAttribute(
+            f"{entry["group_id"]}:{entry["organism"]}:{config.unique_project_suffix}:{datetime.now(tz=pytz.utc)}"
+        )  # TODO(https://github.com/loculus-project/loculus/issues/2425): remove in production
+    else:
+        alias = XmlAttribute(
+            f"{entry["group_id"]}:{entry["organism"]}:{config.unique_project_suffix}"
+        )
+
+    project_type = ProjectType(
+        center_name=XmlAttribute(group_info["institution"]),
+        alias=alias,
+        name=metadata_dict["scientific_name"],
+        title=f"{metadata_dict["scientific_name"]}: Genome sequencing",
+        description=(
+            f"Automated upload of {metadata_dict["scientific_name"]} sequences submitted by {group_info["institution"]} from {config.db_name}"  # noqa: E501
+        ),
+        submission_project=SubmissionProject(
+            organism=OrganismType(
+                taxon_id=metadata_dict["taxon_id"],
+                scientific_name=metadata_dict["scientific_name"],
+            )
+        ),
+        project_links=ProjectLinks(
+            project_link=ProjectLink(xref_link=XrefType(db=config.db_name, id=entry["group_id"]))
+        ),
+    )
+    return 
ProjectSet(project=[project_type]) + + +def submission_table_start(db_config: SimpleConnectionPool): + """ + 1. Find all entries in submission_table in state READY_TO_SUBMIT + 2. If (exists an entry in the project_table for (group_id, organism)): + a. If (in state SUBMITTED) update state in submission_table to SUBMITTED_PROJECT + b. Else update state to SUBMITTING_PROJECT + 3. Else create corresponding entry in project_table + """ + conditions = {"status_all": StatusAll.READY_TO_SUBMIT} + ready_to_submit = find_conditions_in_db( + db_config, table_name="submission_table", conditions=conditions + ) + logger.debug( + f"Found {len(ready_to_submit)} entries in submission_table in status READY_TO_SUBMIT" + ) + for row in ready_to_submit: + group_key = {"group_id": row["group_id"], "organism": row["organism"]} + seq_key = {"accession": row["accession"], "version": row["version"]} + + # Check if there exists an entry in the project table for (group_id, organism) + corresponding_project = find_conditions_in_db( + db_config, table_name="project_table", conditions=group_key + ) + if len(corresponding_project) == 1: + if corresponding_project[0]["status"] == str(Status.SUBMITTED): + update_values = { + "status_all": StatusAll.SUBMITTED_PROJECT, + "center_name": corresponding_project[0]["center_name"], + } + update_db_where_conditions( + db_config, + table_name="submission_table", + conditions=seq_key, + update_values=update_values, + ) + else: + update_values = {"status_all": StatusAll.SUBMITTING_PROJECT} + update_db_where_conditions( + db_config, + table_name="submission_table", + conditions=seq_key, + update_values=update_values, + ) + else: + # If not: create project_entry, change status to SUBMITTING_PROJECT + entry = { + "group_id": row["group_id"], + "organism": row["organism"], + } + project_table_entry = ProjectTableEntry(**entry) + succeeded = add_to_project_table(db_config, project_table_entry) + if succeeded: + update_values = {"status_all": 
StatusAll.SUBMITTING_PROJECT} + update_db_where_conditions( + db_config, + table_name="submission_table", + conditions=seq_key, + update_values=update_values, + ) + + +def submission_table_update(db_config: SimpleConnectionPool): + """ + 1. Find all entries in submission_table in state SUBMITTING_PROJECT + 2. If (exists an entry in the project_table for (group_id, organism)): + a. If (in state SUBMITTED) update state in submission_table to SUBMITTED_PROJECT + 3. Else throw Error + """ + conditions = {"status_all": StatusAll.SUBMITTING_PROJECT} + submitting_project = find_conditions_in_db( + db_config, table_name="submission_table", conditions=conditions + ) + logger.debug( + ( + f"Found {len(submitting_project)} entries in submission_table in", + " status SUBMITTING_PROJECT", + ) + ) + for row in submitting_project: + group_key = {"group_id": row["group_id"], "organism": row["organism"]} + seq_key = {"accession": row["accession"], "version": row["version"]} + + # 1. check if there exists an entry in the project table for (group_id, organism) + corresponding_project = find_conditions_in_db( + db_config, table_name="project_table", conditions=group_key + ) + if len(corresponding_project) == 1 and corresponding_project[0]["status"] == str( + Status.SUBMITTED + ): + update_values = { + "status_all": StatusAll.SUBMITTED_PROJECT, + "center_name": corresponding_project[0]["center_name"], + } + update_db_where_conditions( + db_config, + table_name="submission_table", + conditions=seq_key, + update_values=update_values, + ) + if len(corresponding_project) == 0: + error_msg = ( + "Entry in submission_table in status SUBMITTING_PROJECT", + " with no corresponding project", + ) + raise RuntimeError(error_msg) + + +def project_table_create(db_config: SimpleConnectionPool, config: Config, retry_number: int = 3): + """ + 1. Find all entries in project_table in state READY + 2. Create project_set: get_group_info from loculus, use entry and config for other fields + 3. 
Update project_table to state SUBMITTING (only proceed if update succeeds) + 4. If (create_ena_project succeeds): update state to SUBMITTED with results + 3. Else update state to HAS_ERRORS with error messages + """ + ena_config = get_ena_config( + config.ena_submission_username, + config.ena_submission_password, + config.ena_submission_url, + config.ena_reports_service_url, + ) + conditions = {"status": Status.READY} + ready_to_submit_project = find_conditions_in_db( + db_config, table_name="project_table", conditions=conditions + ) + logger.debug(f"Found {len(ready_to_submit_project)} entries in project_table in status READY") + for row in ready_to_submit_project: + group_key = {"group_id": row["group_id"], "organism": row["organism"]} + + try: + group_info = get_group_info(config, row["group_id"])[0]["group"] + except Exception as e: + logger.error(f"Was unable to get group info for group: {row["group_id"]}, {e}") + continue + + project_set = construct_project_set_object(group_info, config, row, test=True) + update_values = { + "status": Status.SUBMITTING, + "started_at": datetime.now(tz=pytz.utc), + "center_name": group_info["institution"], + } + number_rows_updated = update_db_where_conditions( + db_config, + table_name="project_table", + conditions=group_key, + update_values=update_values, + ) + if number_rows_updated != 1: + # state not correctly updated - do not start submission + logger.warning( + ( + "Project_table: Status update from READY to SUBMITTING failed ", + "- not starting submission.", + ) + ) + continue + logger.info(f"Starting Project creation for group_id {row["group_id"]}") + project_creation_results: CreationResults = create_ena_project(ena_config, project_set) + if project_creation_results.results: + update_values = { + "status": Status.SUBMITTED, + "result": json.dumps(project_creation_results.results), + "finished_at": datetime.now(tz=pytz.utc), + } + number_rows_updated = 0 + tries = 0 + while number_rows_updated != 1 and tries < 
retry_number: + if tries > 0: + # If state not correctly added retry + logger.warning( + f"Project created but DB update failed - reentry DB update #{tries}." + ) + number_rows_updated = update_db_where_conditions( + db_config, + table_name="project_table", + conditions=group_key, + update_values=update_values, + ) + tries += 1 + if number_rows_updated == 1: + logger.info(f"Project creation for group_id {row["group_id"]} succeeded!") + else: + update_values = { + "status": Status.HAS_ERRORS, + "errors": json.dumps(project_creation_results.errors), + "started_at": datetime.now(tz=pytz.utc), + } + number_rows_updated = 0 + tries = 0 + while number_rows_updated != 1 and tries < retry_number: + if tries > 0: + # If state not correctly added retry + logger.warning( + f"Project creation failed and DB update failed - reentry DB update #{tries}." + ) + number_rows_updated = update_db_where_conditions( + db_config, + table_name="project_table", + conditions=group_key, + update_values=update_values, + ) + tries += 1 + + +def project_table_handle_errors( + db_config: SimpleConnectionPool, + config: Config, + slack_config: SlackConfig, + time_threshold: int = 15, + slack_time_threshold: int = 12, +): + """ + - time_threshold: (minutes) + - slack_time_threshold: (hours) + + 1. Find all entries in project_table in state HAS_ERRORS or SUBMITTING over time_threshold + 2. 
If time since last slack_notification is over slack_time_threshold send notification + """ + entries_with_errors = find_errors_in_db( + db_config, "project_table", time_threshold=time_threshold + ) + if len(entries_with_errors) > 0: + error_msg = ( + f"{config.backend_url}: ENA Submission pipeline found {len(entries_with_errors)} entries in project_table in " + f" in project_table in status HAS_ERRORS or SUBMITTING for over {time_threshold}m" + ) + send_slack_notification( + error_msg, + slack_config, + time=datetime.now(tz=pytz.utc), + time_threshold=slack_time_threshold, + ) + # TODO: Query ENA to check if project has in fact been created + # If created update project_table + # If not retry 3 times, then raise for manual intervention + + +@click.command() +@click.option( + "--log-level", + default="INFO", + type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), +) +@click.option( + "--config-file", + required=True, + type=click.Path(exists=True), +) +def create_project(log_level, config_file): + logger.setLevel(log_level) + logging.getLogger("requests").setLevel(logging.INFO) + + with open(config_file, encoding="utf-8") as file: + full_config = yaml.safe_load(file) + relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__} + config = Config(**relevant_config) + logger.info(f"Config: {config}") + + db_config = db_init(config.db_password, config.db_username, config.db_host) + slack_config = slack_conn_init( + slack_hook_default=config.slack_hook, + slack_token_default=config.slack_token, + slack_channel_id_default=config.slack_channel_id, + ) + + while True: + submission_table_start(db_config) + submission_table_update(db_config) + + project_table_create(db_config, config) + project_table_handle_errors(db_config, config, slack_config) + time.sleep(2) + + +if __name__ == "__main__": + create_project() diff --git a/ena-submission/scripts/create_sample.py b/ena-submission/scripts/create_sample.py new file mode 100644 index 
000000000..82f9dad5d --- /dev/null +++ b/ena-submission/scripts/create_sample.py @@ -0,0 +1,438 @@ +import json +import logging +import re +import time +from dataclasses import dataclass +from datetime import datetime + +import click +import pytz +import yaml +from ena_submission_helper import CreationResults, create_ena_sample, get_ena_config +from ena_types import ( + ProjectLink, + SampleAttribute, + SampleAttributes, + SampleLinks, + SampleName, + SampleSetType, + SampleType, + XmlAttribute, + XrefType, +) +from notifications import SlackConfig, send_slack_notification, slack_conn_init +from psycopg2.pool import SimpleConnectionPool +from submission_db_helper import ( + SampleTableEntry, + Status, + StatusAll, + add_to_sample_table, + db_init, + find_conditions_in_db, + find_errors_in_db, + update_db_where_conditions, +) + +logger = logging.getLogger(__name__) +logging.basicConfig( + encoding="utf-8", + level=logging.INFO, + format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ", + datefmt="%H:%M:%S", +) + + +@dataclass +class Config: + organisms: list[dict[str, str]] + metadata_mapping: dict[str, dict[str, str]] + metadata_mapping_mandatory_field_defaults: dict[str, str] + ena_checklist: str + use_ena_checklist: bool + backend_url: str + keycloak_token_url: str + keycloak_client_id: str + username: str + password: str + db_username: str + db_password: str + db_host: str + db_name: str + unique_project_suffix: str + ena_submission_url: str + ena_submission_password: str + ena_submission_username: str + ena_reports_service_url: str + slack_hook: str + slack_token: str + slack_channel_id: str + + +def get_sample_attributes(config: Config, sample_metadata: dict[str, str], row: dict[str, str]): + list_sample_attributes = [] + mapped_fields = [] + for field in config.metadata_mapping: + loculus_metadata_field_names = config.metadata_mapping[field]["loculus_fields"] + loculus_metadata_field_values = [ + sample_metadata.get(metadata) for 
metadata in loculus_metadata_field_names + ] + if ( + "function" in config.metadata_mapping[field] + and "args" in config.metadata_mapping[field] + ): + function = config.metadata_mapping[field]["function"] + args = [i for i in config.metadata_mapping[field]["args"] if i] + full_field_values = [i for i in loculus_metadata_field_values if i] + if function != "match": + logging.warning( + f"Unknown function: {function} with args: {args} for {row["accession"]}" + ) + continue + if function == "match" and (len(full_field_values) == len(args)): + value = True + for i in range(len(full_field_values)): + if not re.match( + args[i], + full_field_values[i], + re.IGNORECASE, + ): + value = False + break + else: + continue + else: + value = ";".join( + [str(metadata) for metadata in loculus_metadata_field_values if metadata] + ) + if value: + list_sample_attributes.append( + SampleAttribute( + tag=field, value=value, units=config.metadata_mapping[field].get("units") + ) + ) + mapped_fields.append(field) + for field, default in config.metadata_mapping_mandatory_field_defaults.items(): + if field not in mapped_fields: + list_sample_attributes.append( + SampleAttribute( + tag=field, + value=default, + ) + ) + return list_sample_attributes + + +def construct_sample_set_object( + config: Config, + sample_data_in_submission_table: dict[str, str], + entry: dict[str, str], + test=False, +): + """ + Construct sample set object, using: + - entry in sample_table + - sample_data_in_submission_table: corresponding entry in submission_table + - config information, such as ingest metadata for that organism + If test=True add a timestamp to the alias suffix to allow for multiple + submissions of the same project for testing. 
def construct_sample_set_object(
    config: Config,
    sample_data_in_submission_table: dict[str, str],
    entry: dict[str, str],
    test=False,
):
    """
    Construct sample set object, using:
    - entry in sample_table
    - sample_data_in_submission_table: corresponding entry in submission_table
    - config information, such as ingest metadata for that organism
    If test=True add a timestamp to the alias suffix to allow for multiple
    submissions of the same project for testing.
    (ENA blocks multiple submissions with the same alias)
    """
    sample_metadata = sample_data_in_submission_table["metadata"]
    center_name = sample_data_in_submission_table["center_name"]
    organism = sample_data_in_submission_table["organism"]
    # NOTE(review): config.organisms is annotated list[dict[str, str]] but is
    # indexed by organism name here - presumably the loaded YAML is a mapping;
    # confirm and fix the annotation.
    organism_metadata = config.organisms[organism]["ingest"]
    if test:
        # Timestamp keeps the alias unique so repeated test submissions are not
        # rejected by ENA's duplicate-alias check.
        alias = XmlAttribute(
            f"{entry["accession"]}:{organism}:{config.unique_project_suffix}:{datetime.now(tz=pytz.utc)}"
        )  # TODO(https://github.com/loculus-project/loculus/issues/2425): remove in production
    else:
        alias = XmlAttribute(f"{entry["accession"]}:{organism}:{config.unique_project_suffix}")
    list_sample_attributes = get_sample_attributes(config, sample_metadata, entry)
    if config.ena_checklist:
        # default is https://www.ebi.ac.uk/ena/browser/view/ERC000011
        sample_checklist = SampleAttribute(
            tag="ENA-CHECKLIST",
            value=config.ena_checklist,
        )
        list_sample_attributes.append(sample_checklist)
    sample_type = SampleType(
        center_name=XmlAttribute(center_name),
        alias=alias,
        title=f"{organism_metadata["scientific_name"]}: Genome sequencing",
        description=(
            f"Automated upload of {organism_metadata["scientific_name"]} sequences submitted by "
            f"{center_name} from {config.db_name}"
        ),
        sample_name=SampleName(
            taxon_id=organism_metadata["taxon_id"],
            scientific_name=organism_metadata["scientific_name"],
        ),
        # Cross-reference back to the originating Loculus entry.
        sample_links=SampleLinks(
            sample_link=ProjectLink(xref_link=XrefType(db=config.db_name, id=entry["accession"]))
        ),
        sample_attributes=SampleAttributes(sample_attribute=list_sample_attributes),
    )
    return SampleSetType(sample=[sample_type])
def submission_table_start(db_config: SimpleConnectionPool):
    """
    1. Find all entries in submission_table in state SUBMITTED_PROJECT
    2. If (exists an entry in the sample_table for (accession, version)):
    a. If (in state SUBMITTED) update state in submission_table to SUBMITTED_SAMPLE
    b. Else update state to SUBMITTING_SAMPLE
    3. Else create corresponding entry in sample_table

    Fix: uses the module-level `logger` (was `logging.debug`), consistent with
    submission_table_update; the two near-identical update branches are merged.
    """
    # Check submission_table for newly added sequences
    conditions = {"status_all": StatusAll.SUBMITTED_PROJECT}
    ready_to_submit = find_conditions_in_db(
        db_config, table_name="submission_table", conditions=conditions
    )
    logger.debug(
        f"Found {len(ready_to_submit)} entries in submission_table in status SUBMITTED_PROJECT"
    )
    for row in ready_to_submit:
        seq_key = {"accession": row["accession"], "version": row["version"]}

        # 1. check if there exists an entry in the sample table for seq_key
        corresponding_sample = find_conditions_in_db(
            db_config, table_name="sample_table", conditions=seq_key
        )
        if len(corresponding_sample) == 1:
            # Sample entry exists: advance status_all according to its state.
            if corresponding_sample[0]["status"] == str(Status.SUBMITTED):
                new_status = StatusAll.SUBMITTED_SAMPLE
            else:
                new_status = StatusAll.SUBMITTING_SAMPLE
            update_db_where_conditions(
                db_config,
                table_name="submission_table",
                conditions=seq_key,
                update_values={"status_all": new_status},
            )
        else:
            # If not: create sample_entry, change status to SUBMITTING_SAMPLE
            sample_table_entry = SampleTableEntry(**seq_key)
            succeeded = add_to_sample_table(db_config, sample_table_entry)
            if succeeded:
                update_db_where_conditions(
                    db_config,
                    table_name="submission_table",
                    conditions=seq_key,
                    update_values={"status_all": StatusAll.SUBMITTING_SAMPLE},
                )
def submission_table_update(db_config: SimpleConnectionPool):
    """
    1. Find all entries in submission_table in state SUBMITTING_SAMPLE
    2. If (exists an entry in the sample_table for (accession, version)):
    a. If (in state SUBMITTED) update state in submission_table to SUBMITTED_SAMPLE
    3. Else throw Error

    Bug fix: `error_msg` was built with trailing commas, which made it a tuple
    rather than a string; RuntimeError then rendered the tuple. It is now a
    single concatenated string.
    """
    conditions = {"status_all": StatusAll.SUBMITTING_SAMPLE}
    submitting_sample = find_conditions_in_db(
        db_config, table_name="submission_table", conditions=conditions
    )
    logger.debug(
        f"Found {len(submitting_sample)} entries in submission_table in status SUBMITTING_SAMPLE"
    )
    for row in submitting_sample:
        seq_key = {"accession": row["accession"], "version": row["version"]}

        # 1. check if there exists an entry in the sample table for seq_key
        corresponding_sample = find_conditions_in_db(
            db_config, table_name="sample_table", conditions=seq_key
        )
        if len(corresponding_sample) == 1 and corresponding_sample[0]["status"] == str(
            Status.SUBMITTED
        ):
            update_db_where_conditions(
                db_config,
                table_name="submission_table",
                conditions=seq_key,
                update_values={"status_all": StatusAll.SUBMITTED_SAMPLE},
            )
        if len(corresponding_sample) == 0:
            # This state should be unreachable: SUBMITTING_SAMPLE implies a
            # sample_table row was created by submission_table_start.
            error_msg = (
                "Entry in submission_table in status SUBMITTING_SAMPLE"
                " with no corresponding sample"
            )
            raise RuntimeError(error_msg)
Else update state to HAS_ERRORS with error messages + """ + ena_config = get_ena_config( + config.ena_submission_username, + config.ena_submission_password, + config.ena_submission_url, + config.ena_reports_service_url, + ) + conditions = {"status": Status.READY} + ready_to_submit_sample = find_conditions_in_db( + db_config, table_name="sample_table", conditions=conditions + ) + logger.debug(f"Found {len(ready_to_submit_sample)} entries in sample_table in status READY") + for row in ready_to_submit_sample: + seq_key = {"accession": row["accession"], "version": row["version"]} + sample_data_in_submission_table = find_conditions_in_db( + db_config, table_name="submission_table", conditions=seq_key + ) + + sample_set = construct_sample_set_object( + config, + sample_data_in_submission_table[0], + row, + test=True, # TODO(https://github.com/loculus-project/loculus/issues/2425): remove in production + ) + update_values = { + "status": Status.SUBMITTING, + "started_at": datetime.now(tz=pytz.utc), + } + number_rows_updated = update_db_where_conditions( + db_config, + table_name="sample_table", + conditions=seq_key, + update_values=update_values, + ) + if number_rows_updated != 1: + # state not correctly updated - do not start submission + logger.warning( + "sample_table: Status update from READY to SUBMITTING failed " + "- not starting submission." + ) + continue + logger.info(f"Starting sample creation for accession {row["accession"]}") + sample_creation_results: CreationResults = create_ena_sample(ena_config, sample_set) + if sample_creation_results.results: + update_values = { + "status": Status.SUBMITTED, + "result": json.dumps(sample_creation_results.results), + "finished_at": datetime.now(tz=pytz.utc), + } + number_rows_updated = 0 + tries = 0 + while number_rows_updated != 1 and tries < retry_number: + if tries > 0: + # If state not correctly added retry + logger.warning( + f"Sample created but DB update failed - reentry DB update #{tries}." 
+ ) + number_rows_updated = update_db_where_conditions( + db_config, + table_name="sample_table", + conditions=seq_key, + update_values=update_values, + ) + tries += 1 + if number_rows_updated == 1: + logger.info(f"Sample creation for accession {row["accession"]} succeeded!") + else: + update_values = { + "status": Status.HAS_ERRORS, + "errors": json.dumps(sample_creation_results.errors), + "started_at": datetime.now(tz=pytz.utc), + } + number_rows_updated = 0 + tries = 0 + while number_rows_updated != 1 and tries < retry_number: + if tries > 0: + # If state not correctly added retry + logger.warning( + f"sample creation failed and DB update failed - reentry DB update #{tries}." + ) + number_rows_updated = update_db_where_conditions( + db_config, + table_name="sample_table", + conditions=seq_key, + update_values=update_values, + ) + tries += 1 + + +def sample_table_handle_errors( + db_config: SimpleConnectionPool, + config: Config, + slack_config: SlackConfig, + time_threshold: int = 15, + slack_time_threshold: int = 12, +): + """ + - time_threshold: (minutes) + - slack_time_threshold: (hours) + 1. Find all entries in sample_table in state HAS_ERRORS or SUBMITTING over time_threshold + 2. 
def sample_table_handle_errors(
    db_config: SimpleConnectionPool,
    config: Config,
    slack_config: SlackConfig,
    time_threshold: int = 15,
    slack_time_threshold: int = 12,
):
    """
    - time_threshold: (minutes)
    - slack_time_threshold: (hours)
    1. Find all entries in sample_table in state HAS_ERRORS or SUBMITTING over time_threshold
    2. If time since last slack_notification is over slack_time_threshold send notification
    """
    stuck_entries = find_errors_in_db(db_config, "sample_table", time_threshold=time_threshold)
    if not stuck_entries:
        return
    error_msg = (
        f"{config.backend_url}: ENA Submission pipeline found {len(stuck_entries)} entries"
        f" in sample_table in status HAS_ERRORS or SUBMITTING for over {time_threshold}m"
    )
    send_slack_notification(
        error_msg,
        slack_config,
        time=datetime.now(tz=pytz.utc),
        time_threshold=slack_time_threshold,
    )
    # TODO: Query ENA to check if sample has in fact been created
    # If created update sample_table
    # If not retry 3 times, then raise for manual intervention


@click.command()
@click.option(
    "--log-level",
    default="INFO",
    type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
)
@click.option(
    "--config-file",
    required=True,
    type=click.Path(exists=True),
)
def create_sample(log_level, config_file):
    """Daemon entry point: poll forever, creating ENA samples for new submissions."""
    logger.setLevel(log_level)
    logging.getLogger("requests").setLevel(logging.INFO)

    # Keep only the keys the Config dataclass declares; missing keys default to [].
    with open(config_file, encoding="utf-8") as file:
        full_config = yaml.safe_load(file)
    config = Config(**{key: full_config.get(key, []) for key in Config.__annotations__})
    logger.info(f"Config: {config}")

    db_config = db_init(config.db_password, config.db_username, config.db_host)
    slack_config = slack_conn_init(
        slack_hook_default=config.slack_hook,
        slack_token_default=config.slack_token,
        slack_channel_id_default=config.slack_channel_id,
    )

    # Poll loop: each helper is idempotent, so repeating them is safe.
    while True:
        submission_table_start(db_config)
        submission_table_update(db_config)

        sample_table_create(db_config, config)
        sample_table_handle_errors(db_config, config, slack_config)
        time.sleep(2)


if __name__ == "__main__":
    create_sample()
000000000..2f3ff351a --- /dev/null +++ b/ena-submission/scripts/ena_submission_helper.py @@ -0,0 +1,430 @@ +import gzip +import json +import logging +import os +import re +import subprocess +import tempfile +from collections import defaultdict +from dataclasses import dataclass +from typing import Any + +import requests +import xmltodict +from ena_types import ( + AssemblyChromosomeListFile, + AssemblyManifest, + ProjectSet, + SampleSetType, + XmlAttribute, +) +from requests.auth import HTTPBasicAuth + +logger = logging.getLogger(__name__) +logging.basicConfig( + encoding="utf-8", + level=logging.INFO, + format="%(asctime)s %(levelname)8s (%(filename)20s:%(lineno)4d) - %(message)s ", + datefmt="%H:%M:%S", +) + + +@dataclass +class ENAConfig: + ena_submission_username: str + ena_submission_password: str + ena_submission_url: str + ena_reports_service_url: str + + +def get_ena_config( + ena_submission_username_default: str, + ena_submission_password_default: str, + ena_submission_url_default: str, + ena_reports_service_url_default: str, +) -> ENAConfig: + ena_submission_username = os.getenv("ENA_USERNAME") + if not ena_submission_username: + ena_submission_username = ena_submission_username_default + + ena_submission_password = os.getenv("ENA_PASSWORD") + if not ena_submission_password: + ena_submission_password = ena_submission_password_default + + ena_submission_url = ena_submission_url_default + ena_reports_service_url = ena_reports_service_url_default + + db_params = { + "ena_submission_username": ena_submission_username, + "ena_submission_password": ena_submission_password, + "ena_submission_url": ena_submission_url, + "ena_reports_service_url": ena_reports_service_url, + } + + return ENAConfig(**db_params) + + +@dataclass +class CreationResults: + errors: list[str] + warnings: list[str] + results: dict[str, str] | None = None + + +def recursive_defaultdict(): + return defaultdict(recursive_defaultdict) + + +def dataclass_to_dict(dataclass_instance): + """ + 
def dataclass_to_dict(dataclass_instance):
    """
    Converts a dataclass instance to a dictionary, handling nested dataclasses.

    Conventions for the resulting xmltodict-compatible dict:
    - None-valued fields are dropped entirely.
    - XmlAttribute values become "@field" keys (xmltodict renders these as XML
      attributes rather than child elements).
    - All other field names are upper-cased to match ENA's XML element names.
    """
    if not hasattr(dataclass_instance, "__dataclass_fields__"):
        # Base case: a plain value (str/int/...) is returned unchanged.
        return dataclass_instance
    result = {}
    for field in dataclass_instance.__dataclass_fields__:
        value = getattr(dataclass_instance, field)
        is_xml_attribute = isinstance(value, XmlAttribute)
        if value is None:
            continue
        if isinstance(value, list):
            result[field.upper()] = [dataclass_to_dict(item) for item in value]
        elif is_xml_attribute:
            attribute_field = "@" + field
            result[attribute_field] = value
        else:
            result[field.upper()] = dataclass_to_dict(value)
    return result


def dataclass_to_xml(dataclass_instance, root_name="root"):
    """Render a dataclass as a pretty-printed XML string under `root_name`."""
    dataclass_dict = dataclass_to_dict(dataclass_instance)
    return xmltodict.unparse({root_name: dataclass_dict}, pretty=True)


def get_submission_dict():
    """Build the minimal ENA SUBMISSION document: a single ADD action
    (<SUBMISSION><ACTIONS><ACTION><ADD/></ACTION></ACTIONS></SUBMISSION>)."""
    submission = recursive_defaultdict()
    submission["SUBMISSION"]["ACTIONS"]["ACTION"]["ADD"] = None
    return submission
def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResults:
    """
    The project creation request should be equivalent to
    curl -u {params.ena_submission_username}:{params.ena_submission_password} \
    -F "SUBMISSION=@{submission.xml}" \
    -F "PROJECT=@{project.xml}" \
    {params.ena_submission_url} \
    > {output}

    Bug fix: if post_webin itself raised (e.g. connection error), the except
    handler dereferenced an unbound `response`, masking the real failure with
    a NameError. `response` is now pre-initialised and guarded.
    """
    errors = []
    warnings = []

    def get_project_xml(project_set):
        submission_set = get_submission_dict()
        return {
            "SUBMISSION": xmltodict.unparse(submission_set, pretty=True),
            "PROJECT": dataclass_to_xml(project_set, root_name="PROJECT_SET"),
        }

    xml = get_project_xml(project_set)
    response = None
    try:
        response = post_webin(config, xml)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        if response is not None:
            error_message = (
                f"Request failed with status:{response.status_code}. Message: {e}. "
                f"Response: {response.text}."
            )
        else:
            error_message = f"Request to ENA failed before a response was received: {e}."
        logger.warning(error_message)
        errors.append(error_message)
        return CreationResults(results=None, errors=errors, warnings=warnings)
    try:
        parsed_response = xmltodict.parse(response.text)
        # A valid receipt must report success and carry both accessions.
        valid = (
            parsed_response["RECEIPT"]["@success"] == "true"
            and parsed_response["RECEIPT"]["PROJECT"]["@accession"]
            and parsed_response["RECEIPT"]["SUBMISSION"]["@accession"]
        )
        if not valid:
            raise requests.exceptions.RequestException
    except Exception as e:
        error_message = f"Response is in unexpected format: {e}. " f"Response: {response.text}."
        logger.warning(error_message)
        errors.append(error_message)
        return CreationResults(results=None, errors=errors, warnings=warnings)
    project_results = {
        "bioproject_accession": parsed_response["RECEIPT"]["PROJECT"]["@accession"],
        "ena_submission_accession": parsed_response["RECEIPT"]["SUBMISSION"]["@accession"],
    }
    return CreationResults(results=project_results, errors=errors, warnings=warnings)
" + f"Request: {response.request}, Response: {response.text}" + ) + logger.warning(error_message) + errors.append(error_message) + return CreationResults(results=None, errors=errors, warnings=warnings) + try: + parsed_response = xmltodict.parse(response.text) + valid = ( + parsed_response["RECEIPT"]["@success"] == "true" + and parsed_response["RECEIPT"]["SAMPLE"]["@accession"] + and parsed_response["RECEIPT"]["SAMPLE"]["EXT_ID"]["@type"] == "biosample" + and parsed_response["RECEIPT"]["SAMPLE"]["EXT_ID"]["@accession"] + and parsed_response["RECEIPT"]["SUBMISSION"]["@accession"] + ) + if not valid: + raise requests.exceptions.RequestException + except: + error_message = ( + f"Response is in unexpected format. " + f"Request: {response.request}, Response: {response.text}" + ) + logger.warning(error_message) + errors.append(error_message) + return CreationResults(results=None, errors=errors, warnings=warnings) + sample_results = { + "ena_sample_accession": parsed_response["RECEIPT"]["SAMPLE"]["@accession"], + "biosample_accession": parsed_response["RECEIPT"]["SAMPLE"]["EXT_ID"]["@accession"], + "ena_submission_accession": parsed_response["RECEIPT"]["SUBMISSION"]["@accession"], + } + return CreationResults(results=sample_results, errors=errors, warnings=warnings) + + +def post_webin(config: ENAConfig, xml: dict[str, Any]) -> requests.Response: + return requests.post( + config.ena_submission_url, + auth=HTTPBasicAuth(config.ena_submission_username, config.ena_submission_password), + files=xml, + timeout=10, # wait a full 10 seconds for a response incase slow + ) + + +def create_chromosome_list(list_object: AssemblyChromosomeListFile) -> str: + """ + Creates a temp file chromosome list: + https://ena-docs.readthedocs.io/en/latest/submit/fileprep/assembly.html#chromosome-list-file + """ + with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as temp: + filename = temp.name + + with gzip.GzipFile(filename, "wb") as gz: + for entry in list_object.chromosomes: + 
gz.write( + f"{entry.object_name}\t{entry.chromosome_name}\t{entry.topology!s}-{entry.chromosome_type!s}\n".encode() + ) + + return filename + + +def create_fasta( + unaligned_sequences: dict[str, str], chromosome_list: AssemblyChromosomeListFile +) -> str: + """ + Creates a temp fasta file: + https://ena-docs.readthedocs.io/en/latest/submit/fileprep/assembly.html#fasta-file + """ + with tempfile.NamedTemporaryFile(delete=False, suffix=".fasta.gz") as temp: + filename = temp.name + + with gzip.GzipFile(filename, "wb") as gz: + if len(unaligned_sequences.keys()) == 1: + entry = chromosome_list.chromosomes[0] + gz.write(f">{entry.object_name}\n".encode()) + gz.write(f"{unaligned_sequences["main"]}\n".encode()) + else: + for entry in chromosome_list.chromosomes: + gz.write(f">{entry.object_name}\n".encode()) + gz.write(f"{unaligned_sequences[entry.chromosome_name]}\n".encode()) + + return filename + + +def create_manifest(manifest: AssemblyManifest) -> str: + """ + Creates a temp manifest file: + https://ena-docs.readthedocs.io/en/latest/submit/assembly/genome.html#manifest-files + """ + with tempfile.NamedTemporaryFile(delete=False, suffix=".tsv") as temp: + filename = temp.name + with open(filename, "w") as f: + f.write(f"STUDY\t{manifest.study}\n") + f.write(f"SAMPLE\t{manifest.sample}\n") + f.write( + f"ASSEMBLYNAME\t{manifest.assemblyname}\n" + ) # This is the alias that needs to be unique + f.write(f"ASSEMBLY_TYPE\t{manifest.assembly_type!s}\n") + f.write(f"COVERAGE\t{manifest.coverage}\n") + f.write(f"PROGRAM\t{manifest.program}\n") + f.write(f"PLATFORM\t{manifest.platform}\n") + f.write(f"FASTA\t{manifest.fasta}\n") + f.write(f"CHROMOSOME_LIST\t{manifest.chromosome_list}\n") + if manifest.description: + f.write(f"DESCRIPTION\t{manifest.description}\n") + if manifest.moleculetype: + f.write(f"MOLECULETYPE\t{manifest.moleculetype!s}\n") + + return filename + + +def post_webin_cli( + config: ENAConfig, manifest_filename, center_name=None +) -> 
def post_webin_cli(
    config: ENAConfig, manifest_filename, center_name=None
) -> subprocess.CompletedProcess:
    """Run ENA's webin-cli jar to submit the genome described by the manifest.

    Returns the CompletedProcess; the caller inspects returncode/stdout. No
    shell is involved (argument list), and check=False leaves error handling
    to the caller.
    """
    command = [
        "java",
        "-jar",
        "webin-cli.jar",
        "-username",
        config.ena_submission_username,
        "-password",
        config.ena_submission_password,
        "-context",
        "genome",
        "-manifest",
        manifest_filename,
        "-submit",
        "-test",  # TODO(https://github.com/loculus-project/loculus/issues/2425): remove in prod
    ]
    if center_name:
        command += ["-centername", center_name]
    return subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=False,
    )
" + f"Stdout: {response.stdout}, Stderr: {response.stderr}" + ) + logger.warning(error_message) + errors.append(error_message) + return CreationResults(results=None, errors=errors, warnings=warnings) + assembly_results = { + "erz_accession": erz_accession, + } + return CreationResults(results=assembly_results, errors=errors, warnings=warnings) + + +def check_ena(config: ENAConfig, erz_accession: str) -> CreationResults: + """ + This is equivalent to running: + curl -X 'GET' \ + '{config.ena_reports_service_url}/analysis-process/{erz_accession}?format=json&max-results=100' \ + -H 'accept: */*' \ + -H 'Authorization: Basic KEY' + """ + url = f"{config.ena_reports_service_url}/analysis-process/{erz_accession}?format=json&max-results=100" + + errors = [] + warnings = [] + try: + response = requests.get( + url, + auth=HTTPBasicAuth(config.ena_submission_username, config.ena_submission_password), + timeout=10, # wait a full 10 seconds for a response incase slow + ) + response.raise_for_status() + except requests.exceptions.RequestException: + error_message = ( + f"ENA check failed with status:{response.status_code}. 
" + f"Request: {response.request}, Response: {response.text}" + ) + logger.warning(error_message) + errors.append(error_message) + return CreationResults(results=None, errors=errors, warnings=warnings) + if response.text == "[]": + # For some minutes the response will be empty, requests to + # f"{config.ena_reports_service_url}/analysis-files/{erz_accession}?format=json" + # should still succeed + return CreationResults(results=None, errors=errors, warnings=warnings) + try: + parsed_response = json.loads(response.text) + entry = parsed_response[0]["report"] + if entry["processingError"]: + raise requests.exceptions.RequestException + if entry["processingStatus"] == "COMPLETED": + acc_list = entry["acc"].split(",") + acc_dict = {a.split(":")[0]: a.split(":")[-1] for a in acc_list} + if "genome" not in acc_dict: + raise requests.exceptions.RequestException + gca_accession = acc_dict["genome"] + if "chromosomes" not in acc_dict: + raise requests.exceptions.RequestException + insdc_accession = acc_dict["chromosomes"] + else: + return CreationResults(results=None, errors=errors, warnings=warnings) + except: + error_message = ( + f"ENA Check returned errors or is in unexpected format. 
" + f"Request: {response.request}, Response: {response.text}" + ) + logger.warning(error_message) + errors.append(error_message) + return CreationResults(results=None, errors=errors, warnings=warnings) + assembly_results = { + "erz_accession": erz_accession, + "gca_accession": gca_accession, + "insdc_accession": insdc_accession, + } + return CreationResults(results=assembly_results, errors=errors, warnings=warnings) diff --git a/ena-submission/scripts/ena_types.py b/ena-submission/scripts/ena_types.py new file mode 100644 index 000000000..2f8043649 --- /dev/null +++ b/ena-submission/scripts/ena_types.py @@ -0,0 +1,247 @@ +import dataclasses +from dataclasses import dataclass, field +from enum import Enum + + +@dataclass +class XrefType: + db: str | None = None + id: str | None = None + label: str | None = None + + +@dataclass +class UrlType: + label: str | None = None + url: str | None = None + + +@dataclass +class ProjectLink: + xref_link: XrefType | None = None + url_link: UrlType | None = None + + +@dataclass +class ProjectLinks: + project_link: list[ProjectLink] | None = None + + +@dataclass +class OrganismType: + taxon_id: int | None = None + scientific_name: str | None = None + common_name: str | None = None + strain: str | None = None + breed: str | None = None + cultivar: str | None = None + isolate: str | None = None + + +@dataclass +class SequencingProject: + locus_tag_prefix: list[str] = dataclasses.field(default_factory=list) + + +def default_sequencing_project() -> SequencingProject: + return SequencingProject() + + +@dataclass +class SubmissionProject: + sequencing_project: SequencingProject = field(default_factory=default_sequencing_project) + organism: OrganismType | None = None + + +@dataclass +class UmbrellaProject: + organism: OrganismType | None = None + + +@dataclass +class RelatedProjectSubType: + accession: str | None = None + + +@dataclass +class RelatedProject: + parent_project: RelatedProjectSubType | None = None + child_project: 
@dataclass
class ProjectTypeCollaborators:
    collaborator: list[str]


@dataclass
class XmlAttribute:
    """Marker wrapper: values of this type are rendered as XML attributes
    ("@name") instead of child elements by dataclass_to_dict.

    NOTE(review): the explicit __init__ means @dataclass generates no fields,
    so dataclass equality compares all instances equal - confirm nothing
    relies on XmlAttribute equality.
    """

    def __init__(self, name):
        self.name = name

    def __str__(self):
        return self.name


@dataclass
class ProjectType:
    name: str
    title: str
    description: str
    center_name: XmlAttribute | None = None
    alias: XmlAttribute | None = None
    collaborators: ProjectTypeCollaborators | None = None
    submission_project: SubmissionProject | None = None
    umbrella_project: UmbrellaProject | None = None
    related_projects: RelatedProject | None = None
    project_links: ProjectLinks | None = None
    project_attributes: dict[str, str] | None = None


def default_project_type():
    return ProjectType(
        name="default_name", title="default_title", description="default_description"
    )


@dataclass
class ProjectSet:
    project: list[ProjectType]


@dataclass
class SampleName:
    taxon_id: int | None = None
    scientific_name: str | None = None
    common_name: str | None = None
    display_name: str | None = None


@dataclass
class SampleAttribute:
    tag: str
    value: str
    units: str | None = None


@dataclass
class SampleAttributes:
    # Fix: annotation was `list[SampleAttribute] = None`, which mistypes the
    # None default; it is optional.
    sample_attribute: list[SampleAttribute] | None = None


@dataclass
class SampleLinks:
    sample_link: list[ProjectLink]


@dataclass
class SampleType:
    center_name: XmlAttribute | None = None
    alias: XmlAttribute | None = None
    title: str | None = None
    sample_name: SampleName | None = None
    description: str | None = None
    sample_links: SampleLinks | None = None
    sample_attributes: SampleAttributes | None = None


def default_sample_type():
    return SampleType()


@dataclass
class SampleSetType:
    sample: list[SampleType]


class AssemblyType(Enum):
    CLONE = "clone"
    ISOLATE = "isolate"

    def __str__(self):
        return self.value
RNA" + VIRAL_CRNA = "viral cRNA" + + def __str__(self): + return self.value + + +@dataclass +class AssemblyManifest: + study: str + sample: str + assemblyname: str # Note: this SHOULD be 1 word no hyphen + assembly_type: AssemblyType + coverage: str + program: str + platform: str + fasta: str + chromosome_list: str + mingaplength: int | None = None + moleculetype: MoleculeType | None = None + description: str | None = None + run_ref: list[str] | None = None + + +class ChromosomeType(Enum): + CHROMOSOME = "chromosome" + PLASMID = "plasmid" + LINKAGE_GROUP = "linkage_group" + MONOPARTITE = "monopartite" + SEGMENTED = "segmented" + MULTIPARTITE = "multipartite" + + def __str__(self): + return self.value + + +class ChromosomeLocation(Enum): + MACRONUCLEAR = "macronuclear" + NUCLEOMORPH = "nucleomorph" + MITOCHONDRION = "mitochondrion" + KINETOPLAST = "kinetoplast" + CHLOROPLAST = "chloroplast" + CHROMOPLAST = "chromoplast" + PLASTID = "plastid" + VIRION = "virion" + PHAGE = "phage" + PROVIRAL = "proviral" + PROPHAGE = "prophage" + VIROID = "viroid" + CYANELLE = "cyanelle" + APICOPLAST = "apicoplast" + LEUCOPLAST = "leucoplast" + PROPLASTID = "proplastid" + HYDROGENOSOME = "hydrogenosome" + CHROMATOPHORE = "chromatophore" + + def __str__(self): + return self.value + + +class Topology(Enum): + LINEAR = "linear" + CIRCULAR = "circular" + + def __str__(self): + return self.value + + +@dataclass +class AssemblyChromosomeListFileObject: + object_name: str + chromosome_name: str + chromosome_type: ChromosomeType + topology: Topology = Topology.LINEAR + chromosome_location: ChromosomeLocation | None = None + + +@dataclass +class AssemblyChromosomeListFile: + chromosomes: list[AssemblyChromosomeListFileObject] diff --git a/ena-submission/scripts/get_ena_submission_list.py b/ena-submission/scripts/get_ena_submission_list.py index aed9d78a3..0ab5cd0d8 100644 --- a/ena-submission/scripts/get_ena_submission_list.py +++ b/ena-submission/scripts/get_ena_submission_list.py @@ -6,9 
+6,10 @@ import click import yaml -from call_loculus import get_released_data -from notifications import get_slack_config, notify, upload_file_with_comment -from submission_db_helper import get_db_config, in_submission_table +from call_loculus import fetch_released_entries +from notifications import notify, slack_conn_init, upload_file_with_comment +from psycopg2.pool import SimpleConnectionPool +from submission_db_helper import db_init, in_submission_table logger = logging.getLogger(__name__) logging.basicConfig( @@ -38,7 +39,9 @@ class Config: slack_channel_id: str -def get_data_for_submission(config, entries, db_config, organism): +def filter_for_submission( + config: Config, db_config: SimpleConnectionPool, entries: dict[str, str], organism: str +) -> dict[str, Any]: """ Filter data in state APPROVED_FOR_RELEASE: - data must be state "OPEN" for use @@ -56,11 +59,10 @@ def get_data_for_submission(config, entries, db_config, organism): continue if item["metadata"]["submitter"] == config.ingest_pipeline_submitter: continue - fields = [1 if item["metadata"][field] else 0 for field in config.ena_specific_metadata] - if in_submission_table(accession, version, db_config): + if in_submission_table(db_config, {"accession": accession, "version": version}): continue - if sum(fields) > 0: - logging.warn( + if any(item["metadata"].get(field, False) for field in config.ena_specific_metadata): + logging.warning( f"Found sequence: {key} with ena-specific-metadata fields and not submitted by us ", f"or {config.ingest_pipeline_submitter}. 
Potential user error: discarding sequence.", ) @@ -70,25 +72,24 @@ def get_data_for_submission(config, entries, db_config, organism): return data_dict -def send_slack_notification(config: Config, output_file: str): - slack_config = get_slack_config( +def send_slack_notification_with_file(config: Config, output_file: str) -> None: + slack_config = slack_conn_init( slack_hook_default=config.slack_hook, slack_token_default=config.slack_token, slack_channel_id_default=config.slack_channel_id, ) if not slack_config.slack_hook: - logging.info("Could not find slack hook cannot send message") - - if slack_config.slack_hook: - comment = ( - f"{config.backend_url}: ENA Submission pipeline wants to submit the following sequences" - ) - try: - response = upload_file_with_comment(slack_config, output_file, comment) - if not response.get("ok", False): - raise Exception - except Exception as e: - notify(slack_config, comment + f" - file upload to slack failed with Error {e}") + logging.info("Could not find slack hook, cannot send message") + return + comment = ( + f"{config.backend_url}: ENA Submission pipeline wants to submit the following sequences" + ) + try: + response = upload_file_with_comment(slack_config, output_file, comment) + if not response.get("ok", False): + raise Exception + except Exception as e: + notify(slack_config, comment + f" - file upload to slack failed with Error {e}") @click.command() @@ -121,7 +122,7 @@ def get_ena_submission_list(log_level, config_file, output_file): config = Config(**relevant_config) logger.info(f"Config: {config}") - db_config = get_db_config( + db_config = db_init( db_password_default=config.db_password, db_username_default=config.db_username, db_host_default=config.db_host, @@ -134,13 +135,13 @@ def get_ena_submission_list(log_level, config_file, output_file): ] logging.info(f"Getting released sequences for organism: {organism}") - all_entries = get_released_data(config, organism) - data = get_data_for_submission(config, 
all_entries, db_config, organism) - entries_to_submit.update(data) + released_entries = fetch_released_entries(config, organism) + submittable_entries = filter_for_submission(config, db_config, released_entries, organism) + entries_to_submit.update(submittable_entries) if entries_to_submit: Path(output_file).write_text(json.dumps(entries_to_submit), encoding="utf-8") - send_slack_notification(config, output_file) + send_slack_notification_with_file(config, output_file) else: logging.info("No sequences found to submit to ENA") Path(output_file).write_text("", encoding="utf-8") diff --git a/ena-submission/scripts/notifications.py b/ena-submission/scripts/notifications.py index 302b2f305..ecc5be8bc 100644 --- a/ena-submission/scripts/notifications.py +++ b/ena-submission/scripts/notifications.py @@ -3,9 +3,10 @@ import os import zipfile from dataclasses import dataclass +from datetime import datetime, timedelta import requests -from slack_sdk import WebClient +from slack_sdk import WebClient, web @dataclass @@ -13,14 +14,15 @@ class SlackConfig: slack_hook: str slack_token: str slack_channel_id: str + last_notification_sent: datetime | None logger = logging.getLogger(__name__) -def get_slack_config( +def slack_conn_init( slack_hook_default: str, slack_token_default: str, slack_channel_id_default: str -): +) -> SlackConfig: slack_hook = os.getenv("SLACK_HOOK") if not slack_hook: slack_hook = slack_hook_default @@ -33,10 +35,13 @@ def get_slack_config( if not slack_channel_id: slack_channel_id = slack_channel_id_default + last_notification_sent = None + params = { "slack_hook": slack_hook, "slack_token": slack_token, "slack_channel_id": slack_channel_id, + "last_notification_sent": last_notification_sent, } return SlackConfig(**params) @@ -48,7 +53,9 @@ def notify(config: SlackConfig, text: str): requests.post(config.slack_hook, data=json.dumps({"text": text}), timeout=10) -def upload_file_with_comment(config: SlackConfig, file_path: str, comment: str): +def 
upload_file_with_comment( + config: SlackConfig, file_path: str, comment: str +) -> web.SlackResponse: """Upload file with comment to slack channel""" client = WebClient(token=config.slack_token) output_file_zip = file_path.split(".")[0] + ".zip" @@ -61,3 +68,22 @@ def upload_file_with_comment(config: SlackConfig, file_path: str, comment: str): channel=config.slack_channel_id, initial_comment=comment, ) + + +def send_slack_notification( + comment: str, slack_config: SlackConfig, time: datetime, time_threshold: int = 12 +): + """ + Sends a slack notification if current time is over time_threshold hours + since slack_config.last_notification_sent. + """ + if not slack_config.slack_hook: + logger.info("Could not find slack hook cannot send message") + return + if ( + not slack_config.last_notification_sent + or time - timedelta(hours=time_threshold) > slack_config.last_notification_sent + ): + logger.warning(comment) + notify(slack_config, comment) + slack_config.last_notification_sent = time diff --git a/ena-submission/scripts/submission_db_helper.py b/ena-submission/scripts/submission_db_helper.py index ba450fa88..4b9c4138b 100644 --- a/ena-submission/scripts/submission_db_helper.py +++ b/ena-submission/scripts/submission_db_helper.py @@ -1,20 +1,17 @@ import os from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timedelta from enum import Enum import psycopg2 import pytz +from psycopg2.extras import RealDictCursor +from psycopg2.pool import SimpleConnectionPool -@dataclass -class DBConfig: - username: str - password: str - host: str - - -def get_db_config(db_password_default: str, db_username_default: str, db_host_default: str): +def db_init( + db_password_default: str, db_username_default: str, db_host_default: str +) -> SimpleConnectionPool: db_password = os.getenv("DB_PASSWORD") if not db_password: db_password = db_password_default @@ -27,25 +24,30 @@ def get_db_config(db_password_default: str, db_username_default: 
str, db_host_de if not db_host: db_host = db_host_default - db_params = { - "username": db_username, - "password": db_password, - "host": db_host, - } - - return DBConfig(**db_params) + return SimpleConnectionPool( + minconn=1, + maxconn=4, # max 7*4 connections to db allowed + dbname="loculus", + user=db_username, + host=db_host, + password=db_password, + options="-c search_path=ena-submission", + ) class StatusAll(Enum): READY_TO_SUBMIT = 0 SUBMITTING_PROJECT = 1 - SUBMITTING_SAMPLE = 2 - SUBMITTING_ASSEMBLY = 3 - SUBMITTED_ALL = 4 - SENT_TO_LOCULUS = 5 - HAS_ERRORS_PROJECT = 6 - HAS_ERRORS_ASSEMBLY = 7 - HAS_ERRORS_SAMPLE = 8 + SUBMITTED_PROJECT = 2 + SUBMITTING_SAMPLE = 3 + SUBMITTED_SAMPLE = 4 + SUBMITTING_ASSEMBLY = 5 + SUBMITTED_ALL = 6 + SENT_TO_LOCULUS = 7 + HAS_ERRORS_PROJECT = 8 + HAS_ERRORS_ASSEMBLY = 9 + HAS_ERRORS_SAMPLE = 10 + HAS_ERRORS_EXT_METADATA_UPLOAD = 11 def __str__(self): return self.name @@ -56,11 +58,44 @@ class Status(Enum): SUBMITTING = 1 SUBMITTED = 2 HAS_ERRORS = 3 + WAITING = 4 # Only for assembly creation def __str__(self): return self.name +class TableName(Enum): + PROJECT_TABLE = "project_table" + SAMPLE_TABLE = "sample_table" + ASSEMBLY_TABLE = "assembly_table" + SUBMISSION_TABLE = "submission_table" + + @classmethod + def validate(cls, value: str): + if value not in cls._value2member_map_: + msg = ( + f"Invalid table name '{value}'." 
+ f" Allowed values are: {', '.join([e.value for e in cls])}" + ) + raise ValueError(msg) + + +def is_valid_column_name(table_name: TableName, column_name: str) -> bool: + match table_name: + case "project_table": + field_names = ProjectTableEntry.__annotations__.keys() + case "sample_table": + field_names = SampleTableEntry.__annotations__.keys() + case "assembly_table": + field_names = AssemblyTableEntry.__annotations__.keys() + case "submission_table": + field_names = SubmissionTableEntry.__annotations__.keys() + + if column_name not in field_names: + msg = f"Invalid column name '{column_name}' for {table_name}" + raise ValueError(msg) + + @dataclass class SubmissionTableEntry: accession: str @@ -74,57 +109,336 @@ class SubmissionTableEntry: finished_at: datetime | None = None metadata: str | None = None unaligned_nucleotide_sequences: str | None = None + center_name: str | None = None external_metadata: str | None = None -def connect_to_db(db_config: DBConfig): - """ - Establish connection to ena_submitter DB, if DB doesn't exist create it. 
- """ +@dataclass +class ProjectTableEntry: + group_id: int + organism: str + errors: str | None = None + warnings: str | None = None + status: Status = Status.READY + started_at: datetime | None = None + finished_at: datetime | None = None + center_name: str | None = None + result: str | None = None + + +@dataclass +class SampleTableEntry: + accession: str + version: int + errors: str | None = None + warnings: str | None = None + status: Status = Status.READY + started_at: datetime | None = None + finished_at: datetime | None = None + result: str | None = None + + +@dataclass +class AssemblyTableEntry: + accession: str + version: int + errors: str | None = None + warnings: str | None = None + status: Status = Status.READY + started_at: datetime | None = None + finished_at: datetime | None = None + result: str | None = None + + +def find_conditions_in_db( + db_conn_pool: SimpleConnectionPool, table_name: TableName, conditions: dict[str, str] +) -> dict[str, str]: + con = db_conn_pool.getconn() try: - con = psycopg2.connect( - dbname="loculus", - user=db_config.username, - host=db_config.host, - password=db_config.password, - options="-c search_path=ena-submission", - ) - except ConnectionError as e: - raise ConnectionError("Could not connect to loculus DB") from e - return con - - -def in_submission_table(accession: str, version: int, db_config: DBConfig) -> bool: - con = connect_to_db(db_config) - cur = con.cursor() - cur.execute( - "select * from submission_table where accession=%s and version=%s", - (f"{accession}", f"{version}"), - ) - return bool(cur.rowcount) - - -def add_to_submission_table(db_config: DBConfig, submission_table_entry: SubmissionTableEntry): - con = connect_to_db(db_config) - cur = con.cursor() - submission_table_entry.started_at = datetime.now(tz=pytz.utc) - - cur.execute( - "insert into submission_table values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", - ( - submission_table_entry.accession, - submission_table_entry.version, - 
submission_table_entry.organism, - submission_table_entry.group_id, - submission_table_entry.errors, - submission_table_entry.warnings, - str(submission_table_entry.status_all), - submission_table_entry.started_at, - submission_table_entry.finished_at, - submission_table_entry.metadata, - submission_table_entry.unaligned_nucleotide_sequences, - submission_table_entry.external_metadata, - ), - ) - con.commit() - con.close() + with con, con.cursor(cursor_factory=RealDictCursor) as cur: + # Prevent sql-injection with table_name and column_name validation + TableName.validate(table_name) + for key in conditions: + is_valid_column_name(table_name, key) + + query = f"SELECT * FROM {table_name}" # noqa: S608 + + where_clause = " AND ".join([f"{key}=%s" for key in conditions]) + query += f" WHERE {where_clause}" + + cur.execute( + query, + tuple( + str(value) if (isinstance(value, (Status, StatusAll))) else value # noqa: UP038 + for value in conditions.values() + ), + ) + + results = cur.fetchall() + finally: + db_conn_pool.putconn(con) + + return results + + +def find_errors_in_db( + db_conn_pool: SimpleConnectionPool, table_name: TableName, time_threshold: int = 15 +) -> dict[str, str]: + con = db_conn_pool.getconn() + try: + with con, con.cursor(cursor_factory=RealDictCursor) as cur: + min_start_time = datetime.now(tz=pytz.utc) + timedelta(minutes=-time_threshold) + # Prevent sql-injection with table_name validation + TableName.validate(table_name) + + query = f""" + SELECT * FROM {table_name} + WHERE (status = 'HAS_ERRORS' AND started_at < %s) + OR (status = 'SUBMITTING' AND started_at < %s) + """ # noqa: S608 + + cur.execute(query, (min_start_time, min_start_time)) + + results = cur.fetchall() + finally: + db_conn_pool.putconn(con) + + return results + + +def find_stuck_in_submission_db( + db_conn_pool: SimpleConnectionPool, time_threshold: int = 48 +) -> dict[str, str]: + con = db_conn_pool.getconn() + try: + with con, con.cursor(cursor_factory=RealDictCursor) as 
cur: + min_start_time = datetime.now(tz=pytz.utc) + timedelta(hours=-time_threshold) + + query = """ + SELECT * FROM submission_table + WHERE status_all = 'HAS_ERRORS_EXT_METADATA_UPLOAD' + AND started_at < %s + """ + + cur.execute(query, (min_start_time,)) + + results = cur.fetchall() + finally: + db_conn_pool.putconn(con) + + return results + + +def find_waiting_in_db( + db_conn_pool: SimpleConnectionPool, table_name: TableName, time_threshold: int = 48 +) -> dict[str, str]: + con = db_conn_pool.getconn() + try: + with con, con.cursor(cursor_factory=RealDictCursor) as cur: + min_start_time = datetime.now(tz=pytz.utc) + timedelta(hours=-time_threshold) + # Prevent sql-injection with table_name validation + TableName.validate(table_name) + + query = f"SELECT * FROM {table_name} WHERE status = 'WAITING' AND started_at < %s" # noqa: S608 + + cur.execute(query, (min_start_time,)) + + results = cur.fetchall() + finally: + db_conn_pool.putconn(con) + + return results + + +def update_db_where_conditions( + db_conn_pool: SimpleConnectionPool, + table_name: TableName, + conditions: dict[str, str], + update_values: dict[str, str], +) -> int: + updated_row_count = 0 + con = db_conn_pool.getconn() + try: + with con, con.cursor(cursor_factory=RealDictCursor) as cur: + # Prevent sql-injection with table_name and column_name validation + TableName.validate(table_name) + for key in conditions: + is_valid_column_name(table_name, key) + + query = f"UPDATE {table_name} SET " # noqa: S608 + + set_clause = ", ".join([f"{key}=%s" for key in update_values]) + query += set_clause + + where_clause = " AND ".join([f"{key}=%s" for key in conditions]) + query += f" WHERE {where_clause}" + parameters = tuple( + str(value) if (isinstance(value, (Status, StatusAll))) else value # noqa: UP038 + for value in update_values.values() + ) + tuple( + str(value) if (isinstance(value, (Status, StatusAll))) else value # noqa: UP038 + for value in conditions.values() + ) + + cur.execute(query, parameters) 
+ updated_row_count = cur.rowcount + con.commit() + except (Exception, psycopg2.DatabaseError) as e: + con.rollback() + print(f"update_db_where_conditions errored with: {e}") + finally: + db_conn_pool.putconn(con) + return updated_row_count + + +def add_to_project_table( + db_conn_pool: SimpleConnectionPool, project_table_entry: ProjectTableEntry +) -> bool: + con = db_conn_pool.getconn() + try: + with con, con.cursor() as cur: + project_table_entry.started_at = datetime.now(tz=pytz.utc) + + cur.execute( + "INSERT INTO project_table VALUES (%s, %s, %s, %s, %s, %s, %s, %s)", + ( + project_table_entry.group_id, + project_table_entry.organism, + project_table_entry.errors, + project_table_entry.warnings, + str(project_table_entry.status), + project_table_entry.started_at, + project_table_entry.finished_at, + project_table_entry.result, + ), + ) + + con.commit() + return True + except Exception as e: + con.rollback() + print(f"add_to_project_table errored with: {e}") + return False + finally: + db_conn_pool.putconn(con) + + +def add_to_sample_table( + db_conn_pool: SimpleConnectionPool, sample_table_entry: SampleTableEntry +) -> bool: + con = db_conn_pool.getconn() + try: + with con, con.cursor() as cur: + sample_table_entry.started_at = datetime.now(tz=pytz.utc) + + cur.execute( + "insert into sample_table values(%s,%s,%s,%s,%s,%s,%s,%s)", + ( + sample_table_entry.accession, + sample_table_entry.version, + sample_table_entry.errors, + sample_table_entry.warnings, + str(sample_table_entry.status), + sample_table_entry.started_at, + sample_table_entry.finished_at, + sample_table_entry.result, + ), + ) + con.commit() + return True + except Exception as e: + con.rollback() + print(f"add_to_sample_table errored with: {e}") + return False + finally: + db_conn_pool.putconn(con) + + +def add_to_assembly_table( + db_conn_pool: SimpleConnectionPool, assembly_table_entry: AssemblyTableEntry +) -> bool: + con = db_conn_pool.getconn() + try: + with con, con.cursor() as cur: + 
assembly_table_entry.started_at = datetime.now(tz=pytz.utc) + + cur.execute( + "insert into assembly_table values(%s,%s,%s,%s,%s,%s,%s,%s)", + ( + assembly_table_entry.accession, + assembly_table_entry.version, + assembly_table_entry.errors, + assembly_table_entry.warnings, + str(assembly_table_entry.status), + assembly_table_entry.started_at, + assembly_table_entry.finished_at, + assembly_table_entry.result, + ), + ) + con.commit() + return True + except Exception as e: + con.rollback() + print(f"add_to_assembly_table errored with: {e}") + return False + finally: + db_conn_pool.putconn(con) + + +def in_submission_table(db_conn_pool: SimpleConnectionPool, conditions) -> bool: + con = db_conn_pool.getconn() + try: + with con, con.cursor() as cur: + for key in conditions: + is_valid_column_name("submission_table", key) + + query = "SELECT * from submission_table" + + where_clause = " AND ".join([f"{key}=%s" for key in conditions]) + query += f" WHERE {where_clause}" + cur.execute( + query, + tuple( + str(value) if (isinstance(value, (Status, StatusAll))) else value # noqa: UP038 + for value in conditions.values() + ), + ) + in_db = bool(cur.rowcount) + finally: + db_conn_pool.putconn(con) + return in_db + + +def add_to_submission_table( + db_conn_pool: SimpleConnectionPool, submission_table_entry: SubmissionTableEntry +) -> bool: + con = db_conn_pool.getconn() + try: + with con, con.cursor() as cur: + submission_table_entry.started_at = datetime.now(tz=pytz.utc) + + cur.execute( + "insert into submission_table values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", + ( + submission_table_entry.accession, + submission_table_entry.version, + submission_table_entry.organism, + submission_table_entry.group_id, + submission_table_entry.errors, + submission_table_entry.warnings, + str(submission_table_entry.status_all), + submission_table_entry.started_at, + submission_table_entry.finished_at, + submission_table_entry.metadata, + 
submission_table_entry.unaligned_nucleotide_sequences, + submission_table_entry.external_metadata, + ), + ) + con.commit() + return True + except Exception as e: + con.rollback() + print(f"add_to_submission_table errored with: {e}") + return False + finally: + db_conn_pool.putconn(con) diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py new file mode 100644 index 000000000..d70dda7d5 --- /dev/null +++ b/ena-submission/scripts/test_ena_submission.py @@ -0,0 +1,271 @@ +import csv +import gzip +import json +import unittest +from pathlib import Path +from unittest import mock + +import xmltodict +import yaml +from create_assembly import ( + create_chromosome_list_object, + create_manifest_object, +) +from create_project import construct_project_set_object +from create_sample import construct_sample_set_object +from ena_submission_helper import ( + ENAConfig, + create_chromosome_list, + create_ena_project, + create_ena_sample, + create_fasta, + create_manifest, + dataclass_to_xml, +) +from ena_types import default_project_type, default_sample_type +from requests import exceptions + +# Default configs +with open("config/defaults.yaml", encoding="utf-8") as f: + defaults = yaml.safe_load(f) + +# Setup a mock configuration +test_ena_config = ENAConfig( + ena_submission_url="https://test.url", + ena_reports_service_url="https://test.url", + ena_submission_password="test_password", # noqa: S106 + ena_submission_username="test_user", +) + + +def mock_config(): + config = mock.Mock() + config.db_name = "Loculus" + config.unique_project_suffix = "Test suffix" + metadata_dict = {"taxon_id": "Test taxon", "scientific_name": "Test scientific name"} + config.organisms = {"Test organism": {"ingest": metadata_dict}} + config.metadata_mapping = defaults["metadata_mapping"] + config.metadata_mapping_mandatory_field_defaults = defaults[ + "metadata_mapping_mandatory_field_defaults" + ] + config.ena_checklist = "ERC000033" + return 
config + + +# Example XMLs +test_project_xml_response = Path("test/test_project_response.xml").read_text(encoding="utf-8") +text_project_xml_request = Path("test/text_project_request.xml").read_text(encoding="utf-8") +test_project_xml_failure_response = """ + + +""" + +test_sample_xml_request = Path("test/test_sample_request.xml").read_text(encoding="utf-8") +test_sample_xml_response = Path("test/test_sample_response.xml").read_text(encoding="utf-8") + + +# Test sample +loculus_sample: dict = json.load( + open("test/approved_ena_submission_list_test.json", encoding="utf-8") +) +sample_data_in_submission_table = { + "accession": "test_accession", + "version": "test_version", + "group_id": 1, + "organism": "Test organism", + "metadata": loculus_sample["LOC_0001TLY.1"]["metadata"], + "unaligned_nucleotide_sequences": { + "seg1": None, + "seg2": "GCGGCACGTCAGTACGTAAGTGTATCTCAAAGAAATACTTAACTTTGAGAGAGTGAATT", + "seg3": "CTTAACTTTGAGAGAGTGAATT", + }, + "center_name": "Fake center name", +} +project_table_entry = {"group_id": "1", "organism": "Test organism"} +sample_table_entry = { + "accession": "test_accession", + "version": "test_version", +} + + +# Mock requests +def mock_requests_post(status_code, text): + mock_response = mock.Mock() + mock_response.status_code = status_code + mock_response.text = text + return mock_response + + +class ProjectCreationTests(unittest.TestCase): + @mock.patch("requests.post") + def test_create_project_success(self, mock_post) -> None: + # Testing successful project creation + mock_post.return_value = mock_requests_post(200, test_project_xml_response) + project_set = default_project_type() + response = create_ena_project(test_ena_config, project_set) + desired_response = { + "bioproject_accession": "PRJEB20767", + "ena_submission_accession": "ERA912529", + } + self.assertEqual(response.results, desired_response) + + @mock.patch("requests.post") + def test_create_project_xml_failure(self, mock_post): + # Testing project creation failure 
due to incorrect status + mock_post.return_value = mock_requests_post(200, test_project_xml_failure_response) + project_set = default_project_type() + response = create_ena_project(test_ena_config, project_set) + error_message_part = "Response is in unexpected format" + self.assertIn(error_message_part, response.errors[0]) + + @mock.patch("requests.post") + def test_create_project_server_failure(self, mock_post): + # Testing project creation failure + mock_post.return_value = mock_requests_post(500, "Internal Server Error") + mock_post.return_value.raise_for_status.side_effect = exceptions.RequestException() + project_set = default_project_type() + response = create_ena_project(test_ena_config, project_set) + error_message_part = "Request failed with status:500" + self.assertIn(error_message_part, response.errors[0]) + error_message_part = "Response: Internal Server Error" + self.assertIn(error_message_part, response.errors[0]) + + def test_construct_project_set_object(self): + config = mock_config() + group_info = {"institution": "Test institution"} + project_set = construct_project_set_object(group_info, config, project_table_entry) + self.assertEqual( + xmltodict.parse(dataclass_to_xml(project_set, root_name="PROJECT_SET")), + xmltodict.parse(text_project_xml_request), + ) + + +class SampleCreationTests(unittest.TestCase): + @mock.patch("requests.post") + def test_create_sample_success(self, mock_post): + mock_post.return_value = mock_requests_post(200, test_sample_xml_response) + sample_set = default_sample_type() + response = create_ena_sample(test_ena_config, sample_set) + desired_response = { + "ena_sample_accession": "ERS1833148", + "biosample_accession": "SAMEA104174130", + "ena_submission_accession": "ERA979927", + } + self.assertEqual(response.results, desired_response) + + def test_sample_set_construction(self): + config = mock_config() + sample_set = construct_sample_set_object( + config, + sample_data_in_submission_table, + sample_table_entry, + ) + 
self.assertEqual( + xmltodict.parse(dataclass_to_xml(sample_set, root_name="SAMPLE_SET")), + xmltodict.parse(test_sample_xml_request), + ) + + +class AssemblyCreationTests(unittest.TestCase): + def setUp(self): + self.unaligned_sequences_multi = sample_data_in_submission_table[ + "unaligned_nucleotide_sequences" + ] + self.unaligned_sequences = { + "main": "CTTAACTTTGAGAGAGTGAATT", + } + self.seq_key = {"accession": "test_accession", "version": "test_version"} + + def test_create_chromosome_list_multi_segment(self): + chromosome_list = create_chromosome_list_object( + self.unaligned_sequences_multi, self.seq_key + ) + file_name_chromosome_list = create_chromosome_list(chromosome_list) + + with gzip.GzipFile(file_name_chromosome_list, "rb") as gz: + content = gz.read() + + self.assertEqual( + content, + b"test_accession.test_version_seg2\tseg2\tlinear-segmented\ntest_accession.test_version_seg3\tseg3\tlinear-segmented\n", + ) + + def test_create_chromosome_list(self): + chromosome_list = create_chromosome_list_object(self.unaligned_sequences, self.seq_key) + file_name_chromosome_list = create_chromosome_list(chromosome_list) + + with gzip.GzipFile(file_name_chromosome_list, "rb") as gz: + content = gz.read() + + self.assertEqual( + content, + b"test_accession.test_version\tmain\tlinear-segmented\n", + ) + + def test_create_fasta_multi(self): + chromosome_list = create_chromosome_list_object( + self.unaligned_sequences_multi, self.seq_key + ) + fasta_file_name = create_fasta(self.unaligned_sequences_multi, chromosome_list) + + with gzip.GzipFile(fasta_file_name, "rb") as gz: + content = gz.read() + self.assertEqual( + content, + b">test_accession.test_version_seg2\nGCGGCACGTCAGTACGTAAGTGTATCTCAAAGAAATACTTAACTTTGAGAGAGTGAATT\n>test_accession.test_version_seg3\nCTTAACTTTGAGAGAGTGAATT\n", + ) + + def test_create_fasta(self): + chromosome_list = create_chromosome_list_object(self.unaligned_sequences, self.seq_key) + fasta_file_name = 
create_fasta(self.unaligned_sequences, chromosome_list) + + with gzip.GzipFile(fasta_file_name, "rb") as gz: + content = gz.read() + self.assertEqual( + content, + b">test_accession.test_version\nCTTAACTTTGAGAGAGTGAATT\n", + ) + + def test_create_manifest(self): + config = mock_config() + group_key = {"group_id": 1, "organism": "Test organism"} + study_accession = "Test Study Accession" + sample_accession = "Test Sample Accession" + results_in_sample_table = {"result": {"ena_sample_accession": sample_accession}} + results_in_project_table = {"result": {"bioproject_accession": study_accession}} + manifest = create_manifest_object( + config, + results_in_sample_table, + results_in_project_table, + sample_data_in_submission_table, + self.seq_key, + group_key, + ) + manifest_file_name = create_manifest(manifest) + data = {} + with open(manifest_file_name, encoding="utf-8") as gz: + reader = csv.reader(gz, delimiter="\t") + for row in reader: + if len(row) >= 2: # Ensure the row has at least two elements + key = row[0] + value = row[1] + data[key] = value + # Temp file names are different + data.pop("CHROMOSOME_LIST") + data.pop("FASTA") + expected_data = { + "STUDY": study_accession, + "SAMPLE": sample_accession, + "ASSEMBLYNAME": "test_accession", + "ASSEMBLY_TYPE": "isolate", + "COVERAGE": "1", + "PROGRAM": "Unknown", + "PLATFORM": "Illumina", + "DESCRIPTION": "Original sequence submitted to Loculus with accession: test_accession, version: test_version", + } + + self.assertEqual(data, expected_data) + + +if __name__ == "__main__": + unittest.main() diff --git a/ena-submission/scripts/trigger_submission_to_ena.py b/ena-submission/scripts/trigger_submission_to_ena.py index 4a53811cf..c9bc568ba 100644 --- a/ena-submission/scripts/trigger_submission_to_ena.py +++ b/ena-submission/scripts/trigger_submission_to_ena.py @@ -4,18 +4,18 @@ import base64 import json import logging -import os import time from dataclasses import dataclass +from typing import Any import click 
import requests import yaml -from requests.auth import HTTPBasicAuth +from psycopg2.pool import SimpleConnectionPool from submission_db_helper import ( SubmissionTableEntry, add_to_submission_table, - get_db_config, + db_init, in_submission_table, ) @@ -35,15 +35,18 @@ class Config: db_username: str db_password: str db_host: str - github_username: str - github_pat: str github_url: str -def upload_sequences(db_config, sequences_to_upload): +def upload_sequences(db_config: SimpleConnectionPool, sequences_to_upload: dict[str, Any]): for full_accession, data in sequences_to_upload.items(): accession, version = full_accession.split(".") - if in_submission_table(accession, version, db_config): + if in_submission_table(db_config, {"accession": accession, "version": version}): + continue + if in_submission_table(db_config, {"accession": accession}): + # TODO: Correctly handle revisions + msg = f"Trying to submit revision for {accession}, this is not currently enabled" + logger.error(msg) continue entry = { "accession": accession, @@ -78,44 +81,35 @@ def trigger_submission_to_ena(log_level, config_file, input_file=None): logger.setLevel(log_level) logging.getLogger("requests").setLevel(logging.INFO) - with open(config_file) as file: + with open(config_file, encoding="utf-8") as file: full_config = yaml.safe_load(file) relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__} config = Config(**relevant_config) logger.info(f"Config: {config}") - db_config = get_db_config(config.db_password, config.db_username, config.db_host) + db_config = db_init(config.db_password, config.db_username, config.db_host) if input_file: # Get sequences to upload from a file - sequences_to_upload: dict = json.load(open(input_file, encoding="utf-8")) - upload_sequences(db_config, sequences_to_upload) - return + with open(input_file, encoding="utf-8") as json_file: + sequences_to_upload: dict[str, Any] = json.load(json_file) + upload_sequences(db_config, 
sequences_to_upload) + return while True: # In a loop get approved sequences uploaded to Github and upload to submission_table - github_username = os.getenv("GITHUB_USERNAME") - if not github_username: - github_username = config.github_username - - github_pat = os.getenv("GITHUB_PAT") - if not github_pat: - github_pat = config.github_pat - response = requests.get( config.github_url, - auth=HTTPBasicAuth(github_username, github_pat), timeout=10, ) - if response.status_code == 200: - file_info = response.json() - sequences_to_upload = json.loads(base64.b64decode(file_info["content"]).decode("utf-8")) + if response.ok: + sequences_to_upload = response.json() else: error_msg = f"Failed to retrieve file: {response.status_code}" - raise Exception(error_msg) + logger.error(error_msg) upload_sequences(db_config, sequences_to_upload) - time.sleep(30) # Sleep for 30seconds to not overwhelm github + time.sleep(60) # Sleep for 1min to not overwhelm github if __name__ == "__main__": diff --git a/ena-submission/scripts/upload_external_metadata_to_loculus.py b/ena-submission/scripts/upload_external_metadata_to_loculus.py new file mode 100644 index 000000000..ed10cc8b4 --- /dev/null +++ b/ena-submission/scripts/upload_external_metadata_to_loculus.py @@ -0,0 +1,225 @@ +# This script collects the results of the ENA submission and uploads the results to Loculus + +import logging +import time +from dataclasses import dataclass +from datetime import datetime +from typing import Any + +import click +import pytz +import yaml +from call_loculus import submit_external_metadata +from notifications import SlackConfig, send_slack_notification, slack_conn_init +from psycopg2.pool import SimpleConnectionPool +from submission_db_helper import ( + StatusAll, + db_init, + find_conditions_in_db, + find_stuck_in_submission_db, + update_db_where_conditions, +) + +logger = logging.getLogger(__name__) +logging.basicConfig( + encoding="utf-8", + level=logging.INFO, + format="%(asctime)s %(levelname)8s 
(%(filename)20s:%(lineno)4d) - %(message)s ", + datefmt="%H:%M:%S", +) + + +@dataclass +class Config: + organisms: list[dict[str, str]] + organism: str + backend_url: str + keycloak_token_url: str + keycloak_client_id: str + username: str + password: str + ena_specific_metadata: list[str] + db_username: str + db_password: str + db_host: str + slack_hook: str + slack_token: str + slack_channel_id: str + + +def get_external_metadata(db_config: SimpleConnectionPool, entry: dict[str, Any]) -> dict[str, Any]: + accession = entry["accession"] + data = { + "accession": accession, + "version": entry["version"], + "externalMetadata": {}, + } + group_key = {"group_id": entry["group_id"], "organism": entry["organism"]} + seq_key = {"accession": accession, "version": entry["version"]} + + # Get corresponding entry in the project table for (group_id, organism) + corresponding_project = find_conditions_in_db( + db_config, table_name="project_table", conditions=group_key + ) + if len(corresponding_project) == 1: + data["externalMetadata"]["bioprojectAccession"] = corresponding_project[0]["result"][ + "bioproject_accession" + ] + else: + raise Exception + # Check corresponding entry in the sample table for (accession, version) + corresponding_sample = find_conditions_in_db( + db_config, table_name="sample_table", conditions=seq_key + ) + if len(corresponding_sample) == 1: + data["externalMetadata"]["biosampleAccession"] = corresponding_sample[0]["result"][ + "biosample_accession" + ] + else: + raise Exception + # Check corresponding entry in the assembly table for (accession, version) + corresponding_assembly = find_conditions_in_db( + db_config, table_name="assembly_table", conditions=seq_key + ) + if len(corresponding_assembly) == 1: + data["externalMetadata"]["gcaAccession"] = corresponding_assembly[0]["result"][ + "gca_accession" + ] + else: + raise Exception + return data + + +def get_external_metadata_and_send_to_loculus( + db_config: SimpleConnectionPool, config: Config, 
retry_number=3 +): + # Get external metadata + conditions = {"status_all": StatusAll.SUBMITTED_ALL} + submitted_all = find_conditions_in_db( + db_config, table_name="submission_table", conditions=conditions + ) + for entry in submitted_all: + accession = entry["accession"] + data = get_external_metadata(db_config, entry) + seq_key = {"accession": accession, "version": entry["version"]} + + try: + submit_external_metadata( + data, + config, + entry["organism"], + ) + update_values = { + "status_all": StatusAll.SENT_TO_LOCULUS, + "finished_at": datetime.now(tz=pytz.utc), + } + number_rows_updated = 0 + tries = 0 + while number_rows_updated != 1 and tries < retry_number: + if tries > 0: + logger.warning( + f"External Metadata Update succeeded but db update failed - reentry DB update #{tries}." + ) + number_rows_updated = update_db_where_conditions( + db_config, + table_name="submission_table", + conditions=seq_key, + update_values=update_values, + ) + tries += 1 + if number_rows_updated == 1: + logger.info(f"External metadata update for {entry["accession"]} succeeded!") + except: + logger.error(f"ExternalMetadata update failed for {accession}") + update_values = { + "status_all": StatusAll.HAS_ERRORS_EXT_METADATA_UPLOAD, + "started_at": datetime.now(tz=pytz.utc), + } + number_rows_updated = 0 + tries = 0 + while number_rows_updated != 1 and tries < retry_number: + if tries > 0: + # If state not correctly added retry + logger.warning( + f"External metadata update creation failed and DB update failed - reentry DB update #{tries}." + ) + number_rows_updated = update_db_where_conditions( + db_config, + table_name="submission_table", + conditions=seq_key, + update_values=update_values, + ) + tries += 1 + continue + + +def upload_handle_errors( + db_config: SimpleConnectionPool, + config: Config, + slack_config: SlackConfig, + time_threshold: int = 15, + slack_time_threshold: int = 12, +): + """ + - time_threshold: (minutes) + - slack_time_threshold: (hours) + + 1. 
Find all entries in submission_table in state HAS_ERRORS_EXT_METADATA_UPLOAD over time_threshold + 2. If time since last slack_notification is over slack_time_threshold send notification + """ + entries_with_errors = find_stuck_in_submission_db( + db_config, + time_threshold=time_threshold, + ) + if len(entries_with_errors) > 0: + error_msg = ( + f"{config.backend_url}: ENA Submission pipeline found {len(entries_with_errors)} entries" + f" in submission_table in status HAS_ERRORS_EXT_METADATA_UPLOAD for over {time_threshold}m" + ) + send_slack_notification( + error_msg, + slack_config, + time=datetime.now(tz=pytz.utc), + time_threshold=slack_time_threshold, + ) + + +@click.command() +@click.option( + "--log-level", + default="INFO", + type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), +) +@click.option( + "--config-file", + required=True, + type=click.Path(exists=True), +) +def upload_external_metadata(log_level, config_file): + logger.setLevel(log_level) + logging.getLogger("requests").setLevel(logging.INFO) + + with open(config_file, encoding="utf-8") as file: + full_config = yaml.safe_load(file) + relevant_config = {key: full_config.get(key, []) for key in Config.__annotations__} + config = Config(**relevant_config) + logger.info(f"Config: {config}") + db_config = db_init(config.db_password, config.db_username, config.db_host) + slack_config = slack_conn_init( + slack_hook_default=config.slack_hook, + slack_token_default=config.slack_token, + slack_channel_id_default=config.slack_channel_id, + ) + + while True: + get_external_metadata_and_send_to_loculus(db_config, config) + upload_handle_errors( + db_config, + config, + slack_config, + ) + time.sleep(2) + + +if __name__ == "__main__": + upload_external_metadata() diff --git a/ena-submission/test/ena_submission_list.json b/ena-submission/test/approved_ena_submission_list_test.json similarity index 99% rename from ena-submission/test/ena_submission_list.json rename to 
ena-submission/test/approved_ena_submission_list_test.json index 00a88faa4..4139efce8 100644 --- a/ena-submission/test/ena_submission_list.json +++ b/ena-submission/test/approved_ena_submission_list_test.json @@ -38,6 +38,8 @@ "anatomicalPart": null, "geoLocAdmin1": null, "geoLocAdmin2": null, + "geoLocLatitude": null, + "geoLocLongitude": null, "geoLocCountry": "Pakistan", "insdcVersion_L": null, "insdcVersion_M": null, @@ -52,7 +54,7 @@ "collectionDevice": null, "collectionMethod": null, "depthOfCoverage": null, - "hostHealthState": null, + "hostHealthState": "Hospital care required", "ncbiReleaseDate": null, "ncbiVirusTaxId": null, "sraRunAccession": null, @@ -64,7 +66,7 @@ "hostHealthOutcome": null, "hostOriginCountry": null, "purposeOfSampling": null, - "sequencingProtocol": null, + "sequencingProtocol": "Illumina", "specimenProcessing": null, "hostNameScientific": "Homo sapiens", "presamplingActivity": null, @@ -180,6 +182,7 @@ "NP": [], "GPC": [], "RdRp": [] - } + }, + "organism": "cchf" } } \ No newline at end of file diff --git a/ena-submission/test/test_project_response.xml b/ena-submission/test/test_project_response.xml new file mode 100644 index 000000000..8ae17013a --- /dev/null +++ b/ena-submission/test/test_project_response.xml @@ -0,0 +1,8 @@ + + + + + This submission is a TEST submission and will be discarded within 24 hours + +ADD + \ No newline at end of file diff --git a/ena-submission/test/test_sample_request.xml b/ena-submission/test/test_sample_request.xml new file mode 100644 index 000000000..7c1c63456 --- /dev/null +++ b/ena-submission/test/test_sample_request.xml @@ -0,0 +1,72 @@ + + + Test scientific name: Genome sequencing + + Test taxon + Test scientific name + + Automated upload of Test scientific name sequences submitted by Fake center name from Loculus + + + + Loculus + test_accession + + + + + + hospitalisation + true + + + collection date + 2023-08-26 + + + geographic location (country and/or sea) + Pakistan + + + host health state 
+ Hospital care required + + + host scientific name + Homo sapiens + + + isolate + CCHF/NIHPAK-19/2023 + + + collecting institution + National Institute of Health, Department of Virology + + + authors + M. Ammar, M. Salman, M. Umair, Q. Ali, R. Hakim, S.A. Haider, Z. Jamal + + + collector name + not provided + + + host sex + not provided + + + host subject id + not provided + + + host common name + not provided + + + ENA-CHECKLIST + ERC000033 + + + + \ No newline at end of file diff --git a/ena-submission/test/test_sample_response.xml b/ena-submission/test/test_sample_response.xml new file mode 100644 index 000000000..319770095 --- /dev/null +++ b/ena-submission/test/test_sample_response.xml @@ -0,0 +1,10 @@ + + + + + + + This submission is a TEST submission and will be discarded within 24 hours + + ADD + \ No newline at end of file diff --git a/ena-submission/test/text_project_request.xml b/ena-submission/test/text_project_request.xml new file mode 100644 index 000000000..33d278317 --- /dev/null +++ b/ena-submission/test/text_project_request.xml @@ -0,0 +1,22 @@ + + + Test scientific name + Test scientific name: Genome sequencing + Automated upload of Test scientific name sequences submitted by Test institution from Loculus + + + + Test taxon + Test scientific name + + + + + + Loculus + 1 + + + + + \ No newline at end of file diff --git a/kubernetes/loculus/templates/ena-submission-deployment.yaml b/kubernetes/loculus/templates/ena-submission-deployment.yaml index 17fc959c8..6ff856a90 100644 --- a/kubernetes/loculus/templates/ena-submission-deployment.yaml +++ b/kubernetes/loculus/templates/ena-submission-deployment.yaml @@ -89,19 +89,23 @@ spec: secretKeyRef: name: slack-notifications key: slack-channel-id - - name: GITHUB_USERNAME + - name: ENA_USERNAME valueFrom: secretKeyRef: - name: github-approval-repo - key: github-username - - name: GITHUB_PAT + name: ena-submission + key: username + - name: ENA_PASSWORD valueFrom: secretKeyRef: - name: 
github-approval-repo - key: github-pat + name: ena-submission + key: password args: - snakemake - results/triggered + - results/project_created + - results/sample_created + - results/assembly_created + - results/uploaded_external_metadata volumeMounts: - name: loculus-ena-submission-config-volume mountPath: /package/config/config.yaml diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index b266132fe..b0ca4bd5e 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -7,7 +7,7 @@ disableWebsite: false disableBackend: false disablePreprocessing: false disableIngest: false -disableEnaSubmission: true +disableEnaSubmission: false siloImportLimitSeconds: 3600 ingestLimitSeconds: 1800 getSubmissionListLimitSeconds: 600 @@ -497,6 +497,14 @@ defaultOrganismConfig: &defaultOrganismConfig header: "INSDC" ingest: bioprojects noInput: true + - name: gcaAccession + displayName: GCA accession + customDisplay: + type: link + url: "https://www.ncbi.nlm.nih.gov/datasets/genome/__value__" + header: "INSDC" + noInput: true + oneHeader: true - name: biosampleAccession customDisplay: type: link @@ -1091,6 +1099,7 @@ defaultOrganismConfig: &defaultOrganismConfig image: ghcr.io/loculus-project/ingest configFile: &ingestConfigFile taxon_id: 186538 + scientific_name: "Zaire ebolavirus" referenceGenomes: nucleotideSequences: - name: "main" @@ -1174,6 +1183,7 @@ defaultOrganisms: <<: *ingest configFile: taxon_id: 3048448 + scientific_name: "West Nile virus" referenceGenomes: nucleotideSequences: - name: main @@ -1401,6 +1411,7 @@ defaultOrganisms: configFile: <<: *ingestConfigFile taxon_id: 3052518 + scientific_name: "Orthonairovirus haemorrhagiae" nucleotide_sequences: - L - M @@ -1495,11 +1506,11 @@ secrets: slack-hook: "dummy" slack-token: "dummy" slack-channel-id: "dummy" - github-approval-repo: + ena-submission: type: raw data: - github-username: "dummy" - github-pat: "dummy" + username: "dummy" + password: "dummy" 
enableCrossRefCredentials: true runDevelopmentKeycloakDatabase: true runDevelopmentMainDatabase: true diff --git a/kubernetes/loculus/values_preview_server.yaml b/kubernetes/loculus/values_preview_server.yaml index f976bdede..cd61d77fd 100644 --- a/kubernetes/loculus/values_preview_server.yaml +++ b/kubernetes/loculus/values_preview_server.yaml @@ -22,11 +22,12 @@ secrets: slack-hook: AgCLEhTwqKL278AbNwpqdRqeg6naNrQJWx3q8Zp+ecXjMaaLLBi1C3uQlt0WKioy+pUAhfe7MowXKLM55hLyh/InZ9o3yLi9T/5cVRXcEXCvODWmbhr94XhcYI3KnVngZLcNl9Gr4LR+bz8A0sl/rCijNYrqYeDCLI6XUmB8mlKnHPqrF6CXC8Y5xyDbNYJONx6DAugq+gQcZYJ101vUOtu2LTD8awCsdF5FOzdcZ344Vxn/xwDlbbvUEKEQp5A5aMfx95zpa+rV/sQYHeCb7Dy1oWqpOrZP/rPJ4K9VGRx5QA9o1Qi0Pl3alRUqiPUR6pbMxbX8u0kCN6drFKxXDAMd+SadsppDGbNQNeQP5cphNJwYxL/0MIgXxJTrQpcynJK1FULX9W+1GtXg+tX4hRCtZL5hnCxPw12QcNOL2N8SJLGEe8gK8QtALpu/DH/trVJ3rMDRkZhhWCvtb9Zt9EuvUhxs07sE9DZ7rEAqzx51v4vr9CzmxkHEiAhrC3Se3CxnSspBP1/X9SvZ+GXn+ZuXzN+KivWCnim0RwhRD75Y7ZP8ct/iu3ilb6b7Pl+KOgOkA3In7c4yVAZwXMTuF6aP2/8inPx5Kk6p8ks7c5XeSIDFOH7C6EJuD7E69Fz6ijaF5bJN8NWBVxkE88xq4un5e7dcuqjIqaQ8kLDX1g8aXiklr6qD29q9H/m+gtd6lxcMb53bMl0EI+GHYTBZn3w+T1PxlY1uoBfNzt1efjXJD7AWlTDxze+5PIYgiFVAOdfv3ey5HJMMw1w5MTLMW44hkpt4MCaHvREBTXq5sxBJe10= slack-token: 
AgDFAP+F7ze+TY+yK71JPOSkKwIpnBXh7WWweyZZwYm1/CwmKS1iQ4O1p54sTrHrMC737Si27MfTVvrEZRj9aAwq0fQJZt5yldpfMaTccZJHj3rQ5kczyuCMYVcFzmyywAr4DXsUCscjrOLgwTiiK/d8jSFkKyXupC7bB2EcZZUFGpFAj7PiSeJuEIKQptkSjeaasCQXkdkuoKczM38GCu502pIxaJ9kIXVrereyKUpsU/uFDgj0IcKqfiT0M2FGs8Ujl3CXpMxcOLSuxVyCnje27GHpsYrd/uEKX3yl1rB1rV6Z+gMzlO9DDPW/XJl6TY5snOxdaCv7uNzAGwgb/rlaZ5fnrNqsOoucJvh35yxMcKDsx/hY72H7PRnzNpLeqZ/2zAub+fQP/o1edjxaYHaSltS0lwzCivIPOHv66/dDOD9v0LncWkCWGXXOp8Fsz9OrF+NcAZjIY/hDzwy+JRDA7Wtn7jlkA07WFpobkyyKfN+bNT1664wS2IMDRYA6+MbkA99v+ScVsEVlxJqn+PiaDtexQAfQcyN2NPbQe+9xMIQavvrcnwxdMwAP8DBME3vhdrD8yDRJ9GN+ygtZ3dB4FC4iW20ETyzlAqJ/H9M2/ed1O3VyraFDCV3PmSBdq7Rinj1Zg6D+IEp54HtwWiu5s7iNeKW37cSSloRUaojWQ1BFPB6msfP/O5yqREdGrWVhmChWvSDMw2LxmnZbCw3mVdMr9B1XeK76GHa4kOhVOcEqzl0X61NILYDecgLP6HVJZhB+NpHJfOY= slack-channel-id: AgDDxu0CGC/AFqOHMeHjV3KUGPoY0QAmGoqtiuxPPaP+GWOuz4xZbBP6Fymh0XbHFHMB9PtWowrMbvFOkfTncRiKqyK6HIU7GU2GtCla2WTZQNRAW82gcJnxtbtm4KymN3LyTj27qBHkQHNZ90qyBGdsJUYwmBW8XB6wNoz10KDyQSvYrYwQEe0onCgnislxslATPB6CQFWHMghKYoaHHAECpf6sN1kS9rvNq/1e7gG8s++lgF7qSZgjQP77Q6kMoiMS5krX03pPKZXsc69mI8GLIvhalsUfg2BO7swa3FgCbjecp32lW6KuRCfzeMmj2NWpWTLcJSPYPJN0sOkhRFOWUlrylztG82l7dGl5JofJWQVopF+qLTAR6LxHaujFHQ+Y2x4/5tBmurwOT6xXknQjXqYs2qbG1OriivJrjRwhRoZWE2vR5YlE+Zz8S9/vYw0JnKibnB1YvbdBBnpllyXYjTJa4818W05DvJ70qLeILMYcEkY4/jv5xqNdGuwp34gZLcW6+qztHfQVXRf0uXM1B7BPNH0aNMBNN7D0m1vWkTNgKC+V2PiEH9nTVhwSF+MlG/rmR2+v84kWhMP3qdX8/28GnBnvS9rryzuc3e2mHBIiIGHwW+SQjlmdq0jDTtuvFtU8I7ncB5PUe+sYZ0zFrn57blraBG5ntqtZfb+aS3modE+ElmCgBzi8gSQoVxXzmOIfMZRRwAaH0w== - github-approval-repo: + ena-submission: type: sealedsecret clusterWide: "true" encryptedData: - github-username: 
AgBtyXeimJmI7e1di9ubkzNODtAAlm0LiXEJJGfzzR/KExSrVMYsWcIIQOCfS/bbLvvvSyddKPGeFYhozG6CK943oTJ2rZHeMBjqtXX0AEw655KfJNgHWoWWk1xdmOzITtzKNlHi4cPcxswQte84NhJqRZPJ1sNhQfOm/AwL0NplQXD6xyl7UGAUKOW8rSLgH+gTUsTzmz2prgwiF3SmwwHCSClL/q9H4/nqkMycMQ5NcQK+5cpUUeWidzL3LjANCJS815S8oMdUWbDFwB3NPPfUhYdRuXM19MAHxq2hhAkn6rKGNI8tQhahi8dfmH2QDMyS1KVRof6taBXJIVMxeejL8nOJbNhPBdtyM+3hVasm/frLycUJxzvQpPPv1ictprM5K/r2sJDIqyXVOZSspyXQ09gFz2D13QsOSfaERbKUMtn9L0bfsG3N/zwo614N8YNCXf+dvIjVCoFwMD5RQ5IkqHLPEQiJ77++feSO+4fbfItjMI8qP4mM6YRI7VqweVKufwdSSropRpxhytnjqoWfvHFyDgzsw4ZuJwlyZs01yuJelog4yqG8WszWCv6Ae3fRM3gaquCj2UlFfxTBxN+aZId/lV3LCG1l06g82J6+YTL5wdCnYuYcpZmp21651J9jWwWzCumRWaQgbY6Ou0VjHKZ9if1uDdYA3S7SJJx/ZDZK4cixoS9VXgsPWFgmHbZNHqOFzOd2Dl9kRA== - github-pat: AgBa5wjYBw3GWXihWdoedx3oEbR5z808WoeIGyk5aAZPQ1MTMbWm4ZMdvJUZVPlkRacQXSt+A2pniYMjzl0bqHkc5guVQOoHu6jZgtyfaOyqT4Ergz1+L9VUzyn6E1NwqV2BtdKQTN7A1VvtLw0BLVG0E01SUcHeoU9MWlF+9BWFtXWWYxWL/sTLKoz0tnj8yu6LLHMMYeCnABAoZwZbJ3kf1UVp3FwcJbuSqsrUTPaeS6z5g90jL/zwJ150VUh9kInHs+kwV1imZp06ZUm3gpdY1f/EQZnxHTPUoaYLOo91cEhG7bSD9KQz1B6m07VzjIlnrwduPUCFQs1gk1hU7gO+6MYlMqCIb8NUQbuX5d5cHiENJm9DXAJdd8oNW02JQ371gulVYvXagAdrYgYv75Mr3YjejHZeinZrJ/ZxeZI0+fL4SLkh+77RyuN1R6ab1vGBoUG+LgHsUEBT7mw/wXtrzRmNBwxNBXSfTrrKM+EzM9KUsrSe490sK3Rf4TExjynL/9biS9ZFw601tdBh8luO3xFhCr8Bn4q7sWnKNmvSu+gMZklyhR65Hf0LHTxUdDcK8WZKFyds6foHTAv9kX4+NK7upVfAdjx9Tpf0ONad3nSboi/+1vYoGtAZcL6BZoNL+1Xu2WAlVa+WollJSV2k4KeTEswE1vmPAhoapyxpry9Mz+8CGkohvSPjHGdfXSNSW7K9zKCCubqzHV2+EsnaDWJ9+cwghAQFOpKdbJuHLwalMpAkTztRjr5wHfD6KSo1apCHZl84S0B2u8floEzU1XRiW8zq10m8cb3/xmdteNL3FNMlZlEieD2nsiU= + username: 
AgB/I94iUkgkAX3Tqs3znnIvVqeLuhcznJlMEa1iTMiwfitx1Dw4O8NBehTnnVD1XQ+jfj2x2fGeINpjz61GbCMEFz8rhQ9gGRr4eFiWLWSsThUX+cTrRxPdGDxCZkFhr959agKMd2ks91eZe+99RoSq/39WthVvpFaITNr6PLIVH1EBycCkC0i6mIS0PIU9rZwigfw1Q/qnOgPOi9usv8KU31zwBY/f68FzplrCMNQ8rrO9mmfQs1q8Ac3e6Vj6GVaWeFSc1vc/j+n0EbQJKlqZnX6VrESY45u1lHbqQpKyUiejQ+T36jJIA2573YMJFi7GXfxpvWuLxNmgCOilVqdrNcpINs0ajokn2AHPNdTzZiDOtrvt2uBUinsUGrtO5olBz6wfWwzx0jMtdH2wjm6dwYlMONwOh9REyISk40XAL0VgpTQlZRFCs+NdhLRyLW5gKtOatmuAg4yx6d69PHSBwjsWjHSzf7vrcr/MBzpQSzEGEsqRB0NMXaTGj79VGQWG7a2ToQuh0TIUgnvE6Ffb9uj0bcEXTrLZgtJsbwLuKfCJQy6JXLmpWYHngAAQRVuDhW1pr9xhSgN9rDUrk5LGvDvFOxvo+9BmUqYApEGyO3jM3cOMiDLqh3x34nqwiZ+1p8W603gPMyUIR22WevoCIiYmiwFXfnsBmvVdw880xVtTc8XNiQo5gzo0ycW1RCkoYs0u8LtWNCF6Dg== + password: AgA8KuXDI0SykCoyS1veoKaMXjA/Vv3LHGBFaRyks2KJvRJjMm8QapeMzxCxR0ln2poqnC0BZ97tqvj5ZmE3GsMjdw0vYLl15uQXcu8AdnmKhud+7c0ks6IHVo14jBPtR7/wga/l/3Ot54P7xiwDLdrQkRDsX2X7qw5q6QXM81poDcMlAE6QQArCusnm+64sPzM2FU0QGZ1awZeXvjB20dR0/x1HqiVrrxLAms+XN3fARrviXVQz6jlAGMs/L6z70BngcnNj3d4eYxh5ZIG4l+L4EyhuwW67jn9l/Zy0m0bhQZMImwmuGIsAyQGwnrGRz4UG/W0TauW3hFF7GF59qYGNrjv97iwk/TZ32dKID9wUZBaiVGSmqzXlFLmdNzEH6dQ2is/juT+75ej/nMqgMkdT9tn0yPol1jZCx/xkdfsHafKOkTxPk/WF9IlY/c+9BiajvgQwCTZILYxI5hJft96xd+RMiwQtwyccBwXI5YccVq2cDZLZ9fTkreot3YRDX4R9PgK+sBhHYb3tYmnxJ4FU72Yyr1ezcP30ArXfhwcn2UBQ4RfgUEFYJajSEgB0emuc3pr1F7jKMopwgX06zZOlrhAWOSSn4s2Z8JKG2o1DELJslBhT6/i+DAKxFWe2RiIf0IZqFOlPmU7ztfmDWBrK+RZE5fbUViPb3YQDgyITYVKFcuBeUl3vHi0DEaZ4Eal1R7lfQdY/8qDnXE3PZ2U= +reduceResourceRequest: true previewDocs: true robotsNoindexHeader: true