feat(prepro): make nextclade pipeline configurable through `values.yaml` (#1347)

* feat(prepro): Make Nextclade prepro pipeline configurable through values.yaml
* Configure instance with more mpox metadata
* Lift outbreak and lineage from Nextclade for mpox
* Use dpath for JSON-path-like indexing into nested objects
* Rename metadata input fields (and some output fields)
* fix: upload "not provided" rather than `null` for required fields so that submission does not fail. Otherwise the backend returns this error: `Response: {"type":"about:blank","title":"Unprocessable Entity","status":422,"detail":"Field 'country' is null, but a value is required.","instance":"/mpox/submit-processed-data"}`
* Handle submission errors: log the full traceback but don't crash
loculus-project · Mar 14, 2024 · 3e12387 · 3e12387
1 parent 33e451f
commit 3e12387
Show file tree

Hide file tree

Showing 8 changed files with 604 additions and 123 deletions.
diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
@@ -96,11 +96,37 @@ defaultOrganisms:
           type: string
           generateIndex: true
           autocomplete: true
-          required: false
+        - name: outbreak
+          type: string
+          generateIndex: true
+          autocomplete: true
+        - name: lineage
+          type: string
+          generateIndex: true
+          autocomplete: true
+        - name: ncbi_release_date
+          type: date
+        - name: country
+          type: string
+          required: true
+          generateIndex: true
+          autocomplete: true
+        - name: isolate_name
+          type: string
+        - name: author_affiliation
+          type: string
+          generateIndex: true
+          autocomplete: true
+        - name: authors
+          type: string
       website:
         tableColumns:
           - collection_date
+          - country
           - clade
+          - lineage
+          - author_affiliation
+          - ncbi_release_date
         defaultOrderBy: collection_date
         defaultOrder: descending
       silo:
@@ -116,6 +142,46 @@ defaultOrganisms:
         genes:
           OPG001: 247
         batch_size: 100
+        processing_spec:
+          collection_date:
+            function: process_date
+            inputs:
+              date: collection_date
+              release_date: ncbi_release_date
+            required: true
+          ncbi_release_date:
+            function: parse_timestamp
+            inputs:
+              timestamp: ncbi_release_date
+          clade:
+            function: identity
+            inputs:
+              input: nextclade.clade
+          outbreak:
+            function: identity
+            inputs:
+              input: nextclade.customNodeAttributes.outbreak
+          lineage:
+            function: identity
+            inputs:
+              input: nextclade.customNodeAttributes.lineage
+          country:
+            function: identity
+            inputs:
+              input: country
+            required: true
+          author_affiliation:
+            function: identity
+            inputs:
+              input: author_affiliation
+          authors:
+            function: identity
+            inputs:
+              input: authors
+          isolate_name:
+            function: identity
+            inputs:
+              input: isolate_name
     referenceGenomes:
       nucleotideSequences:
         - name: "main"

diff --git a/preprocessing/dummy/main.py b/preprocessing/dummy/main.py
@@ -2,11 +2,11 @@
 import dataclasses
 import json
 import random
-import requests
 import time
-from typing import List
-from typing import Optional
 from dataclasses import dataclass, field
+from typing import List, Optional
+
+import requests
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--backend-host", type=str, default="http://127.0.0.1:8079",
@@ -52,8 +52,8 @@ class Sequence:
     accession: int
     version: int
     data: dict
-    errors: Optional[List[ProcessingAnnotation]] = field(default_factory=list)
-    warnings: Optional[List[ProcessingAnnotation]] = field(default_factory=list)
+    errors: Optional[List[ProcessingAnnotation]] = field(default_factory=list[ProcessingAnnotation])
+    warnings: Optional[List[ProcessingAnnotation]] = field(default_factory=list[ProcessingAnnotation])
 
 
 def fetch_unprocessed_sequences(n: int) -> List[Sequence]:

diff --git a/preprocessing/nextclade/.mypy.ini b/preprocessing/nextclade/.mypy.ini
@@ -1,4 +1,5 @@
 [mypy]
+python_version = 3.12
 
 [mypy-Bio.*]
 ignore_missing_imports = True
diff --git a/preprocessing/nextclade/environment.yml b/preprocessing/nextclade/environment.yml
@@ -5,8 +5,11 @@ channels:
 dependencies:
   - python=3.12
   - biopython=1.83
+  - dpath=2.1
   - nextclade=3.3.1
-  - PyYAML
-  - requests=2.31
+  - pip=24.0
+  - PyYAML=6.0
   - pyjwt=2.8
-  - pip
+  - python-dateutil=2.9
+  - pytz=2024.1
+  - requests=2.31
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/config.py b/preprocessing/nextclade/src/loculus_preprocessing/config.py
@@ -5,7 +5,7 @@
 import os
 from dataclasses import dataclass
 from types import UnionType
-from typing import Type, get_args
+from typing import Any, Type, get_args
 
 import yaml
 
@@ -30,6 +30,7 @@ class Config:
     keep_tmp_dir: bool = False
     reference_length: int = 197209
     batch_size: int = 5
+    processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict)
 
 
 def load_config_from_yaml(config_file: str, config: Config) -> Config:

diff --git a/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py b/preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
@@ -1,15 +1,43 @@
-from collections.abc import Mapping
 from dataclasses import dataclass, field
-from typing import Any, Literal
+from enum import StrEnum, unique
+from typing import Any
 
 AccessionVersion = str
-NextcladeResult = dict[str, dict[str, Any]]
+GeneName = str
+NucleotideSequence = str
+AminoAcidSequence = str
+NucleotideInsertion = str
+AminoAcidInsertion = str
+FunctionName = str  # Name of function present in processing_functions
+ArgName = str  # Name of argument present in processing_functions
+InputField = (
+    str  # Name of field in input data, either inputMetadata or NextcladeMetadata
+)
+ProcessingInput = dict[str, str | None]
+
+
+@unique
+class AnnotationSourceType(StrEnum):
+    METADATA = "Metadata"
+    NUCLEOTIDE_SEQUENCE = "NucleotideSequence"
+
+
+@dataclass
+class AnnotationSource:
+    name: str
+    type: AnnotationSourceType
+
+
+@dataclass
+class ProcessingAnnotation:
+    source: list[AnnotationSource]
+    message: str
 
 
 @dataclass
 class UnprocessedData:
-    metadata: Mapping[str, str]
-    unalignedNucleotideSequences: Mapping[str, str]
+    metadata: dict[str, str]
+    unalignedNucleotideSequences: dict[str, NucleotideSequence]
 
 
 @dataclass
@@ -18,26 +46,36 @@ class UnprocessedEntry:
     data: UnprocessedData
 
 
+FunctionInputs = dict[ArgName, InputField]
+
+
 @dataclass
-class ProcessedData:
-    metadata: dict[str, dict[str, Any]]
-    unalignedNucleotideSequences: dict[str, Any]
-    alignedNucleotideSequences: dict[str, Any]
-    nucleotideInsertions: dict[str, Any]
-    alignedAminoAcidSequences: dict[str, Any]
-    aminoAcidInsertions: dict[str, Any]
+class ProcessingSpec:
+    inputs: FunctionInputs
+    function: FunctionName
+    required: bool | None
 
 
+# For single segment, need to generalize for multi segments later
 @dataclass
-class AnnotationSource:
-    field: str
-    type: Literal["metadata", "nucleotideSequence"]
+class UnprocessedAfterNextclade:
+    inputMetadata: dict[str, Any]  # Original user supplied metadata
+    nextcladeMetadata: dict[str, Any] | None  # Derived metadata produced by Nextclade
+    unalignedNucleotideSequences: NucleotideSequence
+    alignedNucleotideSequences: NucleotideSequence | None
+    nucleotideInsertions: list[NucleotideInsertion]
+    alignedAminoAcidSequences: dict[GeneName, AminoAcidSequence | None]
+    aminoAcidInsertions: dict[GeneName, list[AminoAcidInsertion]]
 
 
 @dataclass
-class ProcessingAnnotation:
-    source: AnnotationSource
-    message: str
+class ProcessedData:
+    metadata: dict[str, Any]
+    unalignedNucleotideSequences: dict[str, Any]
+    alignedNucleotideSequences: dict[str, Any]
+    nucleotideInsertions: dict[str, Any]
+    alignedAminoAcidSequences: dict[str, Any]
+    aminoAcidInsertions: dict[str, Any]
 
 
 @dataclass
@@ -52,3 +90,10 @@ class ProcessedEntry:
     data: ProcessedData
     errors: list[ProcessingAnnotation] = field(default_factory=list)
     warnings: list[ProcessingAnnotation] = field(default_factory=list)
+
+
+@dataclass
+class ProcessingResult:
+    datum: str | None
+    warnings: list[ProcessingAnnotation] = field(default_factory=list)
+    errors: list[ProcessingAnnotation] = field(default_factory=list)