Skip to content

Commit

Permalink
feat(prepro): make nextclade pipeline configurable through `values.yaml` (#1347)
Browse files Browse the repository at this point in the history

* feat(prepro): Make Nextclade prepro pipeline configurable through values.yaml

* Configure instance with more mpox metadata

* Lift outbreak and lineage from Nextclade for mpox

* Use dpath for JSONPath-like indexing into nested objects

* Rename metadata input fields (and some output fields)

* fix: upload "not provided" rather than `null` for required fields so that submission does not fail

Otherwise the backend returns this error:
`Response: {"type":"about:blank","title":"Unprocessable Entity","status":422,"detail":"Field 'country' is null, but a value is required.","instance":"/mpox/submit-processed-data"}`

* Handle submission errors: log the full traceback but don't crash
  • Loading branch information
corneliusroemer authored Mar 14, 2024
1 parent 33e451f commit 3e12387
Show file tree
Hide file tree
Showing 8 changed files with 604 additions and 123 deletions.
68 changes: 67 additions & 1 deletion kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,37 @@ defaultOrganisms:
type: string
generateIndex: true
autocomplete: true
required: false
- name: outbreak
type: string
generateIndex: true
autocomplete: true
- name: lineage
type: string
generateIndex: true
autocomplete: true
- name: ncbi_release_date
type: date
- name: country
type: string
required: true
generateIndex: true
autocomplete: true
- name: isolate_name
type: string
- name: author_affiliation
type: string
generateIndex: true
autocomplete: true
- name: authors
type: string
website:
tableColumns:
- collection_date
- country
- clade
- lineage
- author_affiliation
- ncbi_release_date
defaultOrderBy: collection_date
defaultOrder: descending
silo:
Expand All @@ -116,6 +142,46 @@ defaultOrganisms:
genes:
OPG001: 247
batch_size: 100
processing_spec:
collection_date:
function: process_date
inputs:
date: collection_date
release_date: ncbi_release_date
required: true
ncbi_release_date:
function: parse_timestamp
inputs:
timestamp: ncbi_release_date
clade:
function: identity
inputs:
input: nextclade.clade
outbreak:
function: identity
inputs:
input: nextclade.customNodeAttributes.outbreak
lineage:
function: identity
inputs:
input: nextclade.customNodeAttributes.lineage
country:
function: identity
inputs:
input: country
required: true
author_affiliation:
function: identity
inputs:
input: author_affiliation
authors:
function: identity
inputs:
input: authors
isolate_name:
function: identity
inputs:
input: isolate_name
referenceGenomes:
nucleotideSequences:
- name: "main"
Expand Down
10 changes: 5 additions & 5 deletions preprocessing/dummy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
import dataclasses
import json
import random
import requests
import time
from typing import List
from typing import Optional
from dataclasses import dataclass, field
from typing import List, Optional

import requests

parser = argparse.ArgumentParser()
parser.add_argument("--backend-host", type=str, default="http://127.0.0.1:8079",
Expand Down Expand Up @@ -52,8 +52,8 @@ class Sequence:
accession: int
version: int
data: dict
errors: Optional[List[ProcessingAnnotation]] = field(default_factory=list)
warnings: Optional[List[ProcessingAnnotation]] = field(default_factory=list)
errors: Optional[List[ProcessingAnnotation]] = field(default_factory=list[ProcessingAnnotation])
warnings: Optional[List[ProcessingAnnotation]] = field(default_factory=list[ProcessingAnnotation])


def fetch_unprocessed_sequences(n: int) -> List[Sequence]:
Expand Down
1 change: 1 addition & 0 deletions preprocessing/nextclade/.mypy.ini
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
[mypy]
python_version = 3.12

[mypy-Bio.*]
ignore_missing_imports = True
9 changes: 6 additions & 3 deletions preprocessing/nextclade/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ channels:
dependencies:
- python=3.12
- biopython=1.83
- dpath=2.1
- nextclade=3.3.1
- PyYAML
- requests=2.31
- pip=24.0
- PyYAML=6.0
- pyjwt=2.8
- pip
- python-dateutil=2.9
- pytz=2024.1
- requests=2.31
3 changes: 2 additions & 1 deletion preprocessing/nextclade/src/loculus_preprocessing/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
from dataclasses import dataclass
from types import UnionType
from typing import Type, get_args
from typing import Any, Type, get_args

import yaml

Expand All @@ -30,6 +30,7 @@ class Config:
keep_tmp_dir: bool = False
reference_length: int = 197209
batch_size: int = 5
processing_spec: dict[str, dict[str, Any]] = dataclasses.field(default_factory=dict)


def load_config_from_yaml(config_file: str, config: Config) -> Config:
Expand Down
81 changes: 63 additions & 18 deletions preprocessing/nextclade/src/loculus_preprocessing/datatypes.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,43 @@
from collections.abc import Mapping
from dataclasses import dataclass, field
from typing import Any, Literal
from enum import StrEnum, unique
from typing import Any

# Readability aliases: every name below is an ordinary `str` / `dict` at
# runtime and exists only to make type hints self-describing.
AccessionVersion = str
# Two-level mapping: string keys map to dicts with string keys and arbitrary values.
# NOTE(review): presumably holds per-entry Nextclade output — confirm against the code that builds it.
NextcladeResult = dict[str, dict[str, Any]]
GeneName = str
NucleotideSequence = str
AminoAcidSequence = str
NucleotideInsertion = str
AminoAcidInsertion = str
FunctionName = str # Name of function present in processing_functions
ArgName = str # Name of argument present in processing_functions
InputField = (
str # Name of field in input data, either inputMetadata or NextcladeMetadata
)
# Mapping from a string key to an optional string value (`None` is allowed).
ProcessingInput = dict[str, str | None]


@unique
class AnnotationSourceType(StrEnum):
METADATA = "Metadata"
NUCLEOTIDE_SEQUENCE = "NucleotideSequence"


@dataclass
class AnnotationSource:
    """A named input that a processing annotation points at."""

    # Name of the source.
    # NOTE(review): presumably a metadata field or sequence name — confirm with callers.
    name: str
    # Which kind of input `name` refers to.
    type: AnnotationSourceType


@dataclass
class ProcessingAnnotation:
    """A message attached to one or more annotation sources."""

    # The sources this message relates to; a list, so one message can
    # reference several sources at once.
    source: list[AnnotationSource]
    # Human-readable text of the annotation.
    message: str


@dataclass
class UnprocessedData:
metadata: Mapping[str, str]
unalignedNucleotideSequences: Mapping[str, str]
metadata: dict[str, str]
unalignedNucleotideSequences: dict[str, NucleotideSequence]


@dataclass
Expand All @@ -18,26 +46,36 @@ class UnprocessedEntry:
data: UnprocessedData


FunctionInputs = dict[ArgName, InputField]


@dataclass
class ProcessedData:
metadata: dict[str, dict[str, Any]]
unalignedNucleotideSequences: dict[str, Any]
alignedNucleotideSequences: dict[str, Any]
nucleotideInsertions: dict[str, Any]
alignedAminoAcidSequences: dict[str, Any]
aminoAcidInsertions: dict[str, Any]
class ProcessingSpec:
inputs: FunctionInputs
function: FunctionName
required: bool | None


# For single segment, need to generalize for multi segments later
@dataclass
class AnnotationSource:
field: str
type: Literal["metadata", "nucleotideSequence"]
class UnprocessedAfterNextclade:
inputMetadata: dict[str, Any] # Original user supplied metadata
nextcladeMetadata: dict[str, Any] | None # Derived metadata produced by Nextclade
unalignedNucleotideSequences: NucleotideSequence
alignedNucleotideSequences: NucleotideSequence | None
nucleotideInsertions: list[NucleotideInsertion]
alignedAminoAcidSequences: dict[GeneName, AminoAcidSequence | None]
aminoAcidInsertions: dict[GeneName, list[AminoAcidInsertion]]


@dataclass
class ProcessingAnnotation:
source: AnnotationSource
message: str
class ProcessedData:
metadata: dict[str, Any]
unalignedNucleotideSequences: dict[str, Any]
alignedNucleotideSequences: dict[str, Any]
nucleotideInsertions: dict[str, Any]
alignedAminoAcidSequences: dict[str, Any]
aminoAcidInsertions: dict[str, Any]


@dataclass
Expand All @@ -52,3 +90,10 @@ class ProcessedEntry:
data: ProcessedData
errors: list[ProcessingAnnotation] = field(default_factory=list)
warnings: list[ProcessingAnnotation] = field(default_factory=list)


@dataclass
class ProcessingResult:
    """Bundle a value with the warnings and errors collected alongside it.

    NOTE(review): presumably the return type of the processing functions
    named in ``processing_spec`` — confirm against the callers.
    """

    # The resulting value, or None when there is no value.
    datum: str | None
    # default_factory gives every instance its own fresh list, so appending
    # to one result's annotations never leaks into another's.
    warnings: list[ProcessingAnnotation] = field(default_factory=list)
    errors: list[ProcessingAnnotation] = field(default_factory=list)
Loading

0 comments on commit 3e12387

Please sign in to comment.