From c0077ab9ccb2aa89a058947c4d62602dc3b591cc Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 28 Nov 2024 14:48:57 +0100 Subject: [PATCH 01/31] . --- src/annotate/cli.rs | 110 ++++++++++++++++++++ src/annotate/mod.rs | 1 + src/annotate/seqvars/csq.rs | 37 ++----- src/annotate/seqvars/mod.rs | 167 ++++++++++--------------------- src/annotate/seqvars/provider.rs | 2 +- src/annotate/strucvars/mod.rs | 8 +- src/server/run/mod.rs | 31 +++--- src/verify/seqvars.rs | 2 +- 8 files changed, 194 insertions(+), 164 deletions(-) create mode 100644 src/annotate/cli.rs diff --git a/src/annotate/cli.rs b/src/annotate/cli.rs new file mode 100644 index 00000000..19a654d5 --- /dev/null +++ b/src/annotate/cli.rs @@ -0,0 +1,110 @@ +use clap::Args as ClapArgs; +use strum::{Display, VariantArray}; + +#[derive(Debug, ClapArgs)] +#[group(required = true, multiple = true)] +pub struct Sources { + #[arg(long)] + pub transcripts: Option>, + #[arg(long)] + pub frequencies: Option, + #[arg(long)] + pub clinvar: Option, +} + +#[derive(Debug, ClapArgs)] +pub struct TranscriptSettings { + /// The transcript source. + #[arg(long, value_enum, default_value_t = TranscriptSource::Both)] + pub transcript_source: TranscriptSource, + + /// Whether to report only the worst consequence for each picked transcript. + #[arg(long)] + pub report_most_severe_consequence_by: Option, + + /// Which kind of transcript to pick / restrict to. Default is not to pick at all. + /// + /// Depending on `--pick-transcript-mode`, if multiple transcripts match the selection, + /// either the first one is kept or all are kept. + #[arg(long)] + pub pick_transcript: Vec, + + /// Determines how to handle multiple transcripts. Default is to keep all. + /// + /// When transcript picking is enabled via `--pick-transcript`, + /// either keep the first one found or keep all that match. + #[arg(long, default_value = "all")] + pub pick_transcript_mode: TranscriptPickMode, +} + +#[derive( + Debug, + Copy, + Clone, + PartialEq, + Eq, + PartialOrd, + Ord, + Display, + clap::ValueEnum, + VariantArray, + parse_display::FromStr, +)] +pub enum ConsequenceBy { + Gene, + Transcript, + // or "Variant"? + Allele, +} + +#[derive( + Debug, + Copy, + Clone, + PartialEq, + Eq, + PartialOrd, + Ord, + Display, + clap::ValueEnum, + VariantArray, + parse_display::FromStr, +)] +pub enum TranscriptPickType { + ManeSelect, + ManePlusClinical, + Length, + EnsemblCanonical, + RefSeqSelect, + GencodePrimary, + Basic, +} + +#[derive(Debug, Copy, Clone, Display, clap::ValueEnum, Default)] +pub enum TranscriptPickMode { + #[default] + First, + All, +} + +/// Enum that allows to select the transcript source. +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + Default, + serde::Deserialize, + serde::Serialize, + clap::ValueEnum, +)] +pub enum TranscriptSource { + /// ENSEMBL + Ensembl, + /// RefSeq + RefSeq, + /// Both + #[default] + Both, +} diff --git a/src/annotate/mod.rs b/src/annotate/mod.rs index 302471c2..b7471115 100644 --- a/src/annotate/mod.rs +++ b/src/annotate/mod.rs @@ -6,6 +6,7 @@ use noodles::vcf::variant::record_buf::samples::sample::value::Genotype; pub mod seqvars; pub mod strucvars; +pub(crate) mod cli; const VCF_4_4: FileFormat = FileFormat::new(4, 4); diff --git a/src/annotate/seqvars/csq.rs b/src/annotate/seqvars/csq.rs index 1df50ded..25c8edab 100644 --- a/src/annotate/seqvars/csq.rs +++ b/src/annotate/seqvars/csq.rs @@ -1,4 +1,9 @@ //! Compute molecular consequence of variants. +use super::{ + ann::{Allele, AnnField, Consequence, FeatureBiotype, FeatureType, Pos, Rank, SoFeature}, + provider::Provider as MehariProvider, +}; +use crate::annotate::cli::{ConsequenceBy, TranscriptSource}; use crate::pbs::txs::{GenomeAlignment, Strand, TranscriptBiotype, TranscriptTag}; use enumflags2::BitFlags; use hgvs::parser::{NoRef, ProteinEdit, UncertainLengthChange}; @@ -14,12 +19,6 @@ use std::cmp::Ordering; use std::ops::Range; use std::{collections::HashMap, sync::Arc}; -use super::{ - ann::{Allele, AnnField, Consequence, FeatureBiotype, FeatureType, Pos, Rank, SoFeature}, - provider::Provider as MehariProvider, - ConsequenceBy, -}; - /// A variant description how VCF would do it. #[derive(Debug, PartialEq, Eq, Clone, Default)] pub struct VcfVariant { @@ -33,28 +32,6 @@ pub struct VcfVariant { pub alternative: String, } -/// Enum that allows to select the transcript source. -#[derive( - Debug, - Clone, - Copy, - PartialEq, - Eq, - Default, - serde::Deserialize, - serde::Serialize, - clap::ValueEnum, -)] -pub enum TranscriptSource { - /// ENSEMBL - Ensembl, - /// RefSeq - RefSeq, - /// Both - #[default] - Both, -} - /// Configuration for consequence prediction. #[derive(Debug, Clone, derive_builder::Builder)] #[builder(pattern = "immutable")] @@ -84,7 +61,7 @@ impl Default for Config { pub struct ConsequencePredictor { /// The internal transcript provider for locating transcripts. #[derivative(Debug = "ignore")] - provider: Arc, + pub(crate) provider: Arc, /// Assembly mapper for variant consequence prediction. #[derivative(Debug = "ignore")] mapper: assembly::Mapper, @@ -1247,10 +1224,10 @@ impl ConsequencePredictor { #[cfg(test)] mod test { use super::*; + use crate::annotate::cli::TranscriptPickType; use crate::annotate::seqvars::provider::ConfigBuilder as MehariProviderConfigBuilder; use crate::annotate::seqvars::{ load_tx_db, run_with_writer, Args, AsyncAnnotatedVariantWriter, PathOutput, - TranscriptPickType, }; use crate::common::noodles::{open_variant_reader, open_variant_writer, NoodlesVariantReader}; use csv::ReaderBuilder; diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 3526932a..5fdb682c 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -8,6 +8,19 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Instant; +use crate::annotate::cli::{Sources, TranscriptSettings}; +use crate::annotate::genotype_string; +use crate::annotate::seqvars::csq::{ + ConfigBuilder as ConsequencePredictorConfigBuilder, ConsequencePredictor, VcfVariant, +}; +use crate::annotate::seqvars::provider::{ + ConfigBuilder as MehariProviderConfigBuilder, Provider as MehariProvider, +}; +use crate::common::noodles::{open_variant_reader, open_variant_writer, NoodlesVariantReader}; +use crate::common::{guess_assembly, GenomeRelease}; +use crate::db::merge::merge_transcript_databases; +use crate::pbs::txs::TxSeqDatabase; +use crate::ped::{PedigreeByName, Sex}; use annonars::common::cli::is_canonical; use annonars::common::keys; use annonars::freqs::serialized::{auto, mt, xy}; @@ -39,19 +52,6 @@ use strum::{Display, VariantArray}; use thousands::Separable; use tokio::io::AsyncWriteExt; -use crate::annotate::genotype_string; -use crate::annotate::seqvars::csq::{ - ConfigBuilder as ConsequencePredictorConfigBuilder, ConsequencePredictor, VcfVariant, -}; -use crate::annotate::seqvars::provider::{ - ConfigBuilder as MehariProviderConfigBuilder, Provider as MehariProvider, -}; -use crate::common::noodles::{open_variant_reader, open_variant_writer, NoodlesVariantReader}; -use crate::common::{guess_assembly, GenomeRelease}; -use crate::db::merge::merge_transcript_databases; -use crate::pbs::txs::TxSeqDatabase; -use crate::ped::{PedigreeByName, Sex}; - use self::ann::{AnnField, Consequence, FeatureBiotype}; pub mod ann; @@ -91,28 +91,6 @@ pub struct Args { #[command(flatten)] pub output: PathOutput, - /// The transcript source. - #[arg(long, value_enum, default_value_t = csq::TranscriptSource::Both)] - pub transcript_source: csq::TranscriptSource, - - /// Whether to report only the worst consequence for each picked transcript. - #[arg(long)] - pub report_most_severe_consequence_by: Option, - - /// Which kind of transcript to pick / restrict to. Default is not to pick at all. - /// - /// Depending on `--pick-transcript-mode`, if multiple transcripts match the selection, - /// either the first one is kept or all are kept. - #[arg(long)] - pub pick_transcript: Vec, - - /// Determines how to handle multiple transcripts. Default is to keep all. - /// - /// When transcript picking is enabled via `--pick-transcript`, - /// either keep the first one found or keep all that match. - #[arg(long, default_value = "all")] - pub pick_transcript_mode: TranscriptPickMode, - /// For debug purposes, maximal number of variants to annotate. #[arg(long)] pub max_var_count: Option, @@ -124,67 +102,10 @@ pub struct Args { /// What to annotate and which source to use. #[command(flatten)] pub sources: Sources, -} - -#[derive( - Debug, - Copy, - Clone, - PartialEq, - Eq, - PartialOrd, - Ord, - Display, - clap::ValueEnum, - VariantArray, - parse_display::FromStr, -)] -pub enum ConsequenceBy { - Gene, - Transcript, - // or "Variant"? - Allele, -} - -#[derive( - Debug, - Copy, - Clone, - PartialEq, - Eq, - PartialOrd, - Ord, - Display, - clap::ValueEnum, - VariantArray, - parse_display::FromStr, -)] -pub enum TranscriptPickType { - ManeSelect, - ManePlusClinical, - Length, - EnsemblCanonical, - RefSeqSelect, - GencodePrimary, - Basic, -} -#[derive(Debug, Copy, Clone, Display, clap::ValueEnum, Default)] -pub enum TranscriptPickMode { - #[default] - First, - All, -} - -#[derive(Debug, ClapArgs)] -#[group(required = true, multiple = true)] -pub struct Sources { - #[arg(long)] - transcripts: Option>, - #[arg(long)] - frequencies: Option, - #[arg(long)] - clinvar: Option, + /// Transcript annotation related settings + #[command(flatten)] + pub transcript_settings: TranscriptSettings, } #[derive(Debug, Display, Copy, Clone, clap::ValueEnum, PartialEq, Eq, parse_display::FromStr)] @@ -1390,7 +1311,7 @@ impl AsyncAnnotatedVariantWriter for VarFishSeqvarTsvWriter { } #[allow(clippy::large_enum_variant)] -enum AnnotatorEnum { +pub(crate) enum AnnotatorEnum { Frequency(FrequencyAnnotator), Clinvar(ClinvarAnnotator), Consequence(ConsequenceAnnotator), @@ -1406,10 +1327,22 @@ impl AnnotatorEnum { } } -struct Annotator { +pub(crate) struct Annotator { annotators: Vec, } +impl Annotator { + pub(crate) fn seqvars(&self) -> Option<&ConsequenceAnnotator> { + for annotator in &self.annotators { + match annotator { + AnnotatorEnum::Consequence(a) => return Some(a.clone()), + _ => (), + } + } + None + } +} + pub struct FrequencyAnnotator { db: DBWithThreadMode, } @@ -1418,7 +1351,7 @@ impl FrequencyAnnotator { Self { db } } - fn from_path(path: impl AsRef + Display) -> anyhow::Result { + pub(crate) fn from_path(path: impl AsRef + Display) -> anyhow::Result { // Open the frequency RocksDB database in read only mode. tracing::info!("Opening frequency database"); tracing::debug!("RocksDB path = {}", &path); @@ -1638,7 +1571,7 @@ impl ClinvarAnnotator { Self { db } } - fn from_path(path: impl AsRef + Display) -> anyhow::Result { + pub(crate) fn from_path(path: impl AsRef + Display) -> anyhow::Result { tracing::info!("Opening ClinVar database"); tracing::debug!("RocksDB path = {}", &path); let options = rocksdb::Options::default(); @@ -1719,8 +1652,8 @@ impl ClinvarAnnotator { } } -struct ConsequenceAnnotator { - predictor: ConsequencePredictor, +pub(crate) struct ConsequenceAnnotator { + pub(crate) predictor: ConsequencePredictor, } impl ConsequenceAnnotator { @@ -1728,7 +1661,11 @@ impl ConsequenceAnnotator { Self { predictor } } - fn from_db_and_args(tx_db: TxSeqDatabase, args: &Args) -> anyhow::Result { + pub(crate) fn from_db_and_settings( + tx_db: TxSeqDatabase, + transcript_settings: &TranscriptSettings, + ) -> anyhow::Result { + let args = transcript_settings; let provider = Arc::new(MehariProvider::new( tx_db, MehariProviderConfigBuilder::default() @@ -1776,7 +1713,7 @@ impl ConsequenceAnnotator { } impl Annotator { - fn new(annotators: Vec) -> Self { + pub(crate) fn new(annotators: Vec) -> Self { Self { annotators } } @@ -1882,7 +1819,7 @@ async fn run_with_writer( writer.set_assembly(assembly); tracing::info!("Determined input assembly to be {:?}", &assembly); - let annotator = setup_annotator(args)?; + let annotator = setup_seqvars_annotator(&args.sources, &args.transcript_settings)?; // Perform the VCF annotation. tracing::info!("Annotating VCF ..."); @@ -1941,21 +1878,24 @@ async fn run_with_writer( Ok(()) } -fn setup_annotator(args: &Args) -> Result { +pub(crate) fn setup_seqvars_annotator( + sources: &Sources, + transcript_settings: &TranscriptSettings, +) -> Result { let mut annotators = vec![]; // Add the frequency annotator if requested. - if let Some(rocksdb_path) = &args.sources.frequencies { + if let Some(rocksdb_path) = &sources.frequencies { annotators.push(FrequencyAnnotator::from_path(rocksdb_path).map(AnnotatorEnum::Frequency)?) } // Add the ClinVar annotator if requested. - if let Some(rocksdb_path) = &args.sources.clinvar { + if let Some(rocksdb_path) = &sources.clinvar { annotators.push(ClinvarAnnotator::from_path(rocksdb_path).map(AnnotatorEnum::Clinvar)?) } // Add the consequence annotator if requested. - if let Some(tx_sources) = &args.sources.transcripts { + if let Some(tx_sources) = &sources.transcripts { tracing::info!("Opening transcript database(s)"); let tx_db = merge_transcript_databases( @@ -1966,7 +1906,8 @@ fn setup_annotator(args: &Args) -> Result { )?; annotators.push( - ConsequenceAnnotator::from_db_and_args(tx_db, args).map(AnnotatorEnum::Consequence)?, + ConsequenceAnnotator::from_db_and_settings(tx_db, transcript_settings) + .map(AnnotatorEnum::Consequence)?, ); } @@ -1993,14 +1934,14 @@ pub fn from_vcf_allele(value: &noodles::vcf::variant::RecordBuf, allele_no: usiz #[cfg(test)] mod test { + use super::binning::bin_from_range; + use super::{run, Args, PathOutput}; + use crate::annotate::cli::Sources; + use crate::annotate::cli::{ConsequenceBy, TranscriptSource}; use clap_verbosity_flag::Verbosity; use pretty_assertions::assert_eq; use temp_testdir::TempDir; - use super::binning::bin_from_range; - use super::{csq::TranscriptSource, run, Args, ConsequenceBy, PathOutput}; - use crate::annotate::seqvars::Sources; - #[tokio::test] async fn smoke_test_output_vcf() -> Result<(), anyhow::Error> { let temp = TempDir::default(); diff --git a/src/annotate/seqvars/provider.rs b/src/annotate/seqvars/provider.rs index baf8c574..ee8634e3 100644 --- a/src/annotate/seqvars/provider.rs +++ b/src/annotate/seqvars/provider.rs @@ -2,7 +2,6 @@ use std::collections::HashMap; -use crate::annotate::seqvars::{TranscriptPickMode, TranscriptPickType}; use crate::db::create::Reason; use crate::db::TranscriptDatabase; use crate::{ @@ -24,6 +23,7 @@ use hgvs::{ sequences::{seq_md5, TranslationTable}, }; use itertools::Itertools; +use crate::annotate::cli::{TranscriptPickMode, TranscriptPickType}; /// Mitochondrial accessions. const MITOCHONDRIAL_ACCESSIONS: &[&str] = &[ diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs index 832943eb..e1fbee84 100644 --- a/src/annotate/strucvars/mod.rs +++ b/src/annotate/strucvars/mod.rs @@ -58,10 +58,6 @@ mod maelstrom; #[derive(Parser, Debug, Clone)] #[command(about = "Annotate structural variant VCF files", long_about = None)] pub struct Args { - /// Path to the mehari database folder. - #[arg(long)] - pub path_db: String, - /// Genome release to use, default is to auto-detect. #[arg(long, value_enum)] pub genome_release: Option, @@ -86,9 +82,11 @@ pub struct Args { /// Minimal reciprocal overlap to require. #[arg(long, default_value_t = 0.8)] pub min_overlap: f32, + /// Slack to use around break-ends. #[arg(long, default_value_t = 50)] pub slack_bnd: i32, + /// Slack to use around insertions. #[arg(long, default_value_t = 50)] pub slack_ins: i32, @@ -96,6 +94,7 @@ pub struct Args { /// Seed for random number generator (UUIDs), if any. #[arg(long)] pub rng_seed: Option, + /// Optionally, value to write to `##fileDate`. #[arg(long)] pub file_date: Option, @@ -4155,7 +4154,6 @@ mod test { let out_path = temp.join(format!("out{}", suffix)); let args = Args { - path_db: String::from("tests/data/db/create"), genome_release: Some(GenomeRelease::Grch37), path_input_ped: String::from("tests/data/annotate/strucvars/maelstrom/delly2-min.ped"), path_input_vcf: vec![String::from( diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 6e0dadc6..63d25496 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -1,5 +1,6 @@ -use std::sync::Arc; - +use crate::annotate::cli::{Sources, TranscriptSettings}; +use crate::annotate::seqvars::{setup_seqvars_annotator, ClinvarAnnotator, FrequencyAnnotator}; +use crate::db::merge::merge_transcript_databases; use crate::{ annotate::{ seqvars::{ @@ -10,6 +11,8 @@ use crate::{ }, common::GenomeRelease, }; +use anyhow::Error; +use std::sync::Arc; /// Implementation of Actix server. pub mod actix_server; @@ -90,9 +93,13 @@ pub mod openapi { #[derive(clap::Parser, Debug)] #[command(about = "Run Mehari REST API server", long_about = None)] pub struct Args { - /// Path to the mehari database folder. - #[arg(long)] - pub path_db: String, + /// What to annotate and which source to use. + #[command(flatten)] + pub sources: Sources, + + /// Transcript related settings. + #[command(flatten)] + pub transcript_settings: TranscriptSettings, /// Whether to suppress printing hints. #[arg(long, default_value_t = false)] @@ -101,6 +108,7 @@ pub struct Args { /// IP to listen on. #[arg(long, default_value = "127.0.0.1")] pub listen_host: String, + /// Port to listen on. #[arg(long, default_value_t = 8080)] pub listen_port: u16, @@ -176,15 +184,10 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a let mut data = actix_server::WebServerData::default(); for genome_release in [GenomeRelease::Grch37, GenomeRelease::Grch38] { let assembly = genome_release.into(); - let path = format!("{}/{}/txs.bin.zst", &args.path_db, path_component(assembly)); - if !std::path::Path::new(&path).exists() { - tracing::warn!("No transcript database found at {}", &path); - continue; - } - tracing::info!(" - loading {}", &path); - let tx_db = load_tx_db(&path)?; - tracing::info!(" - building interval trees"); - let provider = Arc::new(MehariProvider::new(tx_db, Default::default())); + + let annotator = setup_seqvars_annotator(&args.sources, &args.transcript_settings)?; + let seqvars_csq_predictor = &annotator.seqvars().unwrap().predictor; + let provider = seqvars_csq_predictor.provider.clone(); data.provider.insert(genome_release, provider.clone()); tracing::info!(" - building seqvars predictors"); data.seqvars_predictors.insert( diff --git a/src/verify/seqvars.rs b/src/verify/seqvars.rs index 97462026..45fee039 100644 --- a/src/verify/seqvars.rs +++ b/src/verify/seqvars.rs @@ -11,12 +11,12 @@ use crate::annotate::seqvars::{ csq::{ConfigBuilder as ConsequencePredictorConfigBuilder, ConsequencePredictor, VcfVariant}, load_tx_db, path_component, provider::{ConfigBuilder as MehariProviderConfigBuilder, Provider as MehariProvider}, - ConsequenceBy, TranscriptPickMode, TranscriptPickType, }; use biocommons_bioutils::assemblies::Assembly; use clap::Parser; use noodles::core::{Position, Region}; use quick_cache::unsync::Cache; +use crate::annotate::cli::{ConsequenceBy, TranscriptPickMode, TranscriptPickType}; /// Command line arguments for `verify seqvars` sub command. #[derive(Parser, Debug)] From cc28a903cde0fe23ae34b0637aae9c6c56c24112 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 16 Dec 2024 13:39:13 +0100 Subject: [PATCH 02/31] use TranscriptSettings in test args --- src/annotate/cli.rs | 2 +- src/annotate/seqvars/csq.rs | 11 ++++---- src/annotate/seqvars/mod.rs | 50 ++++++++++++++++++------------------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/src/annotate/cli.rs b/src/annotate/cli.rs index 19a654d5..fe8e6b9a 100644 --- a/src/annotate/cli.rs +++ b/src/annotate/cli.rs @@ -12,7 +12,7 @@ pub struct Sources { pub clinvar: Option, } -#[derive(Debug, ClapArgs)] +#[derive(Debug, ClapArgs, Default)] pub struct TranscriptSettings { /// The transcript source. #[arg(long, value_enum, default_value_t = TranscriptSource::Both)] diff --git a/src/annotate/seqvars/csq.rs b/src/annotate/seqvars/csq.rs index 25c8edab..5a063ea5 100644 --- a/src/annotate/seqvars/csq.rs +++ b/src/annotate/seqvars/csq.rs @@ -1224,7 +1224,7 @@ impl ConsequencePredictor { #[cfg(test)] mod test { use super::*; - use crate::annotate::cli::TranscriptPickType; + use crate::annotate::cli::{TranscriptPickType, TranscriptSettings}; use crate::annotate::seqvars::provider::ConfigBuilder as MehariProviderConfigBuilder; use crate::annotate::seqvars::{ load_tx_db, run_with_writer, Args, AsyncAnnotatedVariantWriter, PathOutput, @@ -1706,10 +1706,11 @@ mod test { path_output_vcf: Some(output.as_ref().to_str().unwrap().into()), path_output_tsv: None, }, - transcript_source: Default::default(), - report_most_severe_consequence_by: Some(ConsequenceBy::Allele), - pick_transcript: vec![TranscriptPickType::ManeSelect], - pick_transcript_mode: Default::default(), + transcript_settings: TranscriptSettings { + report_most_severe_consequence_by: Some(ConsequenceBy::Allele), + pick_transcript: vec![TranscriptPickType::ManeSelect], + ..Default::default() + }, max_var_count: None, hgnc: None, sources: crate::annotate::seqvars::Sources { diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 5fdb682c..6e36f735 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1936,7 +1936,7 @@ pub fn from_vcf_allele(value: &noodles::vcf::variant::RecordBuf, allele_no: usiz mod test { use super::binning::bin_from_range; use super::{run, Args, PathOutput}; - use crate::annotate::cli::Sources; + use crate::annotate::cli::{Sources, TranscriptPickType, TranscriptSettings}; use crate::annotate::cli::{ConsequenceBy, TranscriptSource}; use clap_verbosity_flag::Verbosity; use pretty_assertions::assert_eq; @@ -1954,10 +1954,10 @@ mod test { let assembly = "grch37"; let args = Args { genome_release: None, - report_most_severe_consequence_by: Some(ConsequenceBy::Gene), - transcript_source: TranscriptSource::Both, - pick_transcript: vec![], - pick_transcript_mode: Default::default(), + transcript_settings: TranscriptSettings { + report_most_severe_consequence_by: Some(ConsequenceBy::Gene), + ..Default::default() + }, path_input_vcf: String::from("tests/data/annotate/seqvars/brca1.examples.vcf"), output: PathOutput { path_output_vcf: Some(path_out.into_os_string().into_string().unwrap()), @@ -1995,10 +1995,10 @@ mod test { let assembly = "grch37"; let args = Args { genome_release: None, - report_most_severe_consequence_by: Some(ConsequenceBy::Gene), - transcript_source: TranscriptSource::Both, - pick_transcript: vec![], - pick_transcript_mode: Default::default(), + transcript_settings: TranscriptSettings { + report_most_severe_consequence_by: Some(ConsequenceBy::Gene), + ..Default::default() + }, path_input_vcf: String::from("tests/data/annotate/seqvars/brca1.examples.vcf"), output: PathOutput { path_output_vcf: None, @@ -2048,10 +2048,10 @@ mod test { let assembly = "grch37"; let args = Args { genome_release: None, - report_most_severe_consequence_by: Some(ConsequenceBy::Gene), - transcript_source: TranscriptSource::Both, - pick_transcript: vec![], - pick_transcript_mode: Default::default(), + transcript_settings: TranscriptSettings { + report_most_severe_consequence_by: Some(ConsequenceBy::Gene), + ..Default::default() + }, path_input_vcf: String::from("tests/data/annotate/seqvars/badly_formed_vcf_entry.vcf"), output: PathOutput { path_output_vcf: None, @@ -2095,10 +2095,10 @@ mod test { let assembly = "grch37"; let args = Args { genome_release: None, - report_most_severe_consequence_by: Some(ConsequenceBy::Gene), - transcript_source: TranscriptSource::Both, - pick_transcript: vec![], - pick_transcript_mode: Default::default(), + transcript_settings: TranscriptSettings { + report_most_severe_consequence_by: Some(ConsequenceBy::Gene), + ..Default::default() + }, path_input_vcf: String::from("tests/data/annotate/seqvars/mitochondrial_variants.vcf"), output: PathOutput { path_output_vcf: None, @@ -2144,10 +2144,10 @@ mod test { let assembly = "grch37"; let args = Args { genome_release: None, - report_most_severe_consequence_by: Some(ConsequenceBy::Gene), - transcript_source: TranscriptSource::Both, - pick_transcript: vec![], - pick_transcript_mode: Default::default(), + transcript_settings: TranscriptSettings { + report_most_severe_consequence_by: Some(ConsequenceBy::Gene), + ..Default::default() + }, path_input_vcf: String::from("tests/data/annotate/seqvars/clair3-glnexus-min.vcf"), output: PathOutput { path_output_vcf: None, @@ -2193,10 +2193,10 @@ mod test { let assembly = "grch38"; let args = Args { genome_release: None, - report_most_severe_consequence_by: Some(ConsequenceBy::Gene), - transcript_source: TranscriptSource::Both, - pick_transcript: vec![], - pick_transcript_mode: Default::default(), + transcript_settings: TranscriptSettings { + report_most_severe_consequence_by: Some(ConsequenceBy::Gene), + ..Default::default() + }, path_input_vcf: String::from("tests/data/annotate/seqvars/brca2_zar1l/brca2_zar1l.vcf"), output: PathOutput { path_output_vcf: None, From 2031dbd83266f055e5d62720a3777c93acec7596 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 16 Dec 2024 15:03:05 +0100 Subject: [PATCH 03/31] lints --- src/annotate/cli.rs | 2 +- src/annotate/seqvars/mod.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/annotate/cli.rs b/src/annotate/cli.rs index fe8e6b9a..2b79bcf2 100644 --- a/src/annotate/cli.rs +++ b/src/annotate/cli.rs @@ -12,7 +12,7 @@ pub struct Sources { pub clinvar: Option, } -#[derive(Debug, ClapArgs, Default)] +#[derive(Debug, ClapArgs, Default, Clone)] pub struct TranscriptSettings { /// The transcript source. #[arg(long, value_enum, default_value_t = TranscriptSource::Both)] diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 6e36f735..019c01df 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -48,7 +48,7 @@ use prost::Message; use rocksdb::{DBWithThreadMode, MultiThreaded}; use rustc_hash::FxHashMap; use serde::{Deserialize, Serialize}; -use strum::{Display, VariantArray}; +use strum::Display; use thousands::Separable; use tokio::io::AsyncWriteExt; @@ -1335,7 +1335,7 @@ impl Annotator { pub(crate) fn seqvars(&self) -> Option<&ConsequenceAnnotator> { for annotator in &self.annotators { match annotator { - AnnotatorEnum::Consequence(a) => return Some(a.clone()), + AnnotatorEnum::Consequence(a) => return Some(a), _ => (), } } From 133e7da43fd1fcc242edaedf3bfd2e824275f708 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 16 Dec 2024 15:03:25 +0100 Subject: [PATCH 04/31] merge tx dbs separated by genome release --- src/annotate/seqvars/mod.rs | 55 ++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 019c01df..8d4c08f1 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1819,7 +1819,8 @@ async fn run_with_writer( writer.set_assembly(assembly); tracing::info!("Determined input assembly to be {:?}", &assembly); - let annotator = setup_seqvars_annotator(&args.sources, &args.transcript_settings)?; + let annotator = + setup_seqvars_annotator(&args.sources, &args.transcript_settings, Some(assembly))?; // Perform the VCF annotation. tracing::info!("Annotating VCF ..."); @@ -1878,11 +1879,23 @@ async fn run_with_writer( Ok(()) } +pub(crate) fn proto_assembly_from(assembly: &Assembly) -> Option { + crate::pbs::txs::Assembly::from_str_name(&format!( + "ASSEMBLY_{}", + match assembly { + Assembly::Grch38 => "GRCH38", + _ => "GRCH37", + } + )) +} + pub(crate) fn setup_seqvars_annotator( sources: &Sources, transcript_settings: &TranscriptSettings, + assembly: Option, ) -> Result { let mut annotators = vec![]; + let pb_assembly = assembly.as_ref().and_then(proto_assembly_from); // Add the frequency annotator if requested. if let Some(rocksdb_path) = &sources.frequencies { @@ -1898,17 +1911,39 @@ pub(crate) fn setup_seqvars_annotator( if let Some(tx_sources) = &sources.transcripts { tracing::info!("Opening transcript database(s)"); - let tx_db = merge_transcript_databases( - tx_sources + // Filter out any transcript databases that do not match the requested assembly. + let check_assembly = |db: &TxSeqDatabase, assembly: crate::pbs::txs::Assembly| { + db.source_version .iter() - .map(load_tx_db) - .collect::>>()?, - )?; + .map(|s| s.assembly) + .any(|a| a == i32::from(assembly)) + }; + let databases = tx_sources + .iter() + .enumerate() + .map(|(i, path)| (i, load_tx_db(path))) + .filter_map(|(i, txdb)| match txdb { + Ok(db) => match pb_assembly { + Some(assembly) if check_assembly(&db, assembly) => Some(Ok(db)), + Some(_) => { + tracing::info!("Skipping transcript database {} as its assembly {:?} does not match the requested one ({:?})", &tx_sources[i], &db.source_version, &assembly); + None + }, + None => Some(Ok(db)), + }, + Err(_) => Some(txdb), + }) + .collect::>>()?; - annotators.push( - ConsequenceAnnotator::from_db_and_settings(tx_db, transcript_settings) - .map(AnnotatorEnum::Consequence)?, - ); + if databases.is_empty() { + tracing::warn!("No suitable transcript databases found for requested assembly {:?}, therefore no consequence prediction will occur.", &assembly); + } else { + let tx_db = merge_transcript_databases(databases)?; + annotators.push( + ConsequenceAnnotator::from_db_and_settings(tx_db, transcript_settings) + .map(AnnotatorEnum::Consequence)?, + ); + } } let annotator = Annotator::new(annotators); From e5fd61e09acedffa6e6fad66fd945d1947ca9dbf Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Mon, 16 Dec 2024 15:04:41 +0100 Subject: [PATCH 05/31] load csq predictors depending on genome release in the server subcmd --- src/annotate/seqvars/mod.rs | 4 ++-- src/db/merge.rs | 1 + src/server/run/mod.rs | 17 +++++++---------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 8d4c08f1..e113fe0f 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1971,8 +1971,8 @@ pub fn from_vcf_allele(value: &noodles::vcf::variant::RecordBuf, allele_no: usiz mod test { use super::binning::bin_from_range; use super::{run, Args, PathOutput}; - use crate::annotate::cli::{Sources, TranscriptPickType, TranscriptSettings}; - use crate::annotate::cli::{ConsequenceBy, TranscriptSource}; + use crate::annotate::cli::ConsequenceBy; + use crate::annotate::cli::{Sources, TranscriptSettings}; use clap_verbosity_flag::Verbosity; use pretty_assertions::assert_eq; use temp_testdir::TempDir; diff --git a/src/db/merge.rs b/src/db/merge.rs index 4e1b06a2..29a88163 100644 --- a/src/db/merge.rs +++ b/src/db/merge.rs @@ -22,6 +22,7 @@ pub struct Args { pub fn merge_transcript_databases( mut databases: Vec, ) -> anyhow::Result { + assert!(!databases.is_empty()); if let Some((first, others)) = databases.split_first_mut() { if !others.is_empty() { tracing::info!("Merging multiple transcript databases into one"); diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 63d25496..67d70a99 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -1,18 +1,13 @@ use crate::annotate::cli::{Sources, TranscriptSettings}; -use crate::annotate::seqvars::{setup_seqvars_annotator, ClinvarAnnotator, FrequencyAnnotator}; -use crate::db::merge::merge_transcript_databases; +use crate::annotate::seqvars::setup_seqvars_annotator; use crate::{ annotate::{ - seqvars::{ - csq::ConsequencePredictor as SeqvarConsequencePredictor, load_tx_db, path_component, - provider::Provider as MehariProvider, - }, + seqvars::csq::ConsequencePredictor as SeqvarConsequencePredictor, strucvars::csq::ConsequencePredictor as StrucvarConsequencePredictor, }, common::GenomeRelease, }; -use anyhow::Error; -use std::sync::Arc; +use clap::ValueEnum; /// Implementation of Actix server. pub mod actix_server; @@ -182,10 +177,12 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a tracing::info!("Loading database..."); let before_loading = std::time::Instant::now(); let mut data = actix_server::WebServerData::default(); - for genome_release in [GenomeRelease::Grch37, GenomeRelease::Grch38] { + for genome_release in GenomeRelease::value_variants().iter().copied() { + tracing::info!(" - loading genome release {:?}", genome_release); let assembly = genome_release.into(); - let annotator = setup_seqvars_annotator(&args.sources, &args.transcript_settings)?; + let annotator = + setup_seqvars_annotator(&args.sources, &args.transcript_settings, Some(assembly))?; let seqvars_csq_predictor = &annotator.seqvars().unwrap().predictor; let provider = seqvars_csq_predictor.provider.clone(); data.provider.insert(genome_release, provider.clone()); From 0a29b1cf6716223b333274a4b1fa34f157da16d5 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Tue, 17 Dec 2024 10:23:54 +0100 Subject: [PATCH 06/31] also skip freq and clinvar dbs if assembly does not match, more info logging --- src/annotate/seqvars/mod.rs | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index e113fe0f..fe77e514 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1899,12 +1899,36 @@ pub(crate) fn setup_seqvars_annotator( // Add the frequency annotator if requested. if let Some(rocksdb_path) = &sources.frequencies { - annotators.push(FrequencyAnnotator::from_path(rocksdb_path).map(AnnotatorEnum::Frequency)?) + let skip = assembly.map_or(false, |a| !rocksdb_path.contains(path_component(a))); + if !skip { + tracing::info!( + "Loading frequency database for assembly {:?} from {}", + &assembly, + &rocksdb_path + ); + annotators + .push(FrequencyAnnotator::from_path(rocksdb_path).map(AnnotatorEnum::Frequency)?) + } else { + tracing::warn!("Skipping frequency database as its assembly does not match the requested one ({:?})", &assembly); + } } // Add the ClinVar annotator if requested. if let Some(rocksdb_path) = &sources.clinvar { - annotators.push(ClinvarAnnotator::from_path(rocksdb_path).map(AnnotatorEnum::Clinvar)?) + let skip = assembly.map_or(false, |a| !rocksdb_path.contains(path_component(a))); + if !skip { + tracing::info!( + "Loading ClinVar database for assembly {:?} from {}", + &assembly, + &rocksdb_path + ); + annotators.push(ClinvarAnnotator::from_path(rocksdb_path).map(AnnotatorEnum::Clinvar)?) + } else { + tracing::warn!( + "Skipping ClinVar database as its assembly does not match the requested one ({:?})", + &assembly + ); + } } // Add the consequence annotator if requested. @@ -1939,6 +1963,10 @@ pub(crate) fn setup_seqvars_annotator( tracing::warn!("No suitable transcript databases found for requested assembly {:?}, therefore no consequence prediction will occur.", &assembly); } else { let tx_db = merge_transcript_databases(databases)?; + tracing::info!( + "Loaded transcript database(s) from {}", + &tx_sources.join(", ") + ); annotators.push( ConsequenceAnnotator::from_db_and_settings(tx_db, transcript_settings) .map(AnnotatorEnum::Consequence)?, From 6e43c13fe60415425cf41b37373e8e5cc3ca08b5 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Tue, 17 Dec 2024 10:24:09 +0100 Subject: [PATCH 07/31] lint: explicit named lifetimes --- src/common/noodles.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/noodles.rs b/src/common/noodles.rs index ac79ad19..55b14151 100644 --- a/src/common/noodles.rs +++ b/src/common/noodles.rs @@ -60,7 +60,7 @@ pub trait NoodlesVariantReader { async fn records<'a>( &'a mut self, header: &'a Header, - ) -> LocalBoxStream>; + ) -> LocalBoxStream<'a, std::io::Result>; } impl NoodlesVariantReader for VariantReader { @@ -76,7 +76,7 @@ impl NoodlesVariantReader for VariantReader { async fn records<'a>( &'a mut self, header: &'a Header, - ) -> LocalBoxStream> { + ) -> LocalBoxStream<'a, std::io::Result> { match self { VariantReader::Vcf(r) => r.record_bufs(header).boxed_local(), VariantReader::Bcf(r) => r From 1f568f82a86999b4ea223fd9a1e87ff71c6f8efe Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Tue, 17 Dec 2024 10:47:53 +0100 Subject: [PATCH 08/31] rephrase info about skipped databases --- src/annotate/seqvars/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index fe77e514..72ff8e2c 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1950,7 +1950,7 @@ pub(crate) fn setup_seqvars_annotator( Ok(db) => match pb_assembly { Some(assembly) if check_assembly(&db, assembly) => Some(Ok(db)), Some(_) => { - tracing::info!("Skipping transcript database {} as its assembly {:?} does not match the requested one ({:?})", &tx_sources[i], &db.source_version, &assembly); + tracing::info!("Skipping transcript database {} as its version {:?} does not support the requested assembly ({:?})", &tx_sources[i], &db.source_version, &assembly); None }, None => Some(Ok(db)), From 8f4decb3c4d5392f3367de52f9419980992831dd Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Tue, 17 Dec 2024 10:48:39 +0100 Subject: [PATCH 09/31] check whether the predictor could be successfully instantiated, otherwise skip endpoint for respective genome_release --- src/server/run/mod.rs | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 67d70a99..c408a1fa 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -1,4 +1,5 @@ use crate::annotate::cli::{Sources, TranscriptSettings}; +use crate::annotate::seqvars::csq::ConfigBuilder; use crate::annotate::seqvars::setup_seqvars_annotator; use crate::{ annotate::{ @@ -183,22 +184,35 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a let annotator = setup_seqvars_annotator(&args.sources, &args.transcript_settings, Some(assembly))?; - let seqvars_csq_predictor = &annotator.seqvars().unwrap().predictor; - let provider = seqvars_csq_predictor.provider.clone(); - data.provider.insert(genome_release, provider.clone()); - tracing::info!(" - building seqvars predictors"); - data.seqvars_predictors.insert( - genome_release, - SeqvarConsequencePredictor::new(provider.clone(), Default::default()), - ); - tracing::info!(" - building strucvars predictors"); - data.strucvars_predictors.insert( - genome_release, - StrucvarConsequencePredictor::new(provider.clone(), assembly), - ); + if let Some(seqvars_csq_predictor) = annotator.seqvars().map(|a| &a.predictor) { + let config = ConfigBuilder::default() + .report_most_severe_consequence_by( + args.transcript_settings.report_most_severe_consequence_by, + ) + .transcript_source(args.transcript_settings.transcript_source) + .build()?; + + let provider = seqvars_csq_predictor.provider.clone(); + data.provider.insert(genome_release, provider.clone()); + tracing::info!(" - building seqvars predictors"); + data.seqvars_predictors.insert( + genome_release, + SeqvarConsequencePredictor::new(provider.clone(), config), + ); + tracing::info!(" - building strucvars predictors"); + data.strucvars_predictors.insert( + genome_release, + StrucvarConsequencePredictor::new(provider.clone(), assembly), + ); + } else { + tracing::warn!( + "No predictors for genome release {:?}, respective endpoint will be unavailable.", + genome_release + ); + } } let data = actix_web::web::Data::new(data); - tracing::info!("...done loading data {:?}", before_loading.elapsed()); + tracing::info!("... done loading data {:?}", before_loading.elapsed()); // Print the server URL and some hints (the latter: unless suppressed). print_hints(args); From b6df08ccca502e2353c3b24fe1fdf96ba309dd5f Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Tue, 17 Dec 2024 10:48:44 +0100 Subject: [PATCH 10/31] fix typo --- src/server/run/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index c408a1fa..d0ca31f5 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -130,7 +130,7 @@ pub fn print_hints(args: &Args) { args.listen_host.as_str(), args.listen_port ); - // The endpoint `/tx/csq` to comput ethe consequence of a variant; without and with filtering + // The endpoint `/tx/csq` to compute the consequence of a variant; without and with filtering // for HGNC gene ID. tracing::info!( " try: http://{}:{}/seqvars/csq?genome_release=grch37\ From 7f159dba0bc842a91675e227c4075e6e0a04c150 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Tue, 17 Dec 2024 11:57:17 +0100 Subject: [PATCH 11/31] rename fn seqvars to consequence --- src/annotate/mod.rs | 2 +- src/annotate/seqvars/mod.rs | 7 +++---- src/annotate/seqvars/provider.rs | 2 +- src/server/run/mod.rs | 2 +- src/verify/seqvars.rs | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/annotate/mod.rs b/src/annotate/mod.rs index b7471115..70fad2fe 100644 --- a/src/annotate/mod.rs +++ b/src/annotate/mod.rs @@ -4,9 +4,9 @@ use noodles::vcf::header::FileFormat; use noodles::vcf::variant::record::samples::series::value::genotype::Phasing; use noodles::vcf::variant::record_buf::samples::sample::value::Genotype; +pub(crate) mod cli; pub mod seqvars; pub mod strucvars; -pub(crate) mod cli; const VCF_4_4: FileFormat = FileFormat::new(4, 4); diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 72ff8e2c..9bac7b87 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1332,11 +1332,10 @@ pub(crate) struct Annotator { } impl Annotator { - pub(crate) fn seqvars(&self) -> Option<&ConsequenceAnnotator> { + pub(crate) fn consequence(&self) -> Option<&ConsequenceAnnotator> { for annotator in &self.annotators { - match annotator { - AnnotatorEnum::Consequence(a) => return Some(a), - _ => (), + if let AnnotatorEnum::Consequence(a) = annotator { + return Some(a); } } None diff --git a/src/annotate/seqvars/provider.rs b/src/annotate/seqvars/provider.rs index ee8634e3..2d714ddc 100644 --- a/src/annotate/seqvars/provider.rs +++ b/src/annotate/seqvars/provider.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; +use crate::annotate::cli::{TranscriptPickMode, TranscriptPickType}; use crate::db::create::Reason; use crate::db::TranscriptDatabase; use crate::{ @@ -23,7 +24,6 @@ use hgvs::{ sequences::{seq_md5, TranslationTable}, }; use itertools::Itertools; -use crate::annotate::cli::{TranscriptPickMode, TranscriptPickType}; /// Mitochondrial accessions. const MITOCHONDRIAL_ACCESSIONS: &[&str] = &[ diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index d0ca31f5..945705e4 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -184,7 +184,7 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a let annotator = setup_seqvars_annotator(&args.sources, &args.transcript_settings, Some(assembly))?; - if let Some(seqvars_csq_predictor) = annotator.seqvars().map(|a| &a.predictor) { + if let Some(seqvars_csq_predictor) = annotator.consequence().map(|a| &a.predictor) { let config = ConfigBuilder::default() .report_most_severe_consequence_by( args.transcript_settings.report_most_severe_consequence_by, diff --git a/src/verify/seqvars.rs b/src/verify/seqvars.rs index 45fee039..89bbfca0 100644 --- a/src/verify/seqvars.rs +++ b/src/verify/seqvars.rs @@ -7,6 +7,7 @@ use std::{ time::Instant, }; +use crate::annotate::cli::{ConsequenceBy, TranscriptPickMode, TranscriptPickType}; use crate::annotate::seqvars::{ csq::{ConfigBuilder as ConsequencePredictorConfigBuilder, ConsequencePredictor, VcfVariant}, load_tx_db, path_component, @@ -16,7 +17,6 @@ use biocommons_bioutils::assemblies::Assembly; use clap::Parser; use noodles::core::{Position, Region}; use quick_cache::unsync::Cache; -use crate::annotate::cli::{ConsequenceBy, TranscriptPickMode, TranscriptPickType}; /// Command line arguments for `verify seqvars` sub command. #[derive(Parser, Debug)] From d64eaad09bb7bf461a77ee1391680a02944fb68f Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Tue, 17 Dec 2024 11:59:59 +0100 Subject: [PATCH 12/31] remove databases.is_empty assertion because that is tested anyway --- src/db/merge.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/db/merge.rs b/src/db/merge.rs index 29a88163..4e1b06a2 100644 --- a/src/db/merge.rs +++ b/src/db/merge.rs @@ -22,7 +22,6 @@ pub struct Args { pub fn merge_transcript_databases( mut databases: Vec, ) -> anyhow::Result { - assert!(!databases.is_empty()); if let Some((first, others)) = databases.split_first_mut() { if !others.is_empty() { tracing::info!("Merging multiple transcript databases into one"); From d0672ab8115d612da39dfa837e7fdcaf94ef2a1b Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 11:26:51 +0100 Subject: [PATCH 13/31] update sources help texts --- src/annotate/cli.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/annotate/cli.rs b/src/annotate/cli.rs index 2b79bcf2..d5561272 100644 --- a/src/annotate/cli.rs +++ b/src/annotate/cli.rs @@ -4,10 +4,24 @@ use strum::{Display, VariantArray}; #[derive(Debug, ClapArgs)] #[group(required = true, multiple = true)] pub struct Sources { + + /// Transcript database containing the transcript information. + /// + /// Pre-built databases are available at https://github.com/varfish-org/mehari-data-tx/releases #[arg(long)] pub transcripts: Option>, + + /// Frequency database. + /// + /// The frequency database contains gnomAD frequencies for the variants. + /// Pre-built databases are available at TODO #[arg(long)] pub frequencies: Option, + + /// ClinVar database. + /// + /// The ClinVar database contains clinical significance information for the variants. + /// Pre-built databases are available at https://github.com/varfish-org/annonars-data-clinvar/releases #[arg(long)] pub clinvar: Option, } @@ -18,7 +32,7 @@ pub struct TranscriptSettings { #[arg(long, value_enum, default_value_t = TranscriptSource::Both)] pub transcript_source: TranscriptSource, - /// Whether to report only the worst consequence for each picked transcript. + /// Whether to report only the most severe consequence, grouped by gene, transcript, or allele. #[arg(long)] pub report_most_severe_consequence_by: Option, From eea025e6dd531c74e81033828864d0e9e0efe9c3 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 11:28:22 +0100 Subject: [PATCH 14/31] fmt --- src/annotate/cli.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/annotate/cli.rs b/src/annotate/cli.rs index d5561272..c8ae0a37 100644 --- a/src/annotate/cli.rs +++ b/src/annotate/cli.rs @@ -4,7 +4,6 @@ use strum::{Display, VariantArray}; #[derive(Debug, ClapArgs)] #[group(required = true, multiple = true)] pub struct Sources { - /// Transcript database containing the transcript information. /// /// Pre-built databases are available at https://github.com/varfish-org/mehari-data-tx/releases From e93b5f1fe9b7db346c7212bf2916b2495516c3bb Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 11:28:35 +0100 Subject: [PATCH 15/31] include strand in seqvars/csq --- src/server/run/actix_server/seqvars_csq.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/server/run/actix_server/seqvars_csq.rs b/src/server/run/actix_server/seqvars_csq.rs index 9ffb91c1..e25ff51b 100644 --- a/src/server/run/actix_server/seqvars_csq.rs +++ b/src/server/run/actix_server/seqvars_csq.rs @@ -73,6 +73,8 @@ pub(crate) struct SeqvarsCsqResultEntry { pub protein_pos: Option, /// Distance to feature. pub distance: Option, + /// Strand of the alignment + pub strand: i32, /// Optional list of warnings and error messages. pub messages: Option>, } @@ -146,6 +148,7 @@ async fn handle_impl( cds_pos, protein_pos, distance, + strand, messages, .. } = ann_field; @@ -173,6 +176,7 @@ async fn handle_impl( cds_pos, protein_pos, distance, + strand, messages, }; result.push(entry); From ff126ceb7f35017b2df7f8987ae248cc0233788a Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 12:37:51 +0100 Subject: [PATCH 16/31] add frequency endpoint (wip) --- src/annotate/seqvars/mod.rs | 131 +++++++++++++-- src/server/run/actix_server/frequencies.rs | 183 +++++++++++++++++++++ src/server/run/actix_server/mod.rs | 5 + src/server/run/mod.rs | 28 +++- 4 files changed, 332 insertions(+), 15 deletions(-) create mode 100644 src/server/run/actix_server/frequencies.rs diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 9bac7b87..ce805cfe 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1340,8 +1340,27 @@ impl Annotator { } None } + + pub(crate) fn frequencies(&self) -> Option<&FrequencyAnnotator> { + for annotator in &self.annotators { + if let AnnotatorEnum::Frequency(a) = annotator { + return Some(a); + } + } + None + } + + pub(crate) fn clinvar(&self) -> Option<&ClinvarAnnotator> { + for annotator in &self.annotators { + if let AnnotatorEnum::Clinvar(a) = annotator { + return Some(a); + } + } + None + } } +#[derive(Debug)] pub struct FrequencyAnnotator { db: DBWithThreadMode, } @@ -1413,7 +1432,7 @@ impl FrequencyAnnotator { Ok(()) } - /// Annotate record on gonomosomal chromosome with gnomAD exomes/genomes. + /// Annotate record on gonosomal chromosome with gnomAD exomes/genomes. pub fn annotate_record_xy( &self, key: &[u8], @@ -1423,51 +1442,47 @@ impl FrequencyAnnotator { .db .get_cf(self.db.cf_handle("gonosomal").as_ref().unwrap(), key)? { - let auto_record = xy::Record::from_buf(&freq); + let xy_record = xy::Record::from_buf(&freq); vcf_record.info_mut().insert( "gnomad_exomes_an".into(), - Some(field::Value::Integer(auto_record.gnomad_exomes.an as i32)), + Some(field::Value::Integer(xy_record.gnomad_exomes.an as i32)), ); vcf_record.info_mut().insert( "gnomad_exomes_hom".into(), - Some(field::Value::Integer( - auto_record.gnomad_exomes.ac_hom as i32, - )), + Some(field::Value::Integer(xy_record.gnomad_exomes.ac_hom as i32)), ); vcf_record.info_mut().insert( "gnomad_exomes_het".into(), - Some(field::Value::Integer( - auto_record.gnomad_exomes.ac_het as i32, - )), + Some(field::Value::Integer(xy_record.gnomad_exomes.ac_het as i32)), ); vcf_record.info_mut().insert( "gnomad_exomes_hemi".into(), Some(field::Value::Integer( - auto_record.gnomad_exomes.ac_hemi as i32, + xy_record.gnomad_exomes.ac_hemi as i32, )), ); vcf_record.info_mut().insert( "gnomad_genomes_an".into(), - Some(field::Value::Integer(auto_record.gnomad_genomes.an as i32)), + Some(field::Value::Integer(xy_record.gnomad_genomes.an as i32)), ); vcf_record.info_mut().insert( "gnomad_genomes_hom".into(), Some(field::Value::Integer( - auto_record.gnomad_genomes.ac_hom as i32, + xy_record.gnomad_genomes.ac_hom as i32, )), ); vcf_record.info_mut().insert( "gnomad_genomes_het".into(), Some(field::Value::Integer( - auto_record.gnomad_genomes.ac_het as i32, + xy_record.gnomad_genomes.ac_het as i32, )), ); vcf_record.info_mut().insert( "gnomad_genomes_hemi".into(), Some(field::Value::Integer( - auto_record.gnomad_genomes.ac_hemi as i32, + xy_record.gnomad_genomes.ac_hemi as i32, )), ); }; @@ -1537,6 +1552,94 @@ impl FrequencyAnnotator { } Ok(()) } + + pub(crate) fn annotate_variant( + &self, + vcf_var: &VcfVariant, + ) -> anyhow::Result> + { + // Only attempt lookups into RocksDB for canonical contigs. + if !is_canonical(&vcf_var.chromosome) { + return Ok(None); + } + + // Build key for RocksDB database + let vcf_var = keys::Var::from( + &vcf_var.chromosome, + vcf_var.position, + &vcf_var.reference, + &vcf_var.alternative, + ); + let key: Vec = vcf_var.clone().into(); + use crate::server::run::actix_server::frequencies::*; + // Annotate with frequency. + if CHROM_AUTO.contains(vcf_var.chrom.as_str()) { + if let Some(freq) = self + .db + .get_cf(self.db.cf_handle("autosomal").as_ref().unwrap(), key)? + { + let val = auto::Record::from_buf(&freq); + Ok(Some(FrequencyResultEntry::Autosomal( + AutosomalResultEntry { + gnomad_exomes_an: val.gnomad_exomes.an, + gnomad_exomes_hom: val.gnomad_exomes.ac_hom, + gnomad_exomes_het: val.gnomad_exomes.ac_het, + gnomad_genomes_an: val.gnomad_genomes.an, + gnomad_genomes_hom: val.gnomad_genomes.ac_hom, + gnomad_genomes_het: val.gnomad_genomes.ac_het, + }, + ))) + } else { + Err(anyhow!("No frequency data found for variant {:?}", vcf_var)) + } + } else if CHROM_XY.contains(vcf_var.chrom.as_str()) { + if let Some(freq) = self + .db + .get_cf(self.db.cf_handle("gonosomal").as_ref().unwrap(), key)? + { + let val = xy::Record::from_buf(&freq); + Ok(Some(FrequencyResultEntry::Gonosomal( + GonosomalResultEntry { + gnomad_exomes_an: val.gnomad_exomes.an, + gnomad_exomes_hom: val.gnomad_exomes.ac_hom, + gnomad_exomes_het: val.gnomad_exomes.ac_het, + gnomad_exomes_hemi: val.gnomad_exomes.ac_hemi, + gnomad_genomes_an: val.gnomad_genomes.an, + gnomad_genomes_hom: val.gnomad_genomes.ac_hom, + gnomad_genomes_het: val.gnomad_genomes.ac_het, + gnomad_genomes_hemi: val.gnomad_genomes.ac_hemi, + }, + ))) + } else { + Err(anyhow!("No frequency data found for variant {:?}", vcf_var)) + } + } else if CHROM_MT.contains(vcf_var.chrom.as_str()) { + if let Some(freq) = self + .db + .get_cf(self.db.cf_handle("mitochondrial").as_ref().unwrap(), key)? + { + let val = mt::Record::from_buf(&freq); + Ok(Some(FrequencyResultEntry::Mitochondrial( + MitochondrialResultEntry { + helix_an: val.helixmtdb.an, + helix_hom: val.helixmtdb.ac_hom, + helix_het: val.helixmtdb.ac_het, + gnomad_genomes_an: val.gnomad_mtdna.an, + gnomad_genomes_hom: val.gnomad_mtdna.ac_hom, + gnomad_genomes_het: val.gnomad_mtdna.ac_het, + }, + ))) + } else { + Err(anyhow!("No frequency data found for variant {:?}", vcf_var)) + } + } else { + tracing::trace!( + "Record @{:?} on non-canonical chromosome, skipping.", + &vcf_var + ); + Ok(None) + } + } } pub struct ClinvarAnnotator { diff --git a/src/server/run/actix_server/frequencies.rs b/src/server/run/actix_server/frequencies.rs new file mode 100644 index 00000000..758dd8fe --- /dev/null +++ b/src/server/run/actix_server/frequencies.rs @@ -0,0 +1,183 @@ +//! Implementation of endpoint `/api/v1/seqvars/csq`. +//! +//! Also includes the implementation of the `/seqvars/csq` endpoint (deprecated). + +use actix_web::{ + get, + web::{self, Data, Json, Path}, +}; + +use crate::{annotate::seqvars::csq::VcfVariant, common::GenomeRelease}; + +use super::{versions::VersionsInfoResponse, CustomError}; + +/// Query parameters of the `/api/v1/seqvars/csq` endpoint. +#[derive( + Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::IntoParams, utoipa::ToSchema, +)] +#[serde(rename_all = "snake_case")] +#[serde_with::skip_serializing_none] +pub(crate) struct FrequencyQuery { + /// The assembly. + pub genome_release: GenomeRelease, + /// SPDI sequence. + pub chromosome: String, + /// SPDI position. + pub position: u32, + /// SPDI deletion. + pub reference: String, + /// SPDI insertion. + pub alternative: String, + /// Optionally, the HGNC ID of the gene to limit to. + pub hgnc_id: Option, +} + +/// One entry in `FrequencyResponse`. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +pub(crate) enum FrequencyResultEntry { + Autosomal(AutosomalResultEntry), + Gonosomal(GonosomalResultEntry), + Mitochondrial(MitochondrialResultEntry), +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +pub(crate) struct AutosomalResultEntry { + pub gnomad_exomes_an: u32, + + pub gnomad_exomes_hom: u32, + + pub gnomad_exomes_het: u32, + + pub gnomad_genomes_an: u32, + + pub gnomad_genomes_hom: u32, + + pub gnomad_genomes_het: u32, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +pub(crate) struct GonosomalResultEntry { + pub gnomad_exomes_an: u32, + + pub gnomad_exomes_hom: u32, + + pub gnomad_exomes_het: u32, + + pub gnomad_exomes_hemi: u32, + + pub gnomad_genomes_an: u32, + + pub gnomad_genomes_hom: u32, + + pub gnomad_genomes_het: u32, + + pub gnomad_genomes_hemi: u32, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +pub(crate) struct MitochondrialResultEntry { + pub helix_an: u32, + + pub helix_hom: u32, + + pub helix_het: u32, + + pub gnomad_genomes_an: u32, + + pub gnomad_genomes_hom: u32, + + pub gnomad_genomes_het: u32, +} + +/// Response of the `/api/v1/frequency` endpoint. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +pub(crate) struct FrequencyResponse { + /// Version information. + pub version: VersionsInfoResponse, + /// The original query records. + pub query: FrequencyQuery, + /// The resulting records for the scored genes. + pub result: Vec, +} + +/// Implementation of endpoints. +async fn handle_impl( + data: Data, + _path: Path<()>, + query: web::Query, +) -> actix_web::Result, super::CustomError> { + let FrequencyQuery { + genome_release, + chromosome, + position, + reference, + alternative, + hgnc_id, + } = query.clone().into_inner(); + + let annotator = data + .frequency_annotators + .get(&genome_release) + .ok_or_else(|| { + super::CustomError::new(anyhow::anyhow!( + "genome release not supported: {:?}", + &query.genome_release + )) + })?; + + let mut result = Vec::new(); + let g_var = VcfVariant { + chromosome, + position: position as i32, + reference, + alternative, + }; + let frequencies = annotator + .annotate_variant(&g_var) + .map_err(|e| super::CustomError::new(anyhow::anyhow!("annotation failed: {}", &e)))?; + if let Some(frequencies) = frequencies { + result.push(frequencies); + } + + let result = FrequencyResponse { + version: VersionsInfoResponse::from_web_server_data(data.into_inner().as_ref()) + .map_err(|e| CustomError::new(anyhow::anyhow!("Problem determining version: {}", e)))?, + query: query.into_inner(), + result, + }; + + Ok(Json(result)) +} + +/// Query for consequence of a variant. +#[allow(clippy::unused_async)] +#[get("/frequency")] +async fn handle( + data: Data, + _path: Path<()>, + query: web::Query, +) -> actix_web::Result, super::CustomError> { + handle_impl(data, _path, query).await +} + +/// Query for consequence of a variant. +#[allow(clippy::unused_async)] +#[utoipa::path( + get, + operation_id = "frequency", + params( + FrequencyQuery + ), + responses( + (status = 200, description = "Frequency information.", body = FrequencyResponse), + (status = 500, description = "Internal server error.", body = CustomError) + ) +)] +#[get("/api/v1/frequency")] +async fn handle_with_openapi( + data: Data, + _path: Path<()>, + query: web::Query, +) -> actix_web::Result, super::CustomError> { + handle_impl(data, _path, query).await +} diff --git a/src/server/run/actix_server/mod.rs b/src/server/run/actix_server/mod.rs index e888cce0..ef4dbcbf 100644 --- a/src/server/run/actix_server/mod.rs +++ b/src/server/run/actix_server/mod.rs @@ -8,11 +8,13 @@ use utoipa::OpenApi as _; use crate::annotate::seqvars::provider::Provider as MehariProvider; use crate::annotate::strucvars::csq::ConsequencePredictor as StrucvarConsequencePredictor; use crate::{annotate::seqvars::csq::ConsequencePredictor, common::GenomeRelease}; +use crate::annotate::seqvars::FrequencyAnnotator; pub mod gene_txs; pub mod seqvars_csq; pub mod strucvars_csq; pub mod versions; +pub mod frequencies; #[derive(Debug, serde::Serialize, utoipa::ToSchema)] pub struct CustomError { @@ -47,6 +49,7 @@ pub struct WebServerData { /// The structural variant consequence predictors for eacha ssembly. pub strucvars_predictors: std::collections::HashMap, + pub frequency_annotators: std::collections::HashMap, } /// Main entry point for running the REST server. @@ -64,6 +67,8 @@ pub async fn main( .service(seqvars_csq::handle_with_openapi) .service(strucvars_csq::handle) .service(strucvars_csq::handle_with_openapi) + .service(frequencies::handle) + .service(frequencies::handle_with_openapi) .service(versions::handle) .service( utoipa_swagger_ui::SwaggerUi::new("/swagger-ui/{_:.*}") diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 945705e4..350d70fd 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -1,6 +1,6 @@ use crate::annotate::cli::{Sources, TranscriptSettings}; use crate::annotate::seqvars::csq::ConfigBuilder; -use crate::annotate::seqvars::setup_seqvars_annotator; +use crate::annotate::seqvars::{setup_seqvars_annotator, FrequencyAnnotator}; use crate::{ annotate::{ seqvars::csq::ConsequencePredictor as SeqvarConsequencePredictor, @@ -210,6 +210,32 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a genome_release ); } + + if let Some(path) = args.sources.frequencies.as_ref() { + if path + .to_ascii_lowercase() + .contains(&genome_release.name().to_ascii_lowercase()) + { + if let Ok(freq) = FrequencyAnnotator::from_path(path) { + data.frequency_annotators.insert(genome_release, freq); + } else { + tracing::warn!( + "Failed to load frequencies predictor for genome release {:?}, respective endpoint will be unavailable.", + genome_release + ); + } + } else { + tracing::warn!( + "No frequencies predictor for genome release {:?}, respective endpoint will be unavailable.", + genome_release + ); + } + } else { + tracing::warn!( + "No frequencies predictor for genome release {:?}, respective endpoint will be unavailable.", + genome_release + ); + } } let data = actix_web::web::Data::new(data); tracing::info!("... done loading data {:?}", before_loading.elapsed()); From 551d6cf7059adeb953bf27dd5db0ba6b7e1e7d47 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 12:44:56 +0100 Subject: [PATCH 17/31] =?UTF-8?q?update=20'try:=20=E2=80=A6'=20hints=20to?= =?UTF-8?q?=20openapi?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/annotate/seqvars/mod.rs | 2 +- src/server/run/actix_server/frequencies.rs | 8 ++++---- src/server/run/actix_server/mod.rs | 4 ++-- src/server/run/mod.rs | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index ce805cfe..bf15e35c 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1562,7 +1562,7 @@ impl FrequencyAnnotator { if !is_canonical(&vcf_var.chromosome) { return Ok(None); } - + // Build key for RocksDB database let vcf_var = keys::Var::from( &vcf_var.chromosome, diff --git a/src/server/run/actix_server/frequencies.rs b/src/server/run/actix_server/frequencies.rs index 758dd8fe..b08f3356 100644 --- a/src/server/run/actix_server/frequencies.rs +++ b/src/server/run/actix_server/frequencies.rs @@ -1,6 +1,6 @@ -//! Implementation of endpoint `/api/v1/seqvars/csq`. +//! Implementation of endpoint `/api/v1/frequency`. //! -//! Also includes the implementation of the `/seqvars/csq` endpoint (deprecated). +//! Also includes the implementation of the `/frequency` endpoint (deprecated). use actix_web::{ get, @@ -149,7 +149,7 @@ async fn handle_impl( Ok(Json(result)) } -/// Query for consequence of a variant. +/// Query for gnomAD frequencies of a variant. #[allow(clippy::unused_async)] #[get("/frequency")] async fn handle( @@ -160,7 +160,7 @@ async fn handle( handle_impl(data, _path, query).await } -/// Query for consequence of a variant. +/// Query for gnomAD frequencies of a variant. #[allow(clippy::unused_async)] #[utoipa::path( get, diff --git a/src/server/run/actix_server/mod.rs b/src/server/run/actix_server/mod.rs index ef4dbcbf..24b21d54 100644 --- a/src/server/run/actix_server/mod.rs +++ b/src/server/run/actix_server/mod.rs @@ -6,15 +6,15 @@ use actix_web::ResponseError; use utoipa::OpenApi as _; use crate::annotate::seqvars::provider::Provider as MehariProvider; +use crate::annotate::seqvars::FrequencyAnnotator; use crate::annotate::strucvars::csq::ConsequencePredictor as StrucvarConsequencePredictor; use crate::{annotate::seqvars::csq::ConsequencePredictor, common::GenomeRelease}; -use crate::annotate::seqvars::FrequencyAnnotator; +pub mod frequencies; pub mod gene_txs; pub mod seqvars_csq; pub mod strucvars_csq; pub mod versions; -pub mod frequencies; #[derive(Debug, serde::Serialize, utoipa::ToSchema)] pub struct CustomError { diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 350d70fd..68db66fa 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -125,7 +125,7 @@ pub fn print_hints(args: &Args) { // The endpoint `/genes/txs` provides transcript information. tracing::info!( - " try: http://{}:{}/genes/txs?hgncId=HGNC:1100&\ + " try: http://{}:{}/api/v1/genes/transcripts?hgnc_id=HGNC:1100&genome_build=grch37&\ genomeBuild=GENOME_BUILD_GRCH37", args.listen_host.as_str(), args.listen_port @@ -133,27 +133,27 @@ pub fn print_hints(args: &Args) { // The endpoint `/tx/csq` to compute the consequence of a variant; without and with filtering // for HGNC gene ID. tracing::info!( - " try: http://{}:{}/seqvars/csq?genome_release=grch37\ + " try: http://{}:{}/api/v1/seqvars/csq?genome_release=grch37\ &chromosome=17&position=48275363&reference=C&alternative=A", args.listen_host.as_str(), args.listen_port ); tracing::info!( - " try: http://{}:{}/seqvars/csq?genome_release=grch37\ + " try: http://{}:{}/api/v1/seqvars/csq?genome_release=grch37\ &chromosome=17&position=48275363&reference=C&alternative=A&hgnc_id=HGNC:2197", args.listen_host.as_str(), args.listen_port ); // The endpoint `/strucvars/csq` computes the consequence of an SV. tracing::info!( - " try: http://{}:{}/strucvars/csq?genome_release=grch37\ + " try: http://{}:{}/api/v1/strucvars/csq?genome_release=grch37\ &chromosome=17&start=48275360&&stop=48275370&sv_type=DEL", args.listen_host.as_str(), args.listen_port ); // The endpoint `/structvars/csq` computes the consequence of an SV. tracing::info!( - " try: http://{}:{}/strucvars/csq?genome_release=grch37\ + " try: http://{}:{}/api/v1/strucvars/csq?genome_release=grch37\ &chromosome=17&start=48275360&&stop=48275370&sv_type=DEL", args.listen_host.as_str(), args.listen_port From 40796abd1bfdcc0bfce594efebc30cd13d97243a Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 13:11:57 +0100 Subject: [PATCH 18/31] do not initialize predictors/annotators multiple times --- src/annotate/seqvars/mod.rs | 86 +++++++++------------- src/server/run/actix_server/frequencies.rs | 3 - src/server/run/mod.rs | 56 ++++++++------ 3 files changed, 67 insertions(+), 78 deletions(-) diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index bf15e35c..887d1915 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1331,35 +1331,6 @@ pub(crate) struct Annotator { annotators: Vec, } -impl Annotator { - pub(crate) fn consequence(&self) -> Option<&ConsequenceAnnotator> { - for annotator in &self.annotators { - if let AnnotatorEnum::Consequence(a) = annotator { - return Some(a); - } - } - None - } - - pub(crate) fn frequencies(&self) -> Option<&FrequencyAnnotator> { - for annotator in &self.annotators { - if let AnnotatorEnum::Frequency(a) = annotator { - return Some(a); - } - } - None - } - - pub(crate) fn clinvar(&self) -> Option<&ClinvarAnnotator> { - for annotator in &self.annotators { - if let AnnotatorEnum::Clinvar(a) = annotator { - return Some(a); - } - } - None - } -} - #[derive(Debug)] pub struct FrequencyAnnotator { db: DBWithThreadMode, @@ -1997,7 +1968,6 @@ pub(crate) fn setup_seqvars_annotator( assembly: Option, ) -> Result { let mut annotators = vec![]; - let pb_assembly = assembly.as_ref().and_then(proto_assembly_from); // Add the frequency annotator if requested. if let Some(rocksdb_path) = &sources.frequencies { @@ -2037,29 +2007,7 @@ pub(crate) fn setup_seqvars_annotator( if let Some(tx_sources) = &sources.transcripts { tracing::info!("Opening transcript database(s)"); - // Filter out any transcript databases that do not match the requested assembly. - let check_assembly = |db: &TxSeqDatabase, assembly: crate::pbs::txs::Assembly| { - db.source_version - .iter() - .map(|s| s.assembly) - .any(|a| a == i32::from(assembly)) - }; - let databases = tx_sources - .iter() - .enumerate() - .map(|(i, path)| (i, load_tx_db(path))) - .filter_map(|(i, txdb)| match txdb { - Ok(db) => match pb_assembly { - Some(assembly) if check_assembly(&db, assembly) => Some(Ok(db)), - Some(_) => { - tracing::info!("Skipping transcript database {} as its version {:?} does not support the requested assembly ({:?})", &tx_sources[i], &db.source_version, &assembly); - None - }, - None => Some(Ok(db)), - }, - Err(_) => Some(txdb), - }) - .collect::>>()?; + let databases = load_transcript_dbs_for_assembly(tx_sources, assembly)?; if databases.is_empty() { tracing::warn!("No suitable transcript databases found for requested assembly {:?}, therefore no consequence prediction will occur.", &assembly); @@ -2080,6 +2028,38 @@ pub(crate) fn setup_seqvars_annotator( Ok(annotator) } +pub(crate) fn load_transcript_dbs_for_assembly( + tx_sources: &Vec, + assembly: Option, +) -> Result, Error> { + let pb_assembly = assembly.as_ref().and_then(proto_assembly_from); + + // Filter out any transcript databases that do not match the requested assembly. + let check_assembly = |db: &TxSeqDatabase, assembly: crate::pbs::txs::Assembly| { + db.source_version + .iter() + .map(|s| s.assembly) + .any(|a| a == i32::from(assembly)) + }; + let databases = tx_sources + .iter() + .enumerate() + .map(|(i, path)| (i, load_tx_db(path))) + .filter_map(|(i, txdb)| match txdb { + Ok(db) => match pb_assembly { + Some(assembly) if check_assembly(&db, assembly) => Some(Ok(db)), + Some(_) => { + tracing::info!("Skipping transcript database {} as its version {:?} does not support the requested assembly ({:?})", &tx_sources[i], &db.source_version, &assembly); + None + }, + None => Some(Ok(db)), + }, + Err(_) => Some(txdb), + }) + .collect::>>()?; + Ok(databases) +} + /// Create for all alternate alleles from the given VCF record. pub fn from_vcf_allele(value: &noodles::vcf::variant::RecordBuf, allele_no: usize) -> keys::Var { let chrom = value.reference_sequence_name().to_string(); diff --git a/src/server/run/actix_server/frequencies.rs b/src/server/run/actix_server/frequencies.rs index b08f3356..99635014 100644 --- a/src/server/run/actix_server/frequencies.rs +++ b/src/server/run/actix_server/frequencies.rs @@ -28,8 +28,6 @@ pub(crate) struct FrequencyQuery { pub reference: String, /// SPDI insertion. pub alternative: String, - /// Optionally, the HGNC ID of the gene to limit to. - pub hgnc_id: Option, } /// One entry in `FrequencyResponse`. @@ -112,7 +110,6 @@ async fn handle_impl( position, reference, alternative, - hgnc_id, } = query.clone().into_inner(); let annotator = data diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 68db66fa..72b30ef2 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -1,6 +1,9 @@ use crate::annotate::cli::{Sources, TranscriptSettings}; use crate::annotate::seqvars::csq::ConfigBuilder; -use crate::annotate::seqvars::{setup_seqvars_annotator, FrequencyAnnotator}; +use crate::annotate::seqvars::{ + load_transcript_dbs_for_assembly, ConsequenceAnnotator, FrequencyAnnotator, +}; +use crate::db::merge::merge_transcript_databases; use crate::{ annotate::{ seqvars::csq::ConsequencePredictor as SeqvarConsequencePredictor, @@ -182,28 +185,37 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a tracing::info!(" - loading genome release {:?}", genome_release); let assembly = genome_release.into(); - let annotator = - setup_seqvars_annotator(&args.sources, &args.transcript_settings, Some(assembly))?; - if let Some(seqvars_csq_predictor) = annotator.consequence().map(|a| &a.predictor) { - let config = ConfigBuilder::default() - .report_most_severe_consequence_by( - args.transcript_settings.report_most_severe_consequence_by, - ) - .transcript_source(args.transcript_settings.transcript_source) - .build()?; - - let provider = seqvars_csq_predictor.provider.clone(); - data.provider.insert(genome_release, provider.clone()); + if let Some(tx_db_paths) = args.sources.transcripts.as_ref() { tracing::info!(" - building seqvars predictors"); - data.seqvars_predictors.insert( - genome_release, - SeqvarConsequencePredictor::new(provider.clone(), config), - ); - tracing::info!(" - building strucvars predictors"); - data.strucvars_predictors.insert( - genome_release, - StrucvarConsequencePredictor::new(provider.clone(), assembly), - ); + let tx_dbs = load_transcript_dbs_for_assembly(tx_db_paths, Some(assembly))?; + if tx_dbs.is_empty() { + tracing::warn!( + "No transcript databases loaded, respective endpoint will be unavailable." + ); + } else { + let tx_db = merge_transcript_databases(tx_dbs)?; + let annotator = + ConsequenceAnnotator::from_db_and_settings(tx_db, &args.transcript_settings)?; + let config = ConfigBuilder::default() + .report_most_severe_consequence_by( + args.transcript_settings.report_most_severe_consequence_by, + ) + .transcript_source(args.transcript_settings.transcript_source) + .build()?; + + let provider = annotator.predictor.provider.clone(); + data.provider.insert(genome_release, provider.clone()); + data.seqvars_predictors.insert( + genome_release, + SeqvarConsequencePredictor::new(provider.clone(), config), + ); + + tracing::info!(" - building strucvars predictors"); + data.strucvars_predictors.insert( + genome_release, + StrucvarConsequencePredictor::new(provider.clone(), assembly), + ); + } } else { tracing::warn!( "No predictors for genome release {:?}, respective endpoint will be unavailable.", From 9a70495e84c88707c4649a49065a256fe9999c82 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 13:21:32 +0100 Subject: [PATCH 19/31] add frequency to apidocs --- src/server/run/mod.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 72b30ef2..9f310e95 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -26,6 +26,10 @@ pub mod openapi { StrucvarsGeneTranscriptEffects, StrucvarsTranscriptEffect, }; use crate::common::GenomeRelease; + use crate::server::run::actix_server::frequencies::{ + AutosomalResultEntry, FrequencyQuery, FrequencyResponse, FrequencyResultEntry, + GonosomalResultEntry, MitochondrialResultEntry, + }; use crate::server::run::actix_server::gene_txs::{ ExonAlignment, GenesTranscriptsListQuery, GenesTranscriptsListResponse, GenomeAlignment, Strand, Transcript, TranscriptBiotype, TranscriptTag, @@ -40,7 +44,9 @@ pub mod openapi { Assembly, DataVersionEntry, SoftwareVersions, VersionsInfoResponse, }; - use super::actix_server::{gene_txs, seqvars_csq, strucvars_csq, versions, CustomError}; + use super::actix_server::{ + frequencies, gene_txs, seqvars_csq, strucvars_csq, versions, CustomError, + }; /// Utoipa-based `OpenAPI` generation helper. #[derive(utoipa::OpenApi)] @@ -50,7 +56,8 @@ pub mod openapi { gene_txs::handle_with_openapi, seqvars_csq::handle_with_openapi, strucvars_csq::handle_with_openapi, - strucvars_csq::handle_with_openapi + strucvars_csq::handle_with_openapi, + frequencies::handle_with_openapi, ), components(schemas( Assembly, @@ -83,6 +90,12 @@ pub mod openapi { Transcript, TranscriptBiotype, TranscriptTag, + FrequencyQuery, + FrequencyResponse, + FrequencyResultEntry, + AutosomalResultEntry, + GonosomalResultEntry, + MitochondrialResultEntry, )) )] pub struct ApiDoc; From cc078768f286c50dcd5ce04b34d5b1e8c521ce76 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 14:19:58 +0100 Subject: [PATCH 20/31] also allow multiple --frequencies and --clinvar options --- src/annotate/cli.rs | 4 +- src/annotate/seqvars/mod.rs | 104 +++++++++++++++++++++++------------- src/server/run/mod.rs | 38 ++++++------- 3 files changed, 83 insertions(+), 63 deletions(-) diff --git a/src/annotate/cli.rs b/src/annotate/cli.rs index c8ae0a37..ed76b084 100644 --- a/src/annotate/cli.rs +++ b/src/annotate/cli.rs @@ -15,14 +15,14 @@ pub struct Sources { /// The frequency database contains gnomAD frequencies for the variants. /// Pre-built databases are available at TODO #[arg(long)] - pub frequencies: Option, + pub frequencies: Option>, /// ClinVar database. /// /// The ClinVar database contains clinical significance information for the variants. /// Pre-built databases are available at https://github.com/varfish-org/annonars-data-clinvar/releases #[arg(long)] - pub clinvar: Option, + pub clinvar: Option>, } #[derive(Debug, ClapArgs, Default, Clone)] diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 887d1915..0f8b9fb1 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1970,36 +1970,18 @@ pub(crate) fn setup_seqvars_annotator( let mut annotators = vec![]; // Add the frequency annotator if requested. - if let Some(rocksdb_path) = &sources.frequencies { - let skip = assembly.map_or(false, |a| !rocksdb_path.contains(path_component(a))); - if !skip { - tracing::info!( - "Loading frequency database for assembly {:?} from {}", - &assembly, - &rocksdb_path - ); - annotators - .push(FrequencyAnnotator::from_path(rocksdb_path).map(AnnotatorEnum::Frequency)?) - } else { - tracing::warn!("Skipping frequency database as its assembly does not match the requested one ({:?})", &assembly); + if let Some(rocksdb_paths) = &sources.frequencies { + let freq_dbs = load_frequency_dbs_for_assembly(rocksdb_paths, assembly)?; + for freq_db in freq_dbs { + annotators.push(AnnotatorEnum::Frequency(freq_db)) } } // Add the ClinVar annotator if requested. - if let Some(rocksdb_path) = &sources.clinvar { - let skip = assembly.map_or(false, |a| !rocksdb_path.contains(path_component(a))); - if !skip { - tracing::info!( - "Loading ClinVar database for assembly {:?} from {}", - &assembly, - &rocksdb_path - ); - annotators.push(ClinvarAnnotator::from_path(rocksdb_path).map(AnnotatorEnum::Clinvar)?) - } else { - tracing::warn!( - "Skipping ClinVar database as its assembly does not match the requested one ({:?})", - &assembly - ); + if let Some(rocksdb_paths) = &sources.clinvar { + let clinvar_dbs = load_clinvar_dbs_for_assembly(rocksdb_paths, assembly)?; + for clinvar_db in clinvar_dbs { + annotators.push(AnnotatorEnum::Clinvar(clinvar_db)) } } @@ -2028,6 +2010,52 @@ pub(crate) fn setup_seqvars_annotator( Ok(annotator) } +pub(crate) fn load_clinvar_dbs_for_assembly( + rocksdb_paths: &[String], + assembly: Option, +) -> Result, Error> { + rocksdb_paths + .iter() + .filter_map(|rocksdb_path| { + let skip = assembly.map_or(false, |a| !rocksdb_path.contains(path_component(a))); + if !skip { + tracing::info!( + "Loading ClinVar database for assembly {:?} from {}", + &assembly, + &rocksdb_path + ); + Some(ClinvarAnnotator::from_path(rocksdb_path)) + } else { + tracing::warn!( + "Skipping ClinVar database as its assembly does not match the requested one ({:?})", + &assembly + ); + None + } + }) + .collect() +} + +pub(crate) fn load_frequency_dbs_for_assembly( + rocksdb_paths: &[String], + assembly: Option, +) -> Result, Error> { + rocksdb_paths.iter().filter_map(|rocksdb_path| { + let skip = assembly.map_or(false, |a| !rocksdb_path.contains(path_component(a))); + if !skip { + tracing::info!( + "Loading frequency database for assembly {:?} from {}", + &assembly, + &rocksdb_path + ); + Some(FrequencyAnnotator::from_path(rocksdb_path)) + } else { + tracing::warn!("Skipping frequency database as its assembly does not match the requested one ({:?})", &assembly); + None + } + }).collect() +} + pub(crate) fn load_transcript_dbs_for_assembly( tx_sources: &Vec, assembly: Option, @@ -2113,8 +2141,8 @@ mod test { "tests/data/annotate/seqvars/brca1.examples.ped", )), sources: Sources { - frequencies: Some(format!("{prefix}/{assembly}/seqvars/freqs")), - clinvar: Some(format!("{prefix}/{assembly}/seqvars/clinvar")), + frequencies: Some(vec![format!("{prefix}/{assembly}/seqvars/freqs")]), + clinvar: Some(vec![format!("{prefix}/{assembly}/seqvars/clinvar")]), transcripts: Some(vec![format!("{prefix}/{assembly}/txs.bin.zst")]), }, hgnc: None, @@ -2154,8 +2182,8 @@ mod test { "tests/data/annotate/seqvars/brca1.examples.ped", )), sources: Sources { - frequencies: Some(format!("{prefix}/{assembly}/seqvars/freqs")), - clinvar: Some(format!("{prefix}/{assembly}/seqvars/clinvar")), + frequencies: Some(vec![format!("{prefix}/{assembly}/seqvars/freqs")]), + clinvar: Some(vec![format!("{prefix}/{assembly}/seqvars/clinvar")]), transcripts: Some(vec![format!("{prefix}/{assembly}/txs.bin.zst")]), }, hgnc: Some(format!("{prefix}/hgnc.tsv")), @@ -2207,8 +2235,8 @@ mod test { "tests/data/annotate/seqvars/badly_formed_vcf_entry.ped", )), sources: Sources { - frequencies: Some(format!("{prefix}/{assembly}/seqvars/freqs")), - clinvar: Some(format!("{prefix}/{assembly}/seqvars/clinvar")), + frequencies: Some(vec![format!("{prefix}/{assembly}/seqvars/freqs")]), + clinvar: Some(vec![format!("{prefix}/{assembly}/seqvars/clinvar")]), transcripts: Some(vec![format!("{prefix}/{assembly}/txs.bin.zst")]), }, hgnc: Some(format!("{prefix}/hgnc.tsv")), @@ -2254,8 +2282,8 @@ mod test { "tests/data/annotate/seqvars/mitochondrial_variants.ped", )), sources: Sources { - frequencies: Some(format!("{prefix}/{assembly}/seqvars/freqs")), - clinvar: Some(format!("{prefix}/{assembly}/seqvars/clinvar")), + frequencies: Some(vec![format!("{prefix}/{assembly}/seqvars/freqs")]), + clinvar: Some(vec![format!("{prefix}/{assembly}/seqvars/clinvar")]), transcripts: Some(vec![format!("{prefix}/{assembly}/txs.bin.zst")]), }, hgnc: Some(format!("{prefix}/hgnc.tsv")), @@ -2303,8 +2331,8 @@ mod test { "tests/data/annotate/seqvars/clair3-glnexus-min.ped", )), sources: Sources { - frequencies: Some(format!("{prefix}/{assembly}/seqvars/freqs")), - clinvar: Some(format!("{prefix}/{assembly}/seqvars/clinvar")), + frequencies: Some(vec![format!("{prefix}/{assembly}/seqvars/freqs")]), + clinvar: Some(vec![format!("{prefix}/{assembly}/seqvars/clinvar")]), transcripts: Some(vec![format!("{prefix}/{assembly}/txs.bin.zst")]), }, hgnc: Some(format!("{prefix}/hgnc.tsv")), @@ -2352,8 +2380,8 @@ mod test { "tests/data/annotate/seqvars/brca2_zar1l/brca2_zar1l.ped", )), sources: Sources { - frequencies: Some(format!("{prefix}/{assembly}/seqvars/freqs")), - clinvar: Some(format!("{prefix}/{assembly}/seqvars/clinvar")), + frequencies: Some(vec![format!("{prefix}/{assembly}/seqvars/freqs")]), + clinvar: Some(vec![format!("{prefix}/{assembly}/seqvars/clinvar")]), transcripts: Some(vec![format!("{prefix}/{assembly}/txs.bin.zst")]), }, hgnc: Some(format!("{prefix}/hgnc.tsv")), diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 9f310e95..32002a92 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -1,7 +1,7 @@ use crate::annotate::cli::{Sources, TranscriptSettings}; use crate::annotate::seqvars::csq::ConfigBuilder; use crate::annotate::seqvars::{ - load_transcript_dbs_for_assembly, ConsequenceAnnotator, FrequencyAnnotator, + load_frequency_dbs_for_assembly, load_transcript_dbs_for_assembly, ConsequenceAnnotator, }; use crate::db::merge::merge_transcript_databases; use crate::{ @@ -194,8 +194,9 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a tracing::info!("Loading database..."); let before_loading = std::time::Instant::now(); let mut data = actix_server::WebServerData::default(); + for genome_release in GenomeRelease::value_variants().iter().copied() { - tracing::info!(" - loading genome release {:?}", genome_release); + tracing::info!("Loading genome release {:?}", genome_release); let assembly = genome_release.into(); if let Some(tx_db_paths) = args.sources.transcripts.as_ref() { @@ -236,30 +237,21 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a ); } - if let Some(path) = args.sources.frequencies.as_ref() { - if path - .to_ascii_lowercase() - .contains(&genome_release.name().to_ascii_lowercase()) - { - if let Ok(freq) = FrequencyAnnotator::from_path(path) { + if let Some(paths) = args.sources.frequencies.as_ref() { + let freqs = load_frequency_dbs_for_assembly(paths, Some(assembly))?; + + match freqs.len() { + 0 => tracing::warn!( + "No frequency databases loaded, respective endpoint will be unavailable." + ), + 1 => { + let freq = freqs.into_iter().next().unwrap(); data.frequency_annotators.insert(genome_release, freq); - } else { - tracing::warn!( - "Failed to load frequencies predictor for genome release {:?}, respective endpoint will be unavailable.", - genome_release - ); } - } else { - tracing::warn!( - "No frequencies predictor for genome release {:?}, respective endpoint will be unavailable.", - genome_release - ); + _ => tracing::warn!( + "Multiple frequency databases loaded, only the first one will be used." + ), } - } else { - tracing::warn!( - "No frequencies predictor for genome release {:?}, respective endpoint will be unavailable.", - genome_release - ); } } let data = actix_web::web::Data::new(data); From b0af0ea42f02775528f6970536bcf5abb04dbfc0 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 15:12:46 +0100 Subject: [PATCH 21/31] whitespace --- src/annotate/strucvars/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs index e1fbee84..09d5809e 100644 --- a/src/annotate/strucvars/mod.rs +++ b/src/annotate/strucvars/mod.rs @@ -61,12 +61,15 @@ pub struct Args { /// Genome release to use, default is to auto-detect. #[arg(long, value_enum)] pub genome_release: Option, + /// Path to the input PED file. #[arg(long)] pub path_input_ped: String, + /// Path to the input VCF files. #[arg(long, required = true)] pub path_input_vcf: Vec, + #[command(flatten)] pub output: PathOutput, From 40976750b51a7e9394e271fe89cf81b2067acd4c Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 16:10:59 +0100 Subject: [PATCH 22/31] add clinvar endpoint --- src/annotate/seqvars/mod.rs | 77 ++++++++-- src/server/run/actix_server/mod.rs | 20 ++- .../run/actix_server/seqvars_clinvar.rs | 132 ++++++++++++++++++ ...{frequencies.rs => seqvars_frequencies.rs} | 14 +- src/server/run/mod.rs | 49 +++++-- 5 files changed, 260 insertions(+), 32 deletions(-) create mode 100644 src/server/run/actix_server/seqvars_clinvar.rs rename src/server/run/actix_server/{frequencies.rs => seqvars_frequencies.rs} (92%) diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 0f8b9fb1..ab8ac8af 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -8,6 +8,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Instant; +use self::ann::{AnnField, Consequence, FeatureBiotype}; use crate::annotate::cli::{Sources, TranscriptSettings}; use crate::annotate::genotype_string; use crate::annotate::seqvars::csq::{ @@ -52,8 +53,6 @@ use strum::Display; use thousands::Separable; use tokio::io::AsyncWriteExt; -use self::ann::{AnnField, Consequence, FeatureBiotype}; - pub mod ann; pub mod binning; pub mod csq; @@ -1527,8 +1526,9 @@ impl FrequencyAnnotator { pub(crate) fn annotate_variant( &self, vcf_var: &VcfVariant, - ) -> anyhow::Result> - { + ) -> anyhow::Result< + Option, + > { // Only attempt lookups into RocksDB for canonical contigs. if !is_canonical(&vcf_var.chromosome) { return Ok(None); @@ -1542,7 +1542,7 @@ impl FrequencyAnnotator { &vcf_var.alternative, ); let key: Vec = vcf_var.clone().into(); - use crate::server::run::actix_server::frequencies::*; + use crate::server::run::actix_server::seqvars_frequencies::*; // Annotate with frequency. if CHROM_AUTO.contains(vcf_var.chrom.as_str()) { if let Some(freq) = self @@ -1613,6 +1613,7 @@ impl FrequencyAnnotator { } } +#[derive(Debug)] pub struct ClinvarAnnotator { db: DBWithThreadMode, } @@ -1723,6 +1724,64 @@ impl ClinvarAnnotator { } Ok(()) } + + pub(crate) fn annotate_variant( + &self, + vcf_var: &VcfVariant, + ) -> anyhow::Result> + { + // Only attempt lookups into RocksDB for canonical contigs. + if !is_canonical(&vcf_var.chromosome) { + return Ok(None); + } + + // Build key for RocksDB database + let vcf_var = keys::Var::from( + &vcf_var.chromosome, + vcf_var.position, + &vcf_var.reference, + &vcf_var.alternative, + ); + let key: Vec = vcf_var.clone().into(); + + if let Some(raw_value) = self + .db + .get_cf(self.db.cf_handle("clinvar").as_ref().unwrap(), key)? + { + let record_list = annonars::pbs::clinvar::minimal::ExtractedVcvRecordList::decode( + &mut Cursor::new(&raw_value), + )?; + + let mut clinvar_vcvs = Vec::new(); + let mut clinvar_germline_classifications = Vec::new(); + for clinvar_record in record_list.records.iter() { + let accession = clinvar_record.accession.as_ref().expect("must have VCV"); + let vcv = format!("{}.{}", accession.accession, accession.version); + let classifications = clinvar_record + .classifications + .as_ref() + .expect("must have classifications"); + if let Some(germline_classification) = &classifications.germline_classification { + let description = germline_classification + .description + .as_ref() + .expect("description missing") + .to_string(); + clinvar_vcvs.push(vcv); + clinvar_germline_classifications.push(description); + } + } + + Ok(Some( + crate::server::run::actix_server::seqvars_clinvar::ClinvarResultEntry { + clinvar_vcv: clinvar_vcvs, + clinvar_germline_classification: clinvar_germline_classifications, + }, + )) + } else { + Ok(None) + } + } } pub(crate) struct ConsequenceAnnotator { @@ -1971,7 +2030,7 @@ pub(crate) fn setup_seqvars_annotator( // Add the frequency annotator if requested. if let Some(rocksdb_paths) = &sources.frequencies { - let freq_dbs = load_frequency_dbs_for_assembly(rocksdb_paths, assembly)?; + let freq_dbs = initialize_frequency_annotators_for_assembly(rocksdb_paths, assembly)?; for freq_db in freq_dbs { annotators.push(AnnotatorEnum::Frequency(freq_db)) } @@ -1979,7 +2038,7 @@ pub(crate) fn setup_seqvars_annotator( // Add the ClinVar annotator if requested. if let Some(rocksdb_paths) = &sources.clinvar { - let clinvar_dbs = load_clinvar_dbs_for_assembly(rocksdb_paths, assembly)?; + let clinvar_dbs = intialize_clinvar_annotators_for_assembly(rocksdb_paths, assembly)?; for clinvar_db in clinvar_dbs { annotators.push(AnnotatorEnum::Clinvar(clinvar_db)) } @@ -2010,7 +2069,7 @@ pub(crate) fn setup_seqvars_annotator( Ok(annotator) } -pub(crate) fn load_clinvar_dbs_for_assembly( +pub(crate) fn intialize_clinvar_annotators_for_assembly( rocksdb_paths: &[String], assembly: Option, ) -> Result, Error> { @@ -2036,7 +2095,7 @@ pub(crate) fn load_clinvar_dbs_for_assembly( .collect() } -pub(crate) fn load_frequency_dbs_for_assembly( +pub(crate) fn initialize_frequency_annotators_for_assembly( rocksdb_paths: &[String], assembly: Option, ) -> Result, Error> { diff --git a/src/server/run/actix_server/mod.rs b/src/server/run/actix_server/mod.rs index 24b21d54..1119692a 100644 --- a/src/server/run/actix_server/mod.rs +++ b/src/server/run/actix_server/mod.rs @@ -6,13 +6,14 @@ use actix_web::ResponseError; use utoipa::OpenApi as _; use crate::annotate::seqvars::provider::Provider as MehariProvider; -use crate::annotate::seqvars::FrequencyAnnotator; +use crate::annotate::seqvars::{ClinvarAnnotator, FrequencyAnnotator}; use crate::annotate::strucvars::csq::ConsequencePredictor as StrucvarConsequencePredictor; use crate::{annotate::seqvars::csq::ConsequencePredictor, common::GenomeRelease}; -pub mod frequencies; pub mod gene_txs; +pub mod seqvars_clinvar; pub mod seqvars_csq; +pub mod seqvars_frequencies; pub mod strucvars_csq; pub mod versions; @@ -44,12 +45,19 @@ pub struct WebServerData { /// `MehariProvider` to provide the transcript info. #[derivative(Debug = "ignore")] pub provider: std::collections::HashMap>, + /// The sequence variant consequence predictors for each assembly. pub seqvars_predictors: std::collections::HashMap, - /// The structural variant consequence predictors for eacha ssembly. + + /// The structural variant consequence predictors for each assembly. pub strucvars_predictors: std::collections::HashMap, + + /// The frequency annotators for each assembly. pub frequency_annotators: std::collections::HashMap, + + /// The clinvar annotators for each assembly. + pub clinvar_annotators: std::collections::HashMap, } /// Main entry point for running the REST server. @@ -67,8 +75,10 @@ pub async fn main( .service(seqvars_csq::handle_with_openapi) .service(strucvars_csq::handle) .service(strucvars_csq::handle_with_openapi) - .service(frequencies::handle) - .service(frequencies::handle_with_openapi) + .service(seqvars_frequencies::handle) + .service(seqvars_frequencies::handle_with_openapi) + .service(seqvars_clinvar::handle) + .service(seqvars_clinvar::handle_with_openapi) .service(versions::handle) .service( utoipa_swagger_ui::SwaggerUi::new("/swagger-ui/{_:.*}") diff --git a/src/server/run/actix_server/seqvars_clinvar.rs b/src/server/run/actix_server/seqvars_clinvar.rs new file mode 100644 index 00000000..9abc5a68 --- /dev/null +++ b/src/server/run/actix_server/seqvars_clinvar.rs @@ -0,0 +1,132 @@ +//! Implementation of endpoint `/api/v1/seqvars/clinvar`. +//! +//! Also includes the implementation of the `/seqvars/clinvar` endpoint (deprecated). + +use actix_web::{ + get, + web::{self, Data, Json, Path}, +}; + +use crate::{annotate::seqvars::csq::VcfVariant, common::GenomeRelease}; + +use super::{versions::VersionsInfoResponse, CustomError}; + +/// Query parameters of the `/api/v1/seqvars/clinvar` endpoint. +#[derive( + Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::IntoParams, utoipa::ToSchema, +)] +#[serde(rename_all = "snake_case")] +#[serde_with::skip_serializing_none] +pub(crate) struct ClinvarQuery { + /// The assembly. + pub genome_release: GenomeRelease, + /// SPDI sequence. + pub chromosome: String, + /// SPDI position. + pub position: u32, + /// SPDI deletion. + pub reference: String, + /// SPDI insertion. + pub alternative: String, +} + +/// One entry in `ClinvarResponse`. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +pub(crate) struct ClinvarResultEntry { + pub clinvar_vcv: Vec, + pub clinvar_germline_classification: Vec, +} + +/// Response of the `/api/v1/seqvars/clinvar` endpoint. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +pub(crate) struct ClinvarResponse { + /// Version information. + pub version: VersionsInfoResponse, + + /// The original query records. + pub query: ClinvarQuery, + + /// The resulting records for the scored genes. + pub result: Vec, +} + +/// Implementation of endpoints. +async fn handle_impl( + data: Data, + _path: Path<()>, + query: web::Query, +) -> actix_web::Result, super::CustomError> { + let ClinvarQuery { + genome_release, + chromosome, + position, + reference, + alternative, + } = query.clone().into_inner(); + + let annotator = data + .clinvar_annotators + .get(&genome_release) + .ok_or_else(|| { + super::CustomError::new(anyhow::anyhow!( + "genome release not supported: {:?}", + &query.genome_release + )) + })?; + + let mut result = Vec::new(); + let g_var = VcfVariant { + chromosome, + position: position as i32, + reference, + alternative, + }; + let annotations = annotator + .annotate_variant(&g_var) + .map_err(|e| super::CustomError::new(anyhow::anyhow!("annotation failed: {}", &e)))?; + if let Some(annotations) = annotations { + result.push(annotations); + } + + let result = ClinvarResponse { + version: VersionsInfoResponse::from_web_server_data(data.into_inner().as_ref()) + .map_err(|e| CustomError::new(anyhow::anyhow!("Problem determining version: {}", e)))?, + query: query.into_inner(), + result, + }; + + Ok(Json(result)) +} + +/// Query for gnomAD frequencies of a variant. +#[allow(clippy::unused_async)] +#[get("/seqvars/clinvar")] +async fn handle( + data: Data, + _path: Path<()>, + query: web::Query, +) -> actix_web::Result, super::CustomError> { + handle_impl(data, _path, query).await +} + +/// Query for gnomAD frequencies of a variant. +#[allow(clippy::unused_async)] +#[utoipa::path( + get, + operation_id = "seqvarsClinvar", + params( + ClinvarQuery + ), + responses( + (status = 200, description = "Clinvar information.", body = ClinvarResponse), + (status = 500, description = "Internal server error.", body = CustomError) + ) +)] +#[get("/api/v1/seqvars/clinvar")] +async fn handle_with_openapi( + data: Data, + _path: Path<()>, + query: web::Query, +) -> actix_web::Result, super::CustomError> { + handle_impl(data, _path, query).await +} diff --git a/src/server/run/actix_server/frequencies.rs b/src/server/run/actix_server/seqvars_frequencies.rs similarity index 92% rename from src/server/run/actix_server/frequencies.rs rename to src/server/run/actix_server/seqvars_frequencies.rs index 99635014..c0477c6f 100644 --- a/src/server/run/actix_server/frequencies.rs +++ b/src/server/run/actix_server/seqvars_frequencies.rs @@ -1,6 +1,6 @@ -//! Implementation of endpoint `/api/v1/frequency`. +//! Implementation of endpoint `/api/v1/seqvars/frequency`. //! -//! Also includes the implementation of the `/frequency` endpoint (deprecated). +//! Also includes the implementation of the `/seqvars/frequency` endpoint (deprecated). use actix_web::{ get, @@ -11,7 +11,7 @@ use crate::{annotate::seqvars::csq::VcfVariant, common::GenomeRelease}; use super::{versions::VersionsInfoResponse, CustomError}; -/// Query parameters of the `/api/v1/seqvars/csq` endpoint. +/// Query parameters of the `/api/v1/seqvars/frequency` endpoint. #[derive( Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::IntoParams, utoipa::ToSchema, )] @@ -87,7 +87,7 @@ pub(crate) struct MitochondrialResultEntry { pub gnomad_genomes_het: u32, } -/// Response of the `/api/v1/frequency` endpoint. +/// Response of the `/api/v1/seqvars/frequency` endpoint. #[derive(Debug, Clone, serde::Serialize, serde::Deserialize, utoipa::ToSchema)] pub(crate) struct FrequencyResponse { /// Version information. @@ -148,7 +148,7 @@ async fn handle_impl( /// Query for gnomAD frequencies of a variant. #[allow(clippy::unused_async)] -#[get("/frequency")] +#[get("/seqvars/frequency")] async fn handle( data: Data, _path: Path<()>, @@ -161,7 +161,7 @@ async fn handle( #[allow(clippy::unused_async)] #[utoipa::path( get, - operation_id = "frequency", + operation_id = "seqvarsFrequency", params( FrequencyQuery ), @@ -170,7 +170,7 @@ async fn handle( (status = 500, description = "Internal server error.", body = CustomError) ) )] -#[get("/api/v1/frequency")] +#[get("/api/v1/seqvars/frequency")] async fn handle_with_openapi( data: Data, _path: Path<()>, diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 32002a92..7427c261 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -1,7 +1,8 @@ use crate::annotate::cli::{Sources, TranscriptSettings}; use crate::annotate::seqvars::csq::ConfigBuilder; use crate::annotate::seqvars::{ - load_frequency_dbs_for_assembly, load_transcript_dbs_for_assembly, ConsequenceAnnotator, + initialize_frequency_annotators_for_assembly, intialize_clinvar_annotators_for_assembly, + load_transcript_dbs_for_assembly, ConsequenceAnnotator, }; use crate::db::merge::merge_transcript_databases; use crate::{ @@ -26,17 +27,20 @@ pub mod openapi { StrucvarsGeneTranscriptEffects, StrucvarsTranscriptEffect, }; use crate::common::GenomeRelease; - use crate::server::run::actix_server::frequencies::{ - AutosomalResultEntry, FrequencyQuery, FrequencyResponse, FrequencyResultEntry, - GonosomalResultEntry, MitochondrialResultEntry, - }; use crate::server::run::actix_server::gene_txs::{ ExonAlignment, GenesTranscriptsListQuery, GenesTranscriptsListResponse, GenomeAlignment, Strand, Transcript, TranscriptBiotype, TranscriptTag, }; + use crate::server::run::actix_server::seqvars_clinvar::{ + ClinvarQuery, ClinvarResponse, ClinvarResultEntry, + }; use crate::server::run::actix_server::seqvars_csq::{ SeqvarsCsqQuery, SeqvarsCsqResponse, SeqvarsCsqResultEntry, }; + use crate::server::run::actix_server::seqvars_frequencies::{ + AutosomalResultEntry, FrequencyQuery, FrequencyResponse, FrequencyResultEntry, + GonosomalResultEntry, MitochondrialResultEntry, + }; use crate::server::run::actix_server::strucvars_csq::{ StrucvarsCsqQuery, StrucvarsCsqResponse, }; @@ -45,7 +49,8 @@ pub mod openapi { }; use super::actix_server::{ - frequencies, gene_txs, seqvars_csq, strucvars_csq, versions, CustomError, + gene_txs, seqvars_clinvar, seqvars_csq, seqvars_frequencies, strucvars_csq, versions, + CustomError, }; /// Utoipa-based `OpenAPI` generation helper. @@ -57,7 +62,8 @@ pub mod openapi { seqvars_csq::handle_with_openapi, strucvars_csq::handle_with_openapi, strucvars_csq::handle_with_openapi, - frequencies::handle_with_openapi, + seqvars_frequencies::handle_with_openapi, + seqvars_clinvar::handle_with_openapi, ), components(schemas( Assembly, @@ -96,6 +102,9 @@ pub mod openapi { AutosomalResultEntry, GonosomalResultEntry, MitochondrialResultEntry, + ClinvarQuery, + ClinvarResponse, + ClinvarResultEntry, )) )] pub struct ApiDoc; @@ -238,21 +247,39 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a } if let Some(paths) = args.sources.frequencies.as_ref() { - let freqs = load_frequency_dbs_for_assembly(paths, Some(assembly))?; + let annotators = initialize_frequency_annotators_for_assembly(paths, Some(assembly))?; - match freqs.len() { + match annotators.len() { 0 => tracing::warn!( "No frequency databases loaded, respective endpoint will be unavailable." ), 1 => { - let freq = freqs.into_iter().next().unwrap(); - data.frequency_annotators.insert(genome_release, freq); + let frequency_db = annotators.into_iter().next().unwrap(); + data.frequency_annotators + .insert(genome_release, frequency_db); } _ => tracing::warn!( "Multiple frequency databases loaded, only the first one will be used." ), } } + + if let Some(paths) = args.sources.clinvar.as_ref() { + let annotators = intialize_clinvar_annotators_for_assembly(paths, Some(assembly))?; + + match annotators.len() { + 0 => tracing::warn!( + "No clinvar databases loaded, respective endpoint will be unavailable." + ), + 1 => { + let annotator = annotators.into_iter().next().unwrap(); + data.clinvar_annotators.insert(genome_release, annotator); + } + _ => tracing::warn!( + "Multiple clinvar databases loaded, only the first one will be used." + ), + } + } } let data = actix_web::web::Data::new(data); tracing::info!("... done loading data {:?}", before_loading.elapsed()); From 05784ebd6835efb114916c6718639b33a1a23efd Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 17:06:31 +0100 Subject: [PATCH 23/31] update warning for multiple clinvar or freq dbs --- src/server/run/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 7427c261..9d78c4db 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -259,7 +259,7 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a .insert(genome_release, frequency_db); } _ => tracing::warn!( - "Multiple frequency databases loaded, only the first one will be used." + "Multiple frequency databases loaded. This is not supported. The respective endpoint will be unavailable." ), } } @@ -276,7 +276,7 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a data.clinvar_annotators.insert(genome_release, annotator); } _ => tracing::warn!( - "Multiple clinvar databases loaded, only the first one will be used." + "Multiple clinvar databases specified. This is not supported. The respective endpoint will be unavailable." ), } } From ed2b7137f96edb7a9b5ac8bff8633438037bce1b Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Thu, 2 Jan 2025 17:13:06 +0100 Subject: [PATCH 24/31] update openapi schema --- openapi.schema.yaml | 345 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 344 insertions(+), 1 deletion(-) diff --git a/openapi.schema.yaml b/openapi.schema.yaml index e082f08a..d8a32709 100644 --- a/openapi.schema.yaml +++ b/openapi.schema.yaml @@ -7,7 +7,7 @@ info: email: manuel.holtgrewe@bih-charite.de license: name: MIT - version: 0.30.0 + version: 0.30.1 paths: /api/v1/genes/transcripts: get: @@ -56,6 +56,58 @@ paths: application/json: schema: $ref: '#/components/schemas/CustomError' + /api/v1/seqvars/clinvar: + get: + tags: + - seqvars_clinvar + summary: Query for gnomAD frequencies of a variant. + operationId: seqvarsClinvar + parameters: + - name: genome_release + in: query + description: The assembly. + required: true + schema: + $ref: '#/components/schemas/GenomeRelease' + - name: chromosome + in: query + description: SPDI sequence. + required: true + schema: + type: string + - name: position + in: query + description: SPDI position. + required: true + schema: + type: integer + format: int32 + minimum: 0 + - name: reference + in: query + description: SPDI deletion. + required: true + schema: + type: string + - name: alternative + in: query + description: SPDI insertion. + required: true + schema: + type: string + responses: + '200': + description: Clinvar information. + content: + application/json: + schema: + $ref: '#/components/schemas/ClinvarResponse' + '500': + description: Internal server error. + content: + application/json: + schema: + $ref: '#/components/schemas/CustomError' /api/v1/seqvars/csq: get: tags: @@ -115,6 +167,58 @@ paths: application/json: schema: $ref: '#/components/schemas/CustomError' + /api/v1/seqvars/frequency: + get: + tags: + - seqvars_frequencies + summary: Query for gnomAD frequencies of a variant. + operationId: seqvarsFrequency + parameters: + - name: genome_release + in: query + description: The assembly. + required: true + schema: + $ref: '#/components/schemas/GenomeRelease' + - name: chromosome + in: query + description: SPDI sequence. + required: true + schema: + type: string + - name: position + in: query + description: SPDI position. + required: true + schema: + type: integer + format: int32 + minimum: 0 + - name: reference + in: query + description: SPDI deletion. + required: true + schema: + type: string + - name: alternative + in: query + description: SPDI insertion. + required: true + schema: + type: string + responses: + '200': + description: Frequency information. + content: + application/json: + schema: + $ref: '#/components/schemas/FrequencyResponse' + '500': + description: Internal server error. + content: + application/json: + schema: + $ref: '#/components/schemas/CustomError' /api/v1/strucvars/csq: get: tags: @@ -198,6 +302,98 @@ components: enum: - grch37 - grch38 + AutosomalResultEntry: + type: object + required: + - gnomad_exomes_an + - gnomad_exomes_hom + - gnomad_exomes_het + - gnomad_genomes_an + - gnomad_genomes_hom + - gnomad_genomes_het + properties: + gnomad_exomes_an: + type: integer + format: int32 + minimum: 0 + gnomad_exomes_hom: + type: integer + format: int32 + minimum: 0 + gnomad_exomes_het: + type: integer + format: int32 + minimum: 0 + gnomad_genomes_an: + type: integer + format: int32 + minimum: 0 + gnomad_genomes_hom: + type: integer + format: int32 + minimum: 0 + gnomad_genomes_het: + type: integer + format: int32 + minimum: 0 + ClinvarQuery: + type: object + description: Query parameters of the `/api/v1/seqvars/clinvar` endpoint. + required: + - genome_release + - chromosome + - position + - reference + - alternative + properties: + genome_release: + $ref: '#/components/schemas/GenomeRelease' + chromosome: + type: string + description: SPDI sequence. + position: + type: integer + format: int32 + description: SPDI position. + minimum: 0 + reference: + type: string + description: SPDI deletion. + alternative: + type: string + description: SPDI insertion. + ClinvarResponse: + type: object + description: Response of the `/api/v1/seqvars/clinvar` endpoint. + required: + - version + - query + - result + properties: + version: + $ref: '#/components/schemas/VersionsInfoResponse' + query: + $ref: '#/components/schemas/ClinvarQuery' + result: + type: array + items: + $ref: '#/components/schemas/ClinvarResultEntry' + description: The resulting records for the scored genes. + ClinvarResultEntry: + type: object + description: One entry in `ClinvarResponse`. + required: + - clinvar_vcv + - clinvar_germline_classification + properties: + clinvar_vcv: + type: array + items: + type: string + clinvar_germline_classification: + type: array + items: + type: string Consequence: type: string description: Putative impact. @@ -338,6 +534,70 @@ components: value: type: string description: Enum for `AnnField::feature_type`. + FrequencyQuery: + type: object + description: Query parameters of the `/api/v1/seqvars/frequency` endpoint. + required: + - genome_release + - chromosome + - position + - reference + - alternative + properties: + genome_release: + $ref: '#/components/schemas/GenomeRelease' + chromosome: + type: string + description: SPDI sequence. + position: + type: integer + format: int32 + description: SPDI position. + minimum: 0 + reference: + type: string + description: SPDI deletion. + alternative: + type: string + description: SPDI insertion. + FrequencyResponse: + type: object + description: Response of the `/api/v1/seqvars/frequency` endpoint. + required: + - version + - query + - result + properties: + version: + $ref: '#/components/schemas/VersionsInfoResponse' + query: + $ref: '#/components/schemas/FrequencyQuery' + result: + type: array + items: + $ref: '#/components/schemas/FrequencyResultEntry' + description: The resulting records for the scored genes. + FrequencyResultEntry: + oneOf: + - type: object + required: + - Autosomal + properties: + Autosomal: + $ref: '#/components/schemas/AutosomalResultEntry' + - type: object + required: + - Gonosomal + properties: + Gonosomal: + $ref: '#/components/schemas/GonosomalResultEntry' + - type: object + required: + - Mitochondrial + properties: + Mitochondrial: + $ref: '#/components/schemas/MitochondrialResultEntry' + description: One entry in `FrequencyResponse`. GenesTranscriptsListQuery: type: object description: Query arguments for the `/api/v1/genes/transcripts` endpoint. @@ -411,6 +671,50 @@ components: enum: - grch37 - grch38 + GonosomalResultEntry: + type: object + required: + - gnomad_exomes_an + - gnomad_exomes_hom + - gnomad_exomes_het + - gnomad_exomes_hemi + - gnomad_genomes_an + - gnomad_genomes_hom + - gnomad_genomes_het + - gnomad_genomes_hemi + properties: + gnomad_exomes_an: + type: integer + format: int32 + minimum: 0 + gnomad_exomes_hom: + type: integer + format: int32 + minimum: 0 + gnomad_exomes_het: + type: integer + format: int32 + minimum: 0 + gnomad_exomes_hemi: + type: integer + format: int32 + minimum: 0 + gnomad_genomes_an: + type: integer + format: int32 + minimum: 0 + gnomad_genomes_hom: + type: integer + format: int32 + minimum: 0 + gnomad_genomes_het: + type: integer + format: int32 + minimum: 0 + gnomad_genomes_hemi: + type: integer + format: int32 + minimum: 0 Message: type: string description: A message to be used in `AnnField::messages`. @@ -425,6 +729,40 @@ components: - info_realign_three_prime - info_compound_annotation - info_non_reference_annotation + MitochondrialResultEntry: + type: object + required: + - helix_an + - helix_hom + - helix_het + - gnomad_genomes_an + - gnomad_genomes_hom + - gnomad_genomes_het + properties: + helix_an: + type: integer + format: int32 + minimum: 0 + helix_hom: + type: integer + format: int32 + minimum: 0 + helix_het: + type: integer + format: int32 + minimum: 0 + gnomad_genomes_an: + type: integer + format: int32 + minimum: 0 + gnomad_genomes_hom: + type: integer + format: int32 + minimum: 0 + gnomad_genomes_het: + type: integer + format: int32 + minimum: 0 Pos: type: object description: Position, optionally with total length. @@ -518,6 +856,7 @@ components: - feature_id - feature_biotype - feature_tag + - strand properties: consequences: type: array @@ -573,6 +912,10 @@ components: format: int32 description: Distance to feature. nullable: true + strand: + type: integer + format: int32 + description: Strand of the alignment messages: type: array items: From f1c010fe8dd74330d1293c47153e38ba35b4cb78 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 3 Jan 2025 13:58:10 +0100 Subject: [PATCH 25/31] update entrypoint to match new server run cli --- utils/docker/entrypoint.sh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/utils/docker/entrypoint.sh b/utils/docker/entrypoint.sh index 90b34d11..e0979adf 100644 --- a/utils/docker/entrypoint.sh +++ b/utils/docker/entrypoint.sh @@ -7,16 +7,23 @@ set -euo pipefail # # PATH_DB -- path to the database directory containing, # e.g., `grch3{7,8}/*.zst`. -# default: /data/hpo +# default: /data/mehari/db # HTTP_HOST -- host to listen on # default: 0.0.0.0 # HTTP_PORT -- port # default: 8080 -PATH_DB=${PATH_DB-/data/mehari} +PATH_DB=${PATH_DB-/data/mehari/db} HTTP_HOST=${HTTP_HOST-0.0.0.0} HTTP_PORT=${HTTP_PORT-8080} +PATH_TRANSCRIPTS_37=$PATH_DB/grch37/seqvars/txs.bin.zst +PATH_TRANSCRIPTS_38=$PATH_DB/grch38/seqvars/txs.bin.zst +PATH_FREQUENCIES_37=$PATH_DB/grch37/seqvars/freqs +PATH_FREQUENCIES_38=$PATH_DB/grch38/seqvars/freqs +PATH_CLINVAR_37=$PATH_DB/grch37/seqvars/clinvar +PATH_CLINVAR_38=$PATH_DB/grch38/seqvars/clinvar + first=${1-} if [ "$first" == exec ]; then @@ -26,7 +33,12 @@ else exec \ mehari \ server run \ - --path-db "$PATH_DB" \ + $(test -e "$PATH_TRANSCRIPTS_37" && echo --transcripts "$PATH_TRANSCRIPTS_37") \ + $(test -e "$PATH_TRANSCRIPTS_38" && echo --transcripts "$PATH_TRANSCRIPTS_38") \ + $(test -e "$PATH_FREQUENCIES_37" && echo --frequencies "$PATH_FREQUENCIES_37") \ + $(test -e "$PATH_FREQUENCIES_38" && echo --frequencies "$PATH_FREQUENCIES_38") \ + $(test -e "$PATH_CLINVAR_37" && echo --clinvar "$PATH_CLINVAR_37") \ + $(test -e "$PATH_CLINVAR_38" && echo --clinvar "$PATH_CLINVAR_38") \ --listen-host "$HTTP_HOST" \ --listen-port "$HTTP_PORT" fi From ee9ebf15005f2fa8f4ff7cd8a8461a3f2650abcf Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 3 Jan 2025 14:03:26 +0100 Subject: [PATCH 26/31] fix server run clinvar docstrings --- src/server/run/actix_server/seqvars_clinvar.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server/run/actix_server/seqvars_clinvar.rs b/src/server/run/actix_server/seqvars_clinvar.rs index 9abc5a68..50f1eca2 100644 --- a/src/server/run/actix_server/seqvars_clinvar.rs +++ b/src/server/run/actix_server/seqvars_clinvar.rs @@ -98,7 +98,7 @@ async fn handle_impl( Ok(Json(result)) } -/// Query for gnomAD frequencies of a variant. +/// Query for ClinVar information of a variant. #[allow(clippy::unused_async)] #[get("/seqvars/clinvar")] async fn handle( @@ -109,7 +109,7 @@ async fn handle( handle_impl(data, _path, query).await } -/// Query for gnomAD frequencies of a variant. +/// Query for ClinVar information of a variant. #[allow(clippy::unused_async)] #[utoipa::path( get, From 5fd01b54124d49f7b55ec42cfdfe63e5c954b087 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 3 Jan 2025 14:04:35 +0100 Subject: [PATCH 27/31] update openapi.schema.yaml accordingly --- openapi.schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openapi.schema.yaml b/openapi.schema.yaml index d8a32709..639cabaa 100644 --- a/openapi.schema.yaml +++ b/openapi.schema.yaml @@ -60,7 +60,7 @@ paths: get: tags: - seqvars_clinvar - summary: Query for gnomAD frequencies of a variant. + summary: Query for ClinVar information of a variant. operationId: seqvarsClinvar parameters: - name: genome_release From ec077dc6cd55155d9f634f48b9fb0a1cf0552ca3 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 24 Jan 2025 12:30:25 +0100 Subject: [PATCH 28/31] remove unused path_db from test --- src/annotate/strucvars/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs index 2b923c74..69ada2b5 100644 --- a/src/annotate/strucvars/mod.rs +++ b/src/annotate/strucvars/mod.rs @@ -4227,7 +4227,6 @@ mod test { let out_path = temp.join("out.vcf"); let args = Args { - path_db: String::from("tests/data/db/create"), genome_release: Some(GenomeRelease::Grch38), path_input_ped: String::from("tests/data/annotate/strucvars/test.order.ped"), path_input_vcf: vec![String::from("tests/data/annotate/strucvars/test.order.vcf")], From 61ee0967f6e5c57966c448a888e6b73ffa801094 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 24 Jan 2025 13:18:38 +0100 Subject: [PATCH 29/31] only print hints for available endpoints --- src/server/run/mod.rs | 88 +++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index 9d78c4db..cd11f56b 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -13,6 +13,8 @@ use crate::{ common::GenomeRelease, }; use clap::ValueEnum; +use std::collections::HashMap; +use strum::EnumString; /// Implementation of Actix server. pub mod actix_server; @@ -135,8 +137,15 @@ pub struct Args { pub listen_port: u16, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EnumString)] +enum Endpoint { + Transcripts, + Frequency, + Clinvar, +} + /// Print some hints via `tracing::info!`. -pub fn print_hints(args: &Args) { +fn print_hints(args: &Args, enabled_sources: &[(GenomeRelease, Endpoint)]) { tracing::info!( "Launching server main on http://{}:{} ...", args.listen_host.as_str(), @@ -148,41 +157,42 @@ pub fn print_hints(args: &Args) { return; } - // The endpoint `/genes/txs` provides transcript information. - tracing::info!( - " try: http://{}:{}/api/v1/genes/transcripts?hgnc_id=HGNC:1100&genome_build=grch37&\ - genomeBuild=GENOME_BUILD_GRCH37", - args.listen_host.as_str(), - args.listen_port - ); - // The endpoint `/tx/csq` to compute the consequence of a variant; without and with filtering - // for HGNC gene ID. - tracing::info!( - " try: http://{}:{}/api/v1/seqvars/csq?genome_release=grch37\ - &chromosome=17&position=48275363&reference=C&alternative=A", - args.listen_host.as_str(), - args.listen_port - ); - tracing::info!( - " try: http://{}:{}/api/v1/seqvars/csq?genome_release=grch37\ - &chromosome=17&position=48275363&reference=C&alternative=A&hgnc_id=HGNC:2197", - args.listen_host.as_str(), - args.listen_port - ); - // The endpoint `/strucvars/csq` computes the consequence of an SV. - tracing::info!( - " try: http://{}:{}/api/v1/strucvars/csq?genome_release=grch37\ - &chromosome=17&start=48275360&&stop=48275370&sv_type=DEL", - args.listen_host.as_str(), - args.listen_port - ); - // The endpoint `/structvars/csq` computes the consequence of an SV. - tracing::info!( - " try: http://{}:{}/api/v1/strucvars/csq?genome_release=grch37\ - &chromosome=17&start=48275360&&stop=48275370&sv_type=DEL", - args.listen_host.as_str(), - args.listen_port + use Endpoint::*; + use GenomeRelease::*; + + let prefix = format!( + "try: http://{host}:{port}/api/v1/", + host = args.listen_host, + port = args.listen_port ); + let examples: HashMap<(GenomeRelease, Endpoint), Vec<&str>> = HashMap::from([ + ( + (Grch37, Transcripts), + vec![ + r#"genes/transcripts?hgnc_id=HGNC:1100&genome_build=grch37"#, + r#"seqvars/csq?genome_release=grch37&chromosome=17&position=48275363&reference=C&alternative=A"#, + r#"seqvars/csq?genome_release=grch37&chromosome=17&position=48275363&reference=C&alternative=A&hgnc_id=HGNC:2197"#, + r#"strucvars/csq?genome_release=grch37&chromosome=17&start=48275360&&stop=48275370&sv_type=DEL""#, + ], + ), + ( + (Grch38, Transcripts), + vec![r#"genes/transcripts?hgnc_id=HGNC:1100&genome_build=grch38"#], + ), + ( + (Grch37, Frequency), + vec![ + r#"seqvars/frequency?genome_release=grch37&chromosome=17&position=48275363&reference=C&alternative=A"#, + ], + ), + ]); + for (genome_release, endpoint) in enabled_sources { + if let Some(examples) = examples.get(&(*genome_release, *endpoint)) { + for example in examples { + tracing::info!("{}{}", prefix, example); + } + } + } } /// Main entry point for `server run` sub command. @@ -204,6 +214,9 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a let before_loading = std::time::Instant::now(); let mut data = actix_server::WebServerData::default(); + let mut enabled_sources = vec![]; + use Endpoint::*; + for genome_release in GenomeRelease::value_variants().iter().copied() { tracing::info!("Loading genome release {:?}", genome_release); let assembly = genome_release.into(); @@ -238,6 +251,7 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a genome_release, StrucvarConsequencePredictor::new(provider.clone(), assembly), ); + enabled_sources.push((genome_release, Transcripts)); } } else { tracing::warn!( @@ -257,6 +271,7 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a let frequency_db = annotators.into_iter().next().unwrap(); data.frequency_annotators .insert(genome_release, frequency_db); + enabled_sources.push((genome_release, Frequency)); } _ => tracing::warn!( "Multiple frequency databases loaded. This is not supported. The respective endpoint will be unavailable." @@ -274,6 +289,7 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a 1 => { let annotator = annotators.into_iter().next().unwrap(); data.clinvar_annotators.insert(genome_release, annotator); + enabled_sources.push((genome_release, Clinvar)); } _ => tracing::warn!( "Multiple clinvar databases specified. This is not supported. The respective endpoint will be unavailable." @@ -285,7 +301,7 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a tracing::info!("... done loading data {:?}", before_loading.elapsed()); // Print the server URL and some hints (the latter: unless suppressed). - print_hints(args); + print_hints(args, &enabled_sources); // Launch the Actix web server. actix_server::main(args, data).await?; From 096c9226eeef63b02e42eb6b7f76ea8875bf1a11 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 24 Jan 2025 13:33:49 +0100 Subject: [PATCH 30/31] server: add exemplary Grch38 hints --- src/server/run/mod.rs | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index cd11f56b..b31b6497 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -175,14 +175,35 @@ fn print_hints(args: &Args, enabled_sources: &[(GenomeRelease, Endpoint)]) { r#"strucvars/csq?genome_release=grch37&chromosome=17&start=48275360&&stop=48275370&sv_type=DEL""#, ], ), + ( + (Grch37, Frequency), + vec![ + r#"seqvars/frequency?genome_release=grch37&chromosome=17&position=48275363&reference=C&alternative=A"#, + ], + ), + ( + (Grch37, Clinvar), + vec![ + r#"seqvars/clinvar?genome_release=grch37&chromosome=17&position=48275363&reference=C&alternative=A"#, + ], + ), ( (Grch38, Transcripts), - vec![r#"genes/transcripts?hgnc_id=HGNC:1100&genome_build=grch38"#], + vec![ + r#"genes/transcripts?hgnc_id=HGNC:1100&genome_build=grch38"#, + r#"seqvars/csq?genome_release=grch38&chromosome=2&position=26364839&reference=C&alternative=T"#, + ], ), ( - (Grch37, Frequency), + (Grch38, Frequency), vec![ - r#"seqvars/frequency?genome_release=grch37&chromosome=17&position=48275363&reference=C&alternative=A"#, + r#"seqvars/frequency?genome_release=grch38&chromosome=2&position=26364839&reference=C&alternative=T"#, + ], + ), + ( + (Grch38, Clinvar), + vec![ + r#"seqvars/clinvar?genome_release=grch38&chromosome=2&position=26364839&reference=C&alternative=T"#, ], ), ]); From fa2cbebac74984af3c0f1416a22e2a4549e66466 Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 24 Jan 2025 13:39:18 +0100 Subject: [PATCH 31/31] fix typo in fn name --- src/annotate/seqvars/mod.rs | 4 ++-- src/server/run/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index b02f6eac..9cf1bd7b 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -2071,7 +2071,7 @@ pub(crate) fn setup_seqvars_annotator( // Add the ClinVar annotator if requested. if let Some(rocksdb_paths) = &sources.clinvar { - let clinvar_dbs = intialize_clinvar_annotators_for_assembly(rocksdb_paths, assembly)?; + let clinvar_dbs = initialize_clinvar_annotators_for_assembly(rocksdb_paths, assembly)?; for clinvar_db in clinvar_dbs { annotators.push(AnnotatorEnum::Clinvar(clinvar_db)) } @@ -2102,7 +2102,7 @@ pub(crate) fn setup_seqvars_annotator( Ok(annotator) } -pub(crate) fn intialize_clinvar_annotators_for_assembly( +pub(crate) fn initialize_clinvar_annotators_for_assembly( rocksdb_paths: &[String], assembly: Option, ) -> Result, Error> { diff --git a/src/server/run/mod.rs b/src/server/run/mod.rs index b31b6497..d7994568 100644 --- a/src/server/run/mod.rs +++ b/src/server/run/mod.rs @@ -1,7 +1,7 @@ use crate::annotate::cli::{Sources, TranscriptSettings}; use crate::annotate::seqvars::csq::ConfigBuilder; use crate::annotate::seqvars::{ - initialize_frequency_annotators_for_assembly, intialize_clinvar_annotators_for_assembly, + initialize_clinvar_annotators_for_assembly, initialize_frequency_annotators_for_assembly, load_transcript_dbs_for_assembly, ConsequenceAnnotator, }; use crate::db::merge::merge_transcript_databases; @@ -301,7 +301,7 @@ pub async fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), a } if let Some(paths) = args.sources.clinvar.as_ref() { - let annotators = intialize_clinvar_annotators_for_assembly(paths, Some(assembly))?; + let annotators = initialize_clinvar_annotators_for_assembly(paths, Some(assembly))?; match annotators.len() { 0 => tracing::warn!(