Skip to content

Commit

Permalink
feat!: include source information in transcript database (#615)
Browse files Browse the repository at this point in the history
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

Release-As: 0.30.0
  • Loading branch information
tedil authored Nov 11, 2024
1 parent 2fdf76e commit 111ffcc
Show file tree
Hide file tree
Showing 18 changed files with 252 additions and 100 deletions.
4 changes: 4 additions & 0 deletions openapi.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ components:
description: Specification of data version for a given genome build.
required:
- genome_build
- version_cdot
properties:
genome_build:
$ref: '#/components/schemas/Assembly'
Expand All @@ -220,6 +221,9 @@ components:
type: string
description: Version of the Ensembl database, if any.
nullable: true
version_cdot:
type: string
description: Version of cdot used.
FeatureBiotype:
type: string
description: Encode feature biotype.
Expand Down
40 changes: 38 additions & 2 deletions protos/mehari/txs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,42 @@ message SequenceDb {
repeated string seqs = 3;
}

// Indicates the reference assembly of the transcript database.
enum Assembly {
// Unknown.
ASSEMBLY_UNKNOWN = 0;
// GRCh37.
ASSEMBLY_GRCH37 = 1;
// GRCh38.
ASSEMBLY_GRCH38 = 2;
}

// Indicates the transcript source.
enum Source {
// Unknown.
SOURCE_UNKNOWN = 0;
// RefSeq.
SOURCE_REFSEQ = 1;
// Ensembl.
SOURCE_ENSEMBL = 2;
}

// Version information for the database.
message SourceVersion {
// Version of mehari used to build the database.
string mehari_version = 1;
// Assembly used, either GRCh37 or GRCh38 (or Unknown).
Assembly assembly = 2;
// Version of the assembly, optional.
optional string assembly_version = 3;
// Source, either RefSeq or Ensembl (or Unknown).
Source source_name = 4;
// Version of the source, e.g. 112 for Ensembl.
string source_version = 5;
// Version of cdot.
string cdot_version = 6;
}

// Mapping from gene to transcript ID.
message GeneToTxId {
// Gene HGNC ID; serves as gene identifier.
Expand Down Expand Up @@ -155,6 +191,6 @@ message TxSeqDatabase {
SequenceDb seq_db = 2;
// The version of the database.
optional string version = 3;
// The reference assembly that this database refers to.
optional string genome_release = 4;
// Version information; allow repeated here to be able to keep track of information when merging databases
repeated SourceVersion source_version = 5;
}
36 changes: 8 additions & 28 deletions src/annotate/seqvars/csq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
use std::{collections::HashMap, sync::Arc};

use crate::pbs::txs::{Strand, TranscriptBiotype, TranscriptTag};
use biocommons_bioutils::assemblies::Assembly;
use enumflags2::BitFlags;
use hgvs::parser::NoRef;
use hgvs::{
Expand Down Expand Up @@ -102,9 +101,10 @@ pub const PADDING: i32 = 5_000;
pub const ALT_ALN_METHOD: &str = "splign";

impl ConsequencePredictor {
pub fn new(provider: Arc<MehariProvider>, assembly: Assembly, config: Config) -> Self {
pub fn new(provider: Arc<MehariProvider>, config: Config) -> Self {
tracing::info!("Building transcript interval trees ...");
let acc_to_chrom: indexmap::IndexMap<String, String> = provider.get_assembly_map(assembly);
let acc_to_chrom: indexmap::IndexMap<String, String> =
provider.get_assembly_map(provider.assembly());
let mut chrom_to_acc = HashMap::new();
for (acc, chrom) in &acc_to_chrom {
let chrom = if chrom.starts_with("chr") {
Expand Down Expand Up @@ -1046,14 +1046,9 @@ mod test {

let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst";
let tx_db = load_tx_db(tx_path)?;
let provider = Arc::new(MehariProvider::new(
tx_db,
Assembly::Grch37p10,
Default::default(),
));
let provider = Arc::new(MehariProvider::new(tx_db, Default::default()));

let predictor =
ConsequencePredictor::new(provider, Assembly::Grch37p10, Default::default());
let predictor = ConsequencePredictor::new(provider, Default::default());

let res = predictor
.predict(&VcfVariant {
Expand Down Expand Up @@ -1130,7 +1125,6 @@ mod test {
let tx_db = load_tx_db(tx_path)?;
let provider = Arc::new(MehariProvider::new(
tx_db,
Assembly::Grch37p10,
MehariProviderConfigBuilder::default()
.pick_transcript(vec![
TranscriptPickType::ManePlusClinical,
Expand All @@ -1143,7 +1137,6 @@ mod test {
use crate::annotate::seqvars::ConsequencePredictorConfigBuilder;
let predictor = ConsequencePredictor::new(
provider,
Assembly::Grch37p10,
ConsequencePredictorConfigBuilder::default()
.report_most_severe_consequence_by(Some(ConsequenceBy::Gene))
.build()?,
Expand Down Expand Up @@ -1235,7 +1228,6 @@ mod test {
let tx_db = load_tx_db(tx_path)?;
let provider = Arc::new(MehariProvider::new(
tx_db,
Assembly::Grch37p10,
MehariProviderConfigBuilder::default()
.pick_transcript(vec![
TranscriptPickType::ManePlusClinical,
Expand All @@ -1245,8 +1237,7 @@ mod test {
.build()?,
));

let predictor =
ConsequencePredictor::new(provider, Assembly::Grch37p10, Default::default());
let predictor = ConsequencePredictor::new(provider, Default::default());

let res = predictor
.predict(&VcfVariant {
Expand Down Expand Up @@ -1295,7 +1286,6 @@ mod test {
let tx_db = load_tx_db(tx_path)?;
let provider = Arc::new(MehariProvider::new(
tx_db,
Assembly::Grch37p10,
MehariProviderConfigBuilder::default()
.pick_transcript(vec![
TranscriptPickType::ManePlusClinical,
Expand All @@ -1305,8 +1295,7 @@ mod test {
.build()?,
));

let predictor =
ConsequencePredictor::new(provider, Assembly::Grch37p10, Default::default());
let predictor = ConsequencePredictor::new(provider, Default::default());

let res = predictor
.predict(&VcfVariant {
Expand Down Expand Up @@ -1363,7 +1352,6 @@ mod test {
};
let provider = Arc::new(MehariProvider::new(
tx_db,
Assembly::Grch37p10,
MehariProviderConfigBuilder::default()
.pick_transcript(picks)
.build()
Expand All @@ -1377,7 +1365,6 @@ mod test {

let predictor = ConsequencePredictor::new(
provider,
Assembly::Grch37p10,
ConfigBuilder::default()
.report_most_severe_consequence_by(report_most_severe_consequence_by)
.build()
Expand Down Expand Up @@ -1435,7 +1422,6 @@ mod test {

let provider = Arc::new(MehariProvider::new(
tx_db,
Assembly::Grch37p10,
MehariProviderConfigBuilder::default()
.pick_transcript(picks)
.build()
Expand All @@ -1450,7 +1436,6 @@ mod test {

let predictor = ConsequencePredictor::new(
provider,
Assembly::Grch37p10,
ConfigBuilder::default()
.report_most_severe_consequence_by(report_most_severe_consequence_by)
.build()
Expand Down Expand Up @@ -1564,11 +1549,7 @@ mod test {
) -> Result<(), anyhow::Error> {
let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst";
let tx_db = load_tx_db(tx_path)?;
let provider = Arc::new(MehariProvider::new(
tx_db,
Assembly::Grch37p10,
Default::default(),
));
let provider = Arc::new(MehariProvider::new(tx_db, Default::default()));

let report_most_severe_consequence_by = if report_most_severe_consequence_only {
Some(ConsequenceBy::Gene)
Expand All @@ -1578,7 +1559,6 @@ mod test {

let predictor = ConsequencePredictor::new(
provider,
Assembly::Grch37p10,
ConfigBuilder::default()
.report_most_severe_consequence_by(report_most_severe_consequence_by)
.build()
Expand Down
25 changes: 7 additions & 18 deletions src/annotate/seqvars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ pub struct Args {
/// Path to the input PED file.
#[arg(long)]
pub path_input_ped: Option<String>,

/// Path to the input VCF file.
///
#[arg(long)]
pub path_input_vcf: String,

Expand Down Expand Up @@ -641,10 +641,10 @@ impl VarFishSeqvarTsvWriter {
/// chrX, chrY, chrM/chrMT).
fn fill_coords(
&self,
assembly: Assembly,
record: &VcfRecord,
tsv_record: &mut VarFishSeqvarTsvRecord,
) -> Result<bool, anyhow::Error> {
let assembly = self.assembly.expect("assembly must have been set");
tsv_record.release = match assembly {
Assembly::Grch37 | Assembly::Grch37p10 => String::from("GRCh37"),
Assembly::Grch38 => String::from("GRCh38"),
Expand Down Expand Up @@ -1362,11 +1362,7 @@ impl AsyncAnnotatedVariantWriter for VarFishSeqvarTsvWriter {
) -> Result<(), anyhow::Error> {
let mut tsv_record = VarFishSeqvarTsvRecord::default();

if !self.fill_coords(
self.assembly.expect("assembly must have been set"),
record,
&mut tsv_record,
)? {
if !self.fill_coords(record, &mut tsv_record)? {
// Record was not on canonical chromosome and should not be written out.
return Ok(());
}
Expand Down Expand Up @@ -1732,22 +1728,16 @@ impl ConsequenceAnnotator {
Self { predictor }
}

fn from_db_and_args(
tx_db: TxSeqDatabase,
args: &Args,
assembly: Assembly,
) -> anyhow::Result<Self> {
fn from_db_and_args(tx_db: TxSeqDatabase, args: &Args) -> anyhow::Result<Self> {
let provider = Arc::new(MehariProvider::new(
tx_db,
assembly,
MehariProviderConfigBuilder::default()
.pick_transcript(args.pick_transcript.clone())
.pick_transcript_mode(args.pick_transcript_mode)
.build()?,
));
let predictor = ConsequencePredictor::new(
provider,
assembly,
ConsequencePredictorConfigBuilder::default()
.report_most_severe_consequence_by(args.report_most_severe_consequence_by)
.transcript_source(args.transcript_source)
Expand Down Expand Up @@ -1892,7 +1882,7 @@ async fn run_with_writer(
writer.set_assembly(assembly);
tracing::info!("Determined input assembly to be {:?}", &assembly);

let annotator = setup_annotator(args, assembly)?;
let annotator = setup_annotator(args)?;

// Perform the VCF annotation.
tracing::info!("Annotating VCF ...");
Expand Down Expand Up @@ -1951,7 +1941,7 @@ async fn run_with_writer(
Ok(())
}

fn setup_annotator(args: &Args, assembly: Assembly) -> Result<Annotator, Error> {
fn setup_annotator(args: &Args) -> Result<Annotator, Error> {
let mut annotators = vec![];

// Add the frequency annotator if requested.
Expand All @@ -1976,8 +1966,7 @@ fn setup_annotator(args: &Args, assembly: Assembly) -> Result<Annotator, Error>
)?;

annotators.push(
ConsequenceAnnotator::from_db_and_args(tx_db, args, assembly)
.map(AnnotatorEnum::Consequence)?,
ConsequenceAnnotator::from_db_and_args(tx_db, args).map(AnnotatorEnum::Consequence)?,
);
}

Expand Down
32 changes: 15 additions & 17 deletions src/annotate/seqvars/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use std::collections::HashMap;

use crate::annotate::seqvars::{TranscriptPickMode, TranscriptPickType};
use crate::db::create::Reason;
use crate::db::TranscriptDatabase;
use crate::{
annotate::seqvars::csq::ALT_ALN_METHOD,
pbs::txs::{GeneToTxId, Strand, Transcript, TranscriptTag, TxSeqDatabase},
Expand Down Expand Up @@ -40,18 +41,16 @@ pub struct TxIntervalTrees {
}

impl TxIntervalTrees {
pub fn new(db: &TxSeqDatabase, assembly: Assembly) -> Self {
let (contig_to_idx, trees) = Self::build_indices(db, assembly);
pub fn new(db: &TxSeqDatabase) -> Self {
let (contig_to_idx, trees) = Self::build_indices(db);
Self {
contig_to_idx,
trees,
}
}

fn build_indices(
db: &TxSeqDatabase,
assembly: Assembly,
) -> (HashMap<String, usize>, Vec<IntervalTree>) {
fn build_indices(db: &TxSeqDatabase) -> (HashMap<String, usize>, Vec<IntervalTree>) {
let assembly = db.assembly();
let mut contig_to_idx = HashMap::new();
let mut trees: Vec<IntervalTree> = Vec::new();

Expand Down Expand Up @@ -134,8 +133,6 @@ pub struct Provider {
/// When transcript picking is enabled, contains the `GeneToTxIdx` entries
/// for each gene; the order matches the one of `tx_seq_db.gene_to_tx`.
picked_gene_to_tx_id: Option<Vec<GeneToTxId>>,
/// The assembly of the provider.
assembly: Assembly,
/// The data version.
data_version: String,
/// The schema version.
Expand Down Expand Up @@ -164,9 +161,8 @@ impl Provider {
/// # Arguments
///
/// * `tx_seq_db` - The `TxSeqDatabase` to use.
/// * `assembly` - The assembly to use.
pub fn new(mut tx_seq_db: TxSeqDatabase, assembly: Assembly, config: Config) -> Self {
let tx_trees = TxIntervalTrees::new(&tx_seq_db, assembly);
pub fn new(mut tx_seq_db: TxSeqDatabase, config: Config) -> Self {
let tx_trees = TxIntervalTrees::new(&tx_seq_db);
let gene_map = HashMap::from_iter(
tx_seq_db
.tx_db
Expand Down Expand Up @@ -339,10 +335,13 @@ impl Provider {
// unique, such that `data_version` and `schema_version` can be used as keys for
// caching.
let data_version = format!(
"{:?}{}{}{:?}",
assembly,
"{}{}{:?}",
tx_seq_db.version.as_ref().unwrap_or(&"".to_string()),
tx_seq_db.genome_release.as_ref().unwrap_or(&"".to_string()),
tx_seq_db
.source_version
.iter()
.map(|v| format!("{:#?}", v))
.join(","),
config
);
let schema_version = data_version.clone();
Expand All @@ -353,7 +352,6 @@ impl Provider {
tx_map,
seq_map,
picked_gene_to_tx_id,
assembly,
data_version,
schema_version,
}
Expand All @@ -365,7 +363,7 @@ impl Provider {
///
/// The assembly of the provider.
pub fn assembly(&self) -> Assembly {
self.assembly
self.tx_seq_db.assembly()
}

/// Return whether transcript picking is enabled.
Expand Down Expand Up @@ -603,7 +601,7 @@ impl ProviderInterface for Provider {
} else {
Some(Err(Error::NoAlignmentFound(
tx_ac.to_string(),
format!("{:?}", self.assembly),
format!("{:?}", self.assembly()),
)))
}
} else {
Expand Down
Loading

0 comments on commit 111ffcc

Please sign in to comment.