Merge branch 'v0.5.0' into main

bluenote-1577 · Dec 23, 2023 · 838cf0e · 838cf0e
2 parents c56eb25 + 82c7a61
commit 838cf0e
Show file tree

Hide file tree

Showing 15 changed files with 584 additions and 369 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,32 @@
+## sylph v0.5.0 release: **Big improvements on real illumina data** : Dec 23 2023 
+
+### Major
+
+**In previous versions, sylph was underperforming on real illumina data sets**. See https://github.com/bluenote-1577/sylph/issues/5 
+
+This is because many real illumina datasets have a non-trivial number of duplicate reads. Duplicate reads mess up sylph's statistical model.
+
+For the single and paired sketching options, a new deduplication routine has been added. This will be described in version 2 of our preprint. 
+
+**This increases sketching memory by 3-4x but greatly increases performance on real datasets with > 1-2% of duplication, especially for low-abundance genomes**. 
+
+For paired-end illumina reads with non-trivial (> 1% duplication), sylph can now 
+
+1. detect up to many more species low-abundance species below 0.3x coverage
+2. give better coverage/abundance estimates for low-abundance species 
+
+### BREAKING
+
+- sequence sketches (sylsp) have changed formats. Sequences will need to be re-sketched.
+- `--read-length` option removed and incorporated into the sketches by default. (suggested by @fplaza)
+
+### Other changes
+
+- New warning when `-o` specified and only reads are sketched (https://github.com/bluenote-1577/sylph/issues/7)
+- You can now rename sylph samples by specifing a sample naming file with `--sample-names` or `--lS` (suggested by @jolespin)
+- Newline delimited files are available in `profile` and `query` now (suggested by @jolespin)
+
+
 ## sylph v0.4.1 release: getting ready for preprinting
 
 ### Minor

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "sylph"
-version = "0.4.1"
+version = "0.5.0"
 edition = "2021"
 license = "MIT OR Apache-2.0"
 
@@ -14,7 +14,6 @@ rayon = "1"
 smallvec = { version = "1", features = ["union","serde","write"] }
 serde = { version = "1", features = ["derive"] }
 bincode = "1"
-seq_io = "0.3"
 fxhash = "0"
 clap = { version = "3", features = ["derive"] }
 flate2 = { version = "1.0.17", features = ["zlib-ng"], default-features = false }

diff --git a/scripts/sylph_to_taxprof.py b/scripts/sylph_to_taxprof.py
@@ -32,7 +32,7 @@ def genome_file_to_gtdb_acc(file_name):
 for row in f:
     spl = row.rstrip().split();
     accession = spl[0]
-    taxonomy = spl[1].rstrip() + ';t__' + accession
+    taxonomy = ' '.join(spl[1:]).rstrip() + ';t__' + accession
     genome_to_taxonomy[accession] = taxonomy
 
 ### Group by sample file. Output one file fo reach sample file. 
@@ -55,10 +55,12 @@ def genome_file_to_gtdb_acc(file_name):
         genome_file = genome_file_to_gtdb_acc(row['Genome_file'])
         ani = float(row['Adjusted_ANI'])
 
-        if genome_file not in genome_to_taxonomy:
-            tax_str = 'NO_TAXONOMY;t__' + genome_file
-        else:
+        if genome_file in genome_to_taxonomy:
             tax_str = genome_to_taxonomy[genome_file]
+        elif genome_file +'.gz' in genome_to_taxonomy:
+            tax_str = genome_to_taxonomy[genome_file + '.gz']
+        else:
+            tax_str = 'NO_TAXONOMY;t__' + genome_file
 
         abundance = float(row['Sequence_abundance'])
         rel_abundance = float(row['Taxonomic_abundance'])
@@ -79,10 +81,21 @@ def genome_file_to_gtdb_acc(file_name):
     of.write(f"#SampleID\t{sample_file}\n")
     of.write("#clade_name\trelative_abundance\tsequence_abundance\tANI (if strain-level)\n")
 
-    sorted_keys = sorted(tax_abundance.keys())
-    for tax in sorted_keys:
-        if tax in ani_dict:
-            of.write(f"{tax}\t{tax_abundance[tax]}\t{seq_abundance[tax]}\t{ani_dict[tax]}\n")
+    level_to_key = dict()
+    for key in tax_abundance.keys():
+        level = len(key.split('|'))
+        if level not in level_to_key:
+            level_to_key[level] = [key]
         else:
-            of.write(f"{tax}\t{tax_abundance[tax]}\t{seq_abundance[tax]}\tNA\n")
+            level_to_key[level].append(key)
+
+    sorted_keys = sorted(level_to_key.keys())
+
+    for level in sorted_keys:
+        keys_for_level = sorted(level_to_key[level], key = lambda x: tax_abundance[x], reverse=True)
+        for tax in keys_for_level:
+            if tax in ani_dict:
+                of.write(f"{tax}\t{tax_abundance[tax]}\t{seq_abundance[tax]}\t{ani_dict[tax]}\n")
+            else:
+                of.write(f"{tax}\t{tax_abundance[tax]}\t{seq_abundance[tax]}\tNA\n")
 
diff --git a/src/cmdline.rs b/src/cmdline.rs
@@ -20,27 +20,32 @@ pub enum Mode {
 
 #[derive(Args, Default)]
 pub struct SketchArgs {
-    #[clap(multiple=true, help = "fasta/fastq files; gzip optional. Default: fastq file produces a sample sketch (*.sylsp) while fasta files are combined into a database (*.syldb).")]
+    #[clap(multiple=true, help_heading = "INPUT", help = "fasta/fastq files; gzip optional. Default: fastq file produces a sample sketch (*.sylsp) while fasta files are combined into a database (*.syldb).")]
     pub files: Vec<String>,
     #[clap(short='o',long="out-name-db", default_value = "database", help_heading = "OUTPUT", help = "Output name for database sketch (with .syldb appended)")]
     pub db_out_name: String,
     #[clap(short='d',long="sample-output-directory", default_value = "./", help_heading = "OUTPUT", help = "Output directory for sample sketches")]
     pub sample_output_dir: String,
-    #[clap(short,long="individual-records", help_heading = "INPUT", help = "Use individual records (contigs) for database construction")]
+    #[clap(short,long="individual-records", help_heading = "GENOME INPUT", help = "Use individual records (contigs) for database construction")]
     pub individual: bool,
-    #[clap(multiple=true,short,long="reads", help_heading = "INPUT", help = "Single-end fasta/fastq reads")]
+    #[clap(multiple=true,short,long="reads", help_heading = "SINGLE-END INPUT", help = "Single-end fasta/fastq reads")]
     pub reads: Option<Vec<String>>,
-    #[clap(multiple=true,short='g', long="genomes", help_heading = "INPUT", help = "Genomes in fasta format")]
+    #[clap(multiple=true,short='g', long="genomes", help_heading = "GENOME INPUT", help = "Genomes in fasta format")]
     pub genomes: Option<Vec<String>>,
     #[clap(short,long="list", help_heading = "INPUT", help = "Newline delimited file with inputs; fastas -> database, fastq -> sample")]
     pub list_sequence: Option<String>,
-
-    #[clap(long="rl", hidden=true, help_heading = "INPUT", help = "Newline delimited file; inputs assumed reads")]
+    #[clap(long="rl", hidden=true, help_heading = "SINGLE-END INPUT", help = "Newline delimited file; inputs assumed reads")]
     pub list_reads: Option<String>,
-
-    #[clap(long="gl", help_heading = "INPUT", help = "Newline delimited file; inputs assumed genomes")]
+    #[clap(long="gl", help_heading = "GENOME INPUT", help = "Newline delimited file; inputs assumed genomes")]
     pub list_genomes: Option<String>,
-
+    #[clap(long="l1", help_heading = "PAIRED-END INPUT", help = "Newline delimited file; inputs are first pair of PE reads")]
+    pub list_first_pair: Option<String>,
+    #[clap(long="l2", help_heading = "PAIRED-END INPUT", help = "Newline delimited file; inputs are second pair of PE reads")]
+    pub list_second_pair: Option<String>,
+    #[clap(long="lS", help_heading = "INPUT", help = "Newline delimited file; read sketches are renamed to given sample names")]
+    pub list_sample_names: Option<String>,
+    #[clap(short='S', long="sample-names", help_heading = "INPUT", help = "Read sketches are renamed to given sample names")]
+    pub sample_names: Option<Vec<String>>,
 
     #[clap(short, default_value_t = 31,help_heading = "ALGORITHM", help ="Value of k. Only k = 21, 31 are currently supported")]
     pub k: usize,
@@ -55,20 +60,27 @@ pub struct SketchArgs {
     #[clap(long="debug", help = "Debug output")]
     pub debug: bool,
 
+
+    #[clap(long="no-dedup", help_heading = "ALGORITHM", help = "Disable read deduplication procedure. Reduces memory; not recommended for illumina data")]
+    pub no_dedup: bool,
     #[clap(long="disable-profiling", help_heading = "ALGORITHM", help = "Disable sylph profile usage for databases; may decrease size and make sylph query slightly faster")]
     pub no_pseudotax: bool,
     #[clap(long="min-spacing", default_value_t = 30, help_heading = "ALGORITHM", help = "Minimum spacing between selected k-mers on the genomes")]
     pub min_spacing_kmer: usize,
-    #[clap(short='1',long="first-pairs", multiple=true, help_heading = "INPUT", help = "First pairs for paired end reads")]
+    #[clap(short='1',long="first-pairs", multiple=true, help_heading = "PAIRED-END INPUT", help = "First pairs for paired end reads")]
     pub first_pair: Vec<String>,
-    #[clap(short='2',long="second-pairs", multiple=true, help_heading = "INPUT", help = "Second pairs for paired end reads")]
+    #[clap(short='2',long="second-pairs", multiple=true, help_heading = "PAIRED-END INPUT", help = "Second pairs for paired end reads")]
     pub second_pair: Vec<String>,
 }
 
 #[derive(Args)]
 pub struct ContainArgs {
     #[clap(multiple=true, help = "Pre-sketched *.syldb/*.sylsp files. Raw fastq/fasta are allowed and will be automatically sketched to .sylsp/.syldb")]
     pub files: Vec<String>,
+
+    #[clap(short='l',long="list", help = "Newline delimited file of file inputs")]
+    pub file_list: Option<String>,
+
     #[clap(long,default_value_t = 3., help_heading = "ALGORITHM", help = "Minimum k-mer multiplicity needed for coverage correction. Higher values gives more precision but lower sensitivity")]
     pub min_count_correct: f64,
     #[clap(short='M',long,default_value_t = 50., help_heading = "ALGORITHM", help = "Exclude genomes with less than this number of sampled k-mers")]
@@ -90,8 +102,11 @@ pub struct ContainArgs {
     #[clap(short='I',long="read-seq-id", help_heading = "ALGORITHM", help = "Mean sequence identity of reads (0-100). Only used if --estimate-unknown is toggled. Consider this if automatic identity estimate fails" )]
     pub seq_id: Option<f64>,
 
-    #[clap(short='l', long="read-length", help_heading = "ALGORITHM", help = "Read length (single-end length for pairs). Only necessary for short-read coverages when using --estimate-unknown. Not needed for long-reads" )]
-    pub read_length: Option<usize>,
+    //#[clap(short='l', long="read-length", help_heading = "ALGORITHM", help = "Read length (single-end length for pairs). Only necessary for short-read coverages when using --estimate-unknown. Not needed for long-reads" )]
+    //pub read_length: Option<usize>,
+
+    #[clap(short='R', long="redundancy-threshold", help_heading = "ALGORITHM", help = "Removes redundant genomes up to a rough ANI percentile when profiling", default_value_t = 99.0, hidden=true)]
+    pub redundant_ani: f64,
 
     #[clap(short, default_value_t = 200, help_heading = "SKETCHING", help = "Subsampling rate. Does nothing for pre-sketched files")]
     pub c: usize,

diff --git a/src/constants.rs b/src/constants.rs
@@ -10,3 +10,5 @@ pub const SAMPLE_FILE_SUFFIX_VALID : [&str;2] = [SAMPLE_FILE_SUFFIX, ".sylsample
 pub const MIN_ANI_DEF: f64 = 0.9;
 pub const MIN_ANI_P_DEF: f64 = 0.95;
 pub const MAX_MEDIAN_FOR_MEAN_FINAL_EST: f64 = 15.;
+pub const DEREP_PROFILE_ANI: f64 = 0.975;
+pub const MAX_DEDUP_COUNT: u32 = 4;