Skip to content

Commit

Permalink
Merge branch 'v0.5.0' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
bluenote-1577 committed Dec 23, 2023
2 parents c56eb25 + 82c7a61 commit 838cf0e
Show file tree
Hide file tree
Showing 15 changed files with 584 additions and 369 deletions.
29 changes: 29 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,32 @@
## sylph v0.5.0 release: **Big improvements on real illumina data** : Dec 23 2023

### Major

**In previous versions, sylph was underperforming on real illumina data sets**. See https://github.com/bluenote-1577/sylph/issues/5

This is because many real illumina datasets have a non-trivial number of duplicate reads. Duplicate reads mess up sylph's statistical model.

For the single and paired sketching options, a new deduplication routine has been added. This will be described in version 2 of our preprint.

**This increases sketching memory by 3-4x but greatly increases performance on real datasets with > 1-2% of duplication, especially for low-abundance genomes**.

For paired-end illumina reads with non-trivial (> 1% duplication), sylph can now

1. detect up to many more species low-abundance species below 0.3x coverage
2. give better coverage/abundance estimates for low-abundance species

### BREAKING

- sequence sketches (sylsp) have changed formats. Sequences will need to be re-sketched.
- `--read-length` option removed and incorporated into the sketches by default. (suggested by @fplaza)

### Other changes

- New warning when `-o` specified and only reads are sketched (https://github.com/bluenote-1577/sylph/issues/7)
- You can now rename sylph samples by specifing a sample naming file with `--sample-names` or `--lS` (suggested by @jolespin)
- Newline delimited files are available in `profile` and `query` now (suggested by @jolespin)


## sylph v0.4.1 release: getting ready for preprinting

### Minor
Expand Down
23 changes: 1 addition & 22 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sylph"
version = "0.4.1"
version = "0.5.0"
edition = "2021"
license = "MIT OR Apache-2.0"

Expand All @@ -14,7 +14,6 @@ rayon = "1"
smallvec = { version = "1", features = ["union","serde","write"] }
serde = { version = "1", features = ["derive"] }
bincode = "1"
seq_io = "0.3"
fxhash = "0"
clap = { version = "3", features = ["derive"] }
flate2 = { version = "1.0.17", features = ["zlib-ng"], default-features = false }
Expand Down
31 changes: 22 additions & 9 deletions scripts/sylph_to_taxprof.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def genome_file_to_gtdb_acc(file_name):
for row in f:
spl = row.rstrip().split();
accession = spl[0]
taxonomy = spl[1].rstrip() + ';t__' + accession
taxonomy = ' '.join(spl[1:]).rstrip() + ';t__' + accession
genome_to_taxonomy[accession] = taxonomy

### Group by sample file. Output one file fo reach sample file.
Expand All @@ -55,10 +55,12 @@ def genome_file_to_gtdb_acc(file_name):
genome_file = genome_file_to_gtdb_acc(row['Genome_file'])
ani = float(row['Adjusted_ANI'])

if genome_file not in genome_to_taxonomy:
tax_str = 'NO_TAXONOMY;t__' + genome_file
else:
if genome_file in genome_to_taxonomy:
tax_str = genome_to_taxonomy[genome_file]
elif genome_file +'.gz' in genome_to_taxonomy:
tax_str = genome_to_taxonomy[genome_file + '.gz']
else:
tax_str = 'NO_TAXONOMY;t__' + genome_file

abundance = float(row['Sequence_abundance'])
rel_abundance = float(row['Taxonomic_abundance'])
Expand All @@ -79,10 +81,21 @@ def genome_file_to_gtdb_acc(file_name):
of.write(f"#SampleID\t{sample_file}\n")
of.write("#clade_name\trelative_abundance\tsequence_abundance\tANI (if strain-level)\n")

sorted_keys = sorted(tax_abundance.keys())
for tax in sorted_keys:
if tax in ani_dict:
of.write(f"{tax}\t{tax_abundance[tax]}\t{seq_abundance[tax]}\t{ani_dict[tax]}\n")
level_to_key = dict()
for key in tax_abundance.keys():
level = len(key.split('|'))
if level not in level_to_key:
level_to_key[level] = [key]
else:
of.write(f"{tax}\t{tax_abundance[tax]}\t{seq_abundance[tax]}\tNA\n")
level_to_key[level].append(key)

sorted_keys = sorted(level_to_key.keys())

for level in sorted_keys:
keys_for_level = sorted(level_to_key[level], key = lambda x: tax_abundance[x], reverse=True)
for tax in keys_for_level:
if tax in ani_dict:
of.write(f"{tax}\t{tax_abundance[tax]}\t{seq_abundance[tax]}\t{ani_dict[tax]}\n")
else:
of.write(f"{tax}\t{tax_abundance[tax]}\t{seq_abundance[tax]}\tNA\n")

41 changes: 28 additions & 13 deletions src/cmdline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,32 @@ pub enum Mode {

#[derive(Args, Default)]
pub struct SketchArgs {
#[clap(multiple=true, help = "fasta/fastq files; gzip optional. Default: fastq file produces a sample sketch (*.sylsp) while fasta files are combined into a database (*.syldb).")]
#[clap(multiple=true, help_heading = "INPUT", help = "fasta/fastq files; gzip optional. Default: fastq file produces a sample sketch (*.sylsp) while fasta files are combined into a database (*.syldb).")]
pub files: Vec<String>,
#[clap(short='o',long="out-name-db", default_value = "database", help_heading = "OUTPUT", help = "Output name for database sketch (with .syldb appended)")]
pub db_out_name: String,
#[clap(short='d',long="sample-output-directory", default_value = "./", help_heading = "OUTPUT", help = "Output directory for sample sketches")]
pub sample_output_dir: String,
#[clap(short,long="individual-records", help_heading = "INPUT", help = "Use individual records (contigs) for database construction")]
#[clap(short,long="individual-records", help_heading = "GENOME INPUT", help = "Use individual records (contigs) for database construction")]
pub individual: bool,
#[clap(multiple=true,short,long="reads", help_heading = "INPUT", help = "Single-end fasta/fastq reads")]
#[clap(multiple=true,short,long="reads", help_heading = "SINGLE-END INPUT", help = "Single-end fasta/fastq reads")]
pub reads: Option<Vec<String>>,
#[clap(multiple=true,short='g', long="genomes", help_heading = "INPUT", help = "Genomes in fasta format")]
#[clap(multiple=true,short='g', long="genomes", help_heading = "GENOME INPUT", help = "Genomes in fasta format")]
pub genomes: Option<Vec<String>>,
#[clap(short,long="list", help_heading = "INPUT", help = "Newline delimited file with inputs; fastas -> database, fastq -> sample")]
pub list_sequence: Option<String>,

#[clap(long="rl", hidden=true, help_heading = "INPUT", help = "Newline delimited file; inputs assumed reads")]
#[clap(long="rl", hidden=true, help_heading = "SINGLE-END INPUT", help = "Newline delimited file; inputs assumed reads")]
pub list_reads: Option<String>,

#[clap(long="gl", help_heading = "INPUT", help = "Newline delimited file; inputs assumed genomes")]
#[clap(long="gl", help_heading = "GENOME INPUT", help = "Newline delimited file; inputs assumed genomes")]
pub list_genomes: Option<String>,

#[clap(long="l1", help_heading = "PAIRED-END INPUT", help = "Newline delimited file; inputs are first pair of PE reads")]
pub list_first_pair: Option<String>,
#[clap(long="l2", help_heading = "PAIRED-END INPUT", help = "Newline delimited file; inputs are second pair of PE reads")]
pub list_second_pair: Option<String>,
#[clap(long="lS", help_heading = "INPUT", help = "Newline delimited file; read sketches are renamed to given sample names")]
pub list_sample_names: Option<String>,
#[clap(short='S', long="sample-names", help_heading = "INPUT", help = "Read sketches are renamed to given sample names")]
pub sample_names: Option<Vec<String>>,

#[clap(short, default_value_t = 31,help_heading = "ALGORITHM", help ="Value of k. Only k = 21, 31 are currently supported")]
pub k: usize,
Expand All @@ -55,20 +60,27 @@ pub struct SketchArgs {
#[clap(long="debug", help = "Debug output")]
pub debug: bool,


#[clap(long="no-dedup", help_heading = "ALGORITHM", help = "Disable read deduplication procedure. Reduces memory; not recommended for illumina data")]
pub no_dedup: bool,
#[clap(long="disable-profiling", help_heading = "ALGORITHM", help = "Disable sylph profile usage for databases; may decrease size and make sylph query slightly faster")]
pub no_pseudotax: bool,
#[clap(long="min-spacing", default_value_t = 30, help_heading = "ALGORITHM", help = "Minimum spacing between selected k-mers on the genomes")]
pub min_spacing_kmer: usize,
#[clap(short='1',long="first-pairs", multiple=true, help_heading = "INPUT", help = "First pairs for paired end reads")]
#[clap(short='1',long="first-pairs", multiple=true, help_heading = "PAIRED-END INPUT", help = "First pairs for paired end reads")]
pub first_pair: Vec<String>,
#[clap(short='2',long="second-pairs", multiple=true, help_heading = "INPUT", help = "Second pairs for paired end reads")]
#[clap(short='2',long="second-pairs", multiple=true, help_heading = "PAIRED-END INPUT", help = "Second pairs for paired end reads")]
pub second_pair: Vec<String>,
}

#[derive(Args)]
pub struct ContainArgs {
#[clap(multiple=true, help = "Pre-sketched *.syldb/*.sylsp files. Raw fastq/fasta are allowed and will be automatically sketched to .sylsp/.syldb")]
pub files: Vec<String>,

#[clap(short='l',long="list", help = "Newline delimited file of file inputs")]
pub file_list: Option<String>,

#[clap(long,default_value_t = 3., help_heading = "ALGORITHM", help = "Minimum k-mer multiplicity needed for coverage correction. Higher values gives more precision but lower sensitivity")]
pub min_count_correct: f64,
#[clap(short='M',long,default_value_t = 50., help_heading = "ALGORITHM", help = "Exclude genomes with less than this number of sampled k-mers")]
Expand All @@ -90,8 +102,11 @@ pub struct ContainArgs {
#[clap(short='I',long="read-seq-id", help_heading = "ALGORITHM", help = "Mean sequence identity of reads (0-100). Only used if --estimate-unknown is toggled. Consider this if automatic identity estimate fails" )]
pub seq_id: Option<f64>,

#[clap(short='l', long="read-length", help_heading = "ALGORITHM", help = "Read length (single-end length for pairs). Only necessary for short-read coverages when using --estimate-unknown. Not needed for long-reads" )]
pub read_length: Option<usize>,
//#[clap(short='l', long="read-length", help_heading = "ALGORITHM", help = "Read length (single-end length for pairs). Only necessary for short-read coverages when using --estimate-unknown. Not needed for long-reads" )]
//pub read_length: Option<usize>,

#[clap(short='R', long="redundancy-threshold", help_heading = "ALGORITHM", help = "Removes redundant genomes up to a rough ANI percentile when profiling", default_value_t = 99.0, hidden=true)]
pub redundant_ani: f64,

#[clap(short, default_value_t = 200, help_heading = "SKETCHING", help = "Subsampling rate. Does nothing for pre-sketched files")]
pub c: usize,
Expand Down
2 changes: 2 additions & 0 deletions src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ pub const SAMPLE_FILE_SUFFIX_VALID : [&str;2] = [SAMPLE_FILE_SUFFIX, ".sylsample
pub const MIN_ANI_DEF: f64 = 0.9;
pub const MIN_ANI_P_DEF: f64 = 0.95;
pub const MAX_MEDIAN_FOR_MEAN_FINAL_EST: f64 = 15.;
pub const DEREP_PROFILE_ANI: f64 = 0.975;
pub const MAX_DEDUP_COUNT: u32 = 4;
Loading

0 comments on commit 838cf0e

Please sign in to comment.