From 09ea92da2df7604605079b14155d7c7edb944152 Mon Sep 17 00:00:00 2001 From: Franklin Delehelle Date: Wed, 3 Oct 2018 11:49:24 +0200 Subject: [PATCH] =?UTF-8?q?Rename=20=E2=80=9Ctranslate=E2=80=9D=20to=20mor?= =?UTF-8?q?e=20correct=20=E2=80=9Ccomplement=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 122 +++++++++++++++++++++++++++------------- src/automaton.rs | 2 +- src/bin/asgart-plot.rs | 45 ++++++++------- src/bin/asgart.rs | 30 +++++----- src/bin/asgart.yaml | 22 +++++--- src/bin/plot.yaml | 12 ++-- src/plot/chord_plot.rs | 11 +--- src/plot/flat_plot.rs | 1 - src/plot/genome_plot.rs | 1 - src/plot/mod.rs | 4 +- src/structs.rs | 8 ++- src/utils.rs | 6 +- 12 files changed, 153 insertions(+), 111 deletions(-) diff --git a/README.md b/README.md index f9262b3..88cced6 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,27 @@ # ASGART: a large duplications finder -`asgart` (A Segmental duplications Gathering and Refinement Tool) is a multiplatform (GNU/Linux, macOS, Windows) tool designed to search for large duplications amongst one or two DNA strands. +`asgart` (A Segmental duplications Gathering and Refinement Tool) is a +multiplatform (GNU/Linux, macOS, Windows) tool designed to search for +large duplications amongst one or two DNA strands. ## Licensing -Asgart is distributed under the GPLv3 license. Please see the LICENSE file. +Asgart is distributed under the GPLv3 license. Please see the LICENSE +file. # Why should I use ASGART? 
-![A map of the Human genome long segmental duplications](screenshots/chord.png) +![A map of the Human genome long segmental +duplications](screenshots/chord.png) You should use ASGART if -- you want to find segmental duplications, either direct, reversed and/or translate in a DNA sequence; +- you want to find segmental duplications, either direct, reversed + and/or complement in a DNA sequence; -- you want to find highly similar parts inbetween sequences up to the genome scale; +- you want to find highly similar parts inbetween sequences up to the + genome scale; - you want to map highly similar sequences amongst genomes; @@ -37,7 +43,8 @@ Binaries for Windows are not yet available. ## From sources -To build ASGART from sources, you need CMake, a C compiler and the [Rust compiler](https://www.rust-lang.org/en-US/install.html). +To build ASGART from sources, you need CMake, a C compiler and the +[Rust compiler](https://www.rust-lang.org/en-US/install.html). Once these requirement are installed, clone the repository @@ -54,7 +61,7 @@ You can then build ASGART by running the Rust building tool cargo build --release ``` -Once the build is finished, you'll find the binary in `target/release/`. +Once the build is finished, you will find the binary in `target/release/`. # Usage @@ -64,7 +71,7 @@ Once the build is finished, you'll find the binary in `target/release/`. First, let us take a look at a simple example: ``` -asgart seq.fasta seq.fasta 20 100 +asgart seq.fasta seq.fasta ``` This command will look for duplications in the `seq.fasta` file, then @@ -72,23 +79,27 @@ write them in a JSON file in the folder where it was launched. ASGART will probe using 20-mers, and guarantee that no duplication will include gaps longer than 100bp in their arm-to-arm pairwise alignment. -If you wish to look for reversed-translated duplications, use the -`-RT` option. And the `-v` option will give you more informations, as -well as a visual overview of the progress. 
+If you wish to look for reversed-complemented duplications, use the +`-R` and `-C` options, which can be combined as `-RC`. The `-v` +option will give you more information, as well as a visual overview +of the progress. ``` -asgart seq.fasta seq.fasta 20 100 -RTv +asgart seq.fasta seq.fasta -RCv ``` ## Input -As input, ASGART takes FASTA files containing the sequences within which to look for duplications. They can be either in the FASTA or multiFASTA format. If the input files are `s2 +As input, ASGART takes FASTA files containing the sequences within +which to look for duplications. They can be either in the FASTA or +multiFASTA format. ## Output ### JSON -By default, ASGART will write its result in a JSON file in the folder where it was launched, following the following structure: +By default, ASGART will write its result in a JSON file in the folder +where it was launched, with the following structure: ``` { @@ -116,8 +127,14 @@ By default, ASGART will write its result in a JSON file in the folder where it w ] }, - "kmer": probing kmer size, - "gap": maximum gap inbetween duplication arms, + "settings": { + "probe_size": probe size used, + "max_gap_size": maximal gap size used, + "min_duplication_length": minimal length for a duplicon, + "max_cardinality": maximal size of a family, + "skip_masked": were masked nucleotides skipped?, + "interlaced": were interlaced duplications looked for? + }, "sds": [ { @@ -125,7 +142,7 @@ By default, ASGART will write its result in a JSON file in the folder where it w "right": position of the right arm in the second file, "length": length of the duplication (bp), "reversed": true if the duplication is reversed, false else, - "translated": true if the duplication is translated, false else + "complemented": true if the duplication is complemented, false otherwise }, ... 
] @@ -134,8 +151,9 @@ By default, ASGART will write its result in a JSON file in the folder where it w ### GFF -ASGART can also write its results in GFF2 or GFF3 files by using the `--format` option. For instance, use `--format gff3` to -save the results in a GFF3 file. +ASGART can also write its results in GFF2 or GFF3 files by using the +`--format` option. For instance, use `--format gff3` to save the +results in a GFF3 file. ## Options @@ -145,32 +163,42 @@ save the results in a GFF3 file. - `--reverse`/`-R` look for duplication which second arm is reversed - - `--translate`/`-T` look for duplication which second arm is translated + - `--complement`/`-C` look for duplication which second arm is + complemented - - `--max-cardinality` specifies the maximal count of members in a duplication family (default: 1000) + - `--max-cardinality` specifies the maximal count of members in a + duplication family (default: 1000) - - `--min-length SIZE` specifies the minimal length (in bp) over which a duplication is kept in the final result and not discarded (default: 1000) + - `--min-length SIZE` specifies the minimal length (in bp) over + which a duplication is kept in the final result and not discarded + (default: 1000) - - `--skip-masked`/`-S` skip soft-masked zones, _i.e._ lowercased parts of the input files (default: no) + - `--skip-masked`/`-S` skip soft-masked zones, _i.e._ lowercased + parts of the input files (default: no) ### Technical - `-h`, `--help` display an help screen - - `--out FILENAME` specifies the file in which the results will be written + - `--out FILENAME` specifies the file in which the results will be + written - - `--prefix NAME` defines a prefix to prepend to the standard out file name + - `--prefix NAME` defines a prefix to prepend to the standard out + file name - - `--format OUT_FORMAT` sets the output format. Default is `json`, but can be set to gff2 or gff3 + - `--format OUT_FORMAT` sets the output format. 
Default is `json`, + but can be set to gff2 or gff3 - - `--threads COUNT` set the numbers of thread to use. Defaults to the number of cores abailable on the CPU + - `--threads COUNT` set the number of threads to use. Defaults to + the number of cores available on the CPU - - `--trim START END` run ASGART only on the specified area of the first file + - `--trim START END` run ASGART only on the specified area of the + first file # Plotting ASGART comes with a plotting tool, producing a visual overview of the -duplications. Currently, two type of graphs are available: chord +duplications. Currently, two types of graphs are available: chord graphs, or flat graphs. ## Options @@ -179,28 +207,36 @@ graphs, or flat graphs. - `--out FILENAME` set output file name - - `--min-length` set the minimal length (in bp) for a duplication to be plotted (default: 5000bp) + - `--min-length` set the minimal length (in bp) for a duplication to + be plotted (default: 5000bp) - - `--min-identity` set the minimal identity rate (in %) for a duplication to be plotted (default: 0%). + - `--min-identity` set the minimal identity rate (in %) for a + duplication to be plotted (default: 0%). - `--no-direct` do not plot direct duplications - `--no-reversed` do not plot reversed duplications - - `--no-untranslated` do not plot non-translated duplications + - `--no-uncomplemented` do not plot non-complemented duplications - - `--no-translated` do not plot translated duplications + - `--no-complemented` do not plot complemented duplications - - `--features FILE` add an additional track containing features to plot alongside the duplications. + - `--features FILE` add an additional track containing features to + plot alongside the duplications. - - `--filter-features DISTANCE` don't plot duplications that are farther away then `DISTANCE` bp from the features in the track. + - `--filter-features DISTANCE` don't plot duplications that are + farther away than `DISTANCE` bp from the features in the track. 
### Feature file format -The feature file format contains a list of lines with three values separated by semi-colons. +The feature file format contains a list of lines with three values +separated by semicolons. 1. The label of the feature. -2. the start of the feaure. It may either be a single integer representing its absolute coordinate, or be of the form `NAME+OFFSET`, defining a start position at `OFFSET` from the start of `NAME` chromosomes (from the input FASTA file). +2. The start of the feature. It may either be a single integer + representing its absolute coordinate, or be of the form + `NAME+OFFSET`, defining a start position at `OFFSET` from the start + of the `NAME` chromosome (from the input FASTA file). 3. The length of the feaure in base pairs. Comment lines starts with a `#`. @@ -217,7 +253,10 @@ Foo;123456789;1250 ## Chord graphs -A chord graph represents duplications amongst a DNA fragment as arcs linking point on a circle figuring a fragment bend over itself. Their width is directly proportional to the length of the duplications they represent. +A chord graph represents duplications amongst a DNA fragment as arcs +linking points on a circle representing the fragment bent over itself. Their +width is directly proportional to the length of the duplications they +represent. ### Example @@ -227,10 +266,13 @@ A chord graph represents duplications amongst a DNA fragment as arcs linking poi ## Flat graphs -Flat graphs are made of two superposed horizontal lines, representing the two fragments analyzed by ASGART, with lines linking left and right parts of the duplications found, their width proportional to the length of the duplication. +Flat graphs are made of two superposed horizontal lines, representing +the two fragments analyzed by ASGART, with lines linking left and +right parts of the duplications found, their width proportional to the +length of the duplication. 
### Example -`asgart-plot human_Y.json flat --out=flat.svg --no-direct --no-untranslated --min-length 2000` +`asgart-plot human_Y.json flat --out=flat.svg --no-direct --no-uncomplemented --min-length 2000` ![Flat graph example](screenshots/flat.png) diff --git a/src/automaton.rs b/src/automaton.rs index 86ded08..d1fd1a7 100644 --- a/src/automaton.rs +++ b/src/automaton.rs @@ -79,7 +79,7 @@ fn make_duplications(psd: &ProtoSD, length: size, identity: 0.0, reversed: false, - translated: false, + complemented: false, }); } diff --git a/src/bin/asgart-plot.rs b/src/bin/asgart-plot.rs index 186c2f0..b42357c 100644 --- a/src/bin/asgart-plot.rs +++ b/src/bin/asgart-plot.rs @@ -16,7 +16,6 @@ use std::fs::File; use std::path::Path; use clap::{App, AppSettings}; use colored::Colorize; -use bio::io::gff; use asgart::structs::*; use asgart::plot::*; use asgart::plot::chord_plot::ChordPlotter; @@ -110,7 +109,7 @@ fn read_feature_file(r: &RunResult, file: &str) -> Result> { } } -fn read_gff3_feature_file(r: &RunResult, file: &str) -> Result> { +fn read_gff3_feature_file(_r: &RunResult, file: &str) -> Result> { let f = File::open(file).chain_err(|| format!("Unable to open {}", file))?; let f = BufReader::new(f); @@ -238,37 +237,37 @@ fn run() -> Result<()> { let mut features_tracks = features_tracks.unwrap(); - if args.is_present("no-direct") { result.sds.retain(|sd| sd.reversed) } - if args.is_present("no-reversed") { result.sds.retain(|sd| !sd.reversed) } - if args.is_present("no-untranslated") { result.sds.retain(|sd| sd.translated) } - if args.is_present("no-translated") { result.sds.retain(|sd| !sd.translated) } + if args.is_present("no-direct") { result.sds.retain(|sd| sd.reversed) } + if args.is_present("no-reversed") { result.sds.retain(|sd| !sd.reversed) } + if args.is_present("no-uncomplemented") { result.sds.retain(|sd| sd.complemented) } + if args.is_present("no-complemented") { result.sds.retain(|sd| !sd.complemented) } if args.is_present("filter_duplications") 
{filter_sds_in_features(&mut result, &features_tracks, value_t!(args, "filter_duplications", usize).unwrap());} if args.is_present("filter_features") {filter_features_in_sds(&mut result, &mut features_tracks, value_t!(args, "filter_features", usize).unwrap());} let settings = Settings { - out_file: out_file, + out_file: out_file, - min_length: value_t!(args, "min_length", usize).unwrap(), - min_identity: value_t!(args, "min_identity", f32).unwrap(), - filter_direct: args.is_present("no-direct"), - filter_non_translated: args.is_present("no-untranslated"), - filter_reversed: args.is_present("no-reversed"), - filter_translated: args.is_present("no-translated"), + min_length: value_t!(args, "min_length", usize).unwrap(), + min_identity: value_t!(args, "min_identity", f32).unwrap(), + filter_direct: args.is_present("no-direct"), + filter_non_complemented: args.is_present("no-uncomplemented"), + filter_reversed: args.is_present("no-reversed"), + filter_complemented: args.is_present("no-complemented"), - size: 200.0, - thickness: 1.0, - color1: "#ff5b00".to_owned(), - color2: "#00b2ae".to_owned(), + size: 200.0, + thickness: 1.0, + color1: "#ff5b00".to_owned(), + color2: "#00b2ae".to_owned(), - feature_tracks: features_tracks, + feature_tracks: features_tracks, }; result.sds = result.sds - .into_iter() - .filter(|sd| !(settings.filter_direct && !sd.reversed)) - .filter(|sd| !(settings.filter_reversed && sd.reversed)) - .filter(|sd| !(settings.filter_non_translated && !sd.translated)) - .filter(|sd| !(settings.filter_translated && sd.translated)) + .into_iter() + .filter(|sd| !(settings.filter_direct && !sd.reversed)) + .filter(|sd| !(settings.filter_reversed && sd.reversed)) + .filter(|sd| !(settings.filter_non_complemented && !sd.complemented)) + .filter(|sd| !(settings.filter_complemented && sd.complemented)) .filter(|sd| sd.length >= settings.min_length) .filter(|sd| sd.identity >= settings.min_identity) .collect(); diff --git a/src/bin/asgart.rs b/src/bin/asgart.rs 
index 763a675..e65da19 100644 --- a/src/bin/asgart.rs +++ b/src/bin/asgart.rs @@ -52,7 +52,7 @@ struct Strand { fn prepare_data(strand1_file: &str, strand2_file: &str, reverse: bool, - translate: bool, + complement: bool, skip_masked: bool, trim: Option<(usize, usize)>) -> Result { @@ -124,7 +124,7 @@ fn prepare_data(strand1_file: &str, ) }; - if translate { strand1.data = utils::translated(&*strand1.data); } + if complement { strand1.data = utils::complemented(&*strand1.data); } if reverse { strand1.data.reverse(); } strand2.data.push(b'$'); @@ -227,7 +227,7 @@ fn merge(x: &SD, y: &SD) -> SD { length: cmp::min(lsize, rsize), identity: x.identity, reversed: x.reversed, - translated: x.translated, + complemented: x.complemented, } } @@ -289,7 +289,7 @@ fn run() -> Result<()> { skip_masked: bool, reverse: bool, - translate: bool, + complement: bool, interlaced: bool, trim: Vec, @@ -315,7 +315,7 @@ fn run() -> Result<()> { skip_masked: args.is_present("skipmasked"), reverse: args.is_present("reverse"), - translate: args.is_present("translate"), + complement: args.is_present("complement"), interlaced: args.is_present("interlaced"), trim: values_t!(args, "trim", usize).unwrap_or_else(|_| Vec::new()), @@ -341,7 +341,7 @@ fn run() -> Result<()> { settings.kmer_size, settings.gap_size, if settings.reverse {"r"} else {""}, - if settings.translate {"t"} else {""}, + if settings.complement {"c"} else {""}, ) } else { settings.out @@ -353,7 +353,7 @@ fn run() -> Result<()> { trace!("Max gap size {}", settings.gap_size); trace!("Output file {}", &out_file); trace!("Reverse 2nd strand {}", settings.reverse); - trace!("Translate 2nd strand {}", settings.translate); + trace!("Complement 2nd strand {}", settings.complement); trace!("Interlaced SD {}", settings.interlaced); trace!("Skipping soft-masked {}", settings.skip_masked); trace!("Min. 
length {}", settings.min_duplication_length); @@ -381,7 +381,7 @@ fn run() -> Result<()> { max_cardinality: settings.max_cardinality, reverse: settings.reverse, - translate: settings.translate, + complement: settings.complement, interlaced: settings.interlaced, skip_masked: settings.skip_masked, @@ -419,7 +419,7 @@ fn search_duplications( strand1_file, strand2_file, settings.reverse, - settings.translate, + settings.complement, settings.skip_masked, trim)?; @@ -492,12 +492,12 @@ fn search_duplications( let mut result = rx.iter().fold(Vec::new(), |mut a, b| { a.extend(b.iter().map(|sd| { SD { - left: if !settings.reverse {sd.left} else {strand1.data.len() - sd.left - sd.length - 1}, - right: sd.right, - length: sd.length, - identity: sd.identity, - reversed: settings.reverse, - translated: settings.translate, + left: if !settings.reverse {sd.left} else {strand1.data.len() - sd.left - sd.length - 1}, + right: sd.right, + length: sd.length, + identity: sd.identity, + reversed: settings.reverse, + complemented: settings.complement, } })); a diff --git a/src/bin/asgart.yaml b/src/bin/asgart.yaml index 88b79bb..1664419 100644 --- a/src/bin/asgart.yaml +++ b/src/bin/asgart.yaml @@ -9,23 +9,29 @@ args: help: Second strand, FASTA format required: true index: 2 + - probe_size: help: Length of the probing k-mers - required: true - index: 3 + long: probe-size + short: k + default_value: "20" + takes_value: true + - max_gap: help: Maximum length of a gap - required: true - index: 4 + long: gap-size + short: g + default_value: "100" + takes_value: true - reverse: short: R long: reverse help: Reverse the second strand - - translate: - short: T - long: translate - help: Translate the second strand + - complement: + short: C + long: complement + help: Complement the second strand - skipmasked: long: skip-masked short: S diff --git a/src/bin/plot.yaml b/src/bin/plot.yaml index 91dd5ca..df33a0f 100644 --- a/src/bin/plot.yaml +++ b/src/bin/plot.yaml @@ -35,14 +35,14 @@ args: long: 
no-reversed short: r help: filters out reversed duplications - - no-untranslated: - long: no-untranslated + - no-uncomplemented: + long: no-uncomplemented short: u - help: filters out non translated duplications - - no-translated: - long: no-translated + help: filters out non complemented duplications + - no-complemented: + long: no-complemented short: t - help: filters out translated duplications + help: filters out complemented duplications - features: long: features help: additional gene tracks to plot diff --git a/src/plot/chord_plot.rs b/src/plot/chord_plot.rs index 0645de9..5aab7c7 100644 --- a/src/plot/chord_plot.rs +++ b/src/plot/chord_plot.rs @@ -2,7 +2,6 @@ extern crate rand; use std::io::prelude::*; use std::fs::File; -use std::path::Path; use std::collections::HashMap; use std::f64::consts::PI; use ::plot::*; @@ -156,10 +155,7 @@ impl ChordPlotter { for sd in self.result.sds .iter() - .filter(|&sd| sd.identity >= self.settings.min_identity) - .filter(|&sd| !(self.settings.filter_reversed && sd.reversed)) - .filter(|&sd| !(self.settings.filter_translated && sd.translated)) - .filter(|&sd| self.inter_sd(sd) && sd.length >= self.settings.min_length) { + .filter(|&sd| self.inter_sd(sd)) { let (left, right) = (sd.left as i64, sd.right as i64); let t11 = self.angle(left as f64); @@ -209,10 +205,7 @@ impl ChordPlotter { for sd in self.result.sds .iter() - .filter(|&sd| sd.identity >= self.settings.min_identity) - .filter(|&sd| !(self.settings.filter_reversed && sd.reversed)) - .filter(|&sd| !(self.settings.filter_translated && sd.translated)) - .filter(|&sd| self.intra_sd(sd) && sd.length >= self.settings.min_length) { + .filter(|&sd| self.intra_sd(sd)) { let (left, right) = (sd.left as i64, sd.right as i64); let t11 = self.angle(left as f64); diff --git a/src/plot/flat_plot.rs b/src/plot/flat_plot.rs index 7637a2f..15f9438 100644 --- a/src/plot/flat_plot.rs +++ b/src/plot/flat_plot.rs @@ -5,7 +5,6 @@ use separator::Separatable; use std::cmp; use 
std::io::prelude::*; use std::fs::File; -use ::structs::*; use ::plot::*; const CHR_WIDTH: f64 = 4.0; diff --git a/src/plot/genome_plot.rs b/src/plot/genome_plot.rs index 1b4b25d..5f1a777 100644 --- a/src/plot/genome_plot.rs +++ b/src/plot/genome_plot.rs @@ -2,7 +2,6 @@ use ::structs::*; use std::fs::File; use std::io::Write; use ::plot::{Plotter, Settings}; -use plot::regex::Regex; pub struct GenomePlotter { result: RunResult, diff --git a/src/plot/mod.rs b/src/plot/mod.rs index 47a5cfa..9044258 100644 --- a/src/plot/mod.rs +++ b/src/plot/mod.rs @@ -20,8 +20,8 @@ pub struct Settings { pub min_identity: f32, pub filter_direct: bool, pub filter_reversed: bool, - pub filter_translated: bool, - pub filter_non_translated: bool, + pub filter_complemented: bool, + pub filter_non_complemented: bool, pub feature_tracks: Vec>, } diff --git a/src/structs.rs b/src/structs.rs index 2972a22..c2d78df 100644 --- a/src/structs.rs +++ b/src/structs.rs @@ -12,8 +12,12 @@ pub struct RunSettings { pub min_duplication_length: usize, pub max_cardinality: usize, + #[serde(skip_serializing)] + #[serde(default)] pub reverse: bool, - pub translate: bool, + #[serde(skip_serializing)] + #[serde(default)] + pub complement: bool, pub skip_masked: bool, pub interlaced: bool, @@ -74,7 +78,7 @@ pub struct SD { pub length: usize, pub identity: f32, pub reversed: bool, - pub translated: bool, + pub complemented: bool, } impl SD { diff --git a/src/utils.rs b/src/utils.rs index 3823045..eab86b5 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,4 +1,4 @@ -pub fn translate_nucleotide(n: u8) -> u8 { +pub fn complement_nucleotide(n: u8) -> u8 { match n { b'A' => b'T', b'T' => b'A', @@ -16,6 +16,6 @@ pub fn translate_nucleotide(n: u8) -> u8 { } } -pub fn translated(text: &[u8]) -> Vec { - text.iter().map(|x| translate_nucleotide(*x)).collect::>() +pub fn complemented(text: &[u8]) -> Vec { + text.iter().map(|x| complement_nucleotide(*x)).collect::>() }