Skip to content

Commit

Permalink
add subcommand grep
Browse files Browse the repository at this point in the history
  • Loading branch information
sharkLoc committed Mar 28, 2024
1 parent 60dac6e commit 44ded15
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 5 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fakit"
version = "0.3.2"
version = "0.3.3"
edition = "2021"
authors = ["sharkLoc <mmtinfo@163.com>"]
rust-version = "1.65.0"
Expand All @@ -27,3 +27,5 @@ regex = "1.9.5"
rust-htslib = "0.40.2"
xz2 = "0.1.7"

[profile.release]
strip = true
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ cargo install fakit
```bash
Fakit: A simple program for fasta file manipulation

Version: 0.3.2
Version: 0.3.3

Authors: sharkLoc <mmtinfo@163.com>
Source code: https://github.com/sharkLoc/fakit.git
Expand Down Expand Up @@ -47,6 +47,7 @@ Commands:
rename rename sequence id in fasta file
reverse get a reverse-complement of fasta file [aliases: rev]
window stat dna fasta gc content by sliding windows [aliases: slide]
grep grep fasta sequences by name/seq
seq convert all bases to lower/upper case, filter by length
sort sort fasta file by name/seq/gc/length
search search subsequences/motifs from fasta file
Expand Down
27 changes: 24 additions & 3 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use clap::{Parser,value_parser};
#[command(
name = "Fakit",
author = "sharkLoc",
version = "0.3.2",
version = "0.3.3",
about = "A simple program for fasta file manipulation",
long_about = None,
next_line_help = false,
Expand Down Expand Up @@ -173,6 +173,27 @@ pub enum Subcli {
#[arg(short = 'o', long = "out",verbatim_doc_comment, value_name = "str")]
output: Option<String>,
},
/// grep fasta sequences by name/seq
grep {
    /// input fasta file, or read from stdin
    input: Option<String>,
    /// specify regex pattern/motif, e.g., -p "ATC{2,}" or -p ATCCG, search multiple pattern/motif, -p "ATCCG|GCTAA"
    /// when searching by sequence name, the sequence prefix ">" is not included in the header.
    #[arg(short = 'p', long = "pattern",verbatim_doc_comment, value_name = "str")]
    pat: String,
    /// grep sequences by full name
    #[arg(short = 'n', long = "by-name", help_heading = Some("FLAGS"))]
    name: bool,
    /// grep sequences by sequence
    #[arg(short = 's', long = "by-seq", help_heading = Some("FLAGS"))]
    seq: bool,
    /// ignore case when matching the pattern
    #[arg(short = 'i', long = "ignore-case", help_heading = Some("FLAGS"))]
    ignore: bool,
    /// output search result file name, or write to stdout, file name ending in .gz/.bz2/.xz will be compressed automatically
    #[arg(short = 'o', long = "out", value_name = "str" )]
    output: Option<String>,
},
/// convert all bases to lower/upper case, filter by length
seq {
/// input fasta file, or read from stdin
Expand Down Expand Up @@ -225,11 +246,11 @@ pub enum Subcli {
},
/// search subsequences/motifs from fasta file
search {
/// input fasta[.gz] file, or read from stdin
/// input fasta file, or read from stdin
input: Option<String>,
/// specify uppercase pattern/motif, e.g., -p "ATC{2,}" or -p ATCCG
///search multiple pattern/motif, -p "ATCCG|GCTAA"
#[arg(short = 'p', long = "pattern",verbatim_doc_comment)]
#[arg(short = 'p', long = "pattern",verbatim_doc_comment, value_name = "str")]
pat: String,
/// if specified, show header in result
#[arg(short = 'H', long = "header", help_heading = Some("FLAGS"))]
Expand Down
85 changes: 85 additions & 0 deletions src/grep.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use anyhow::Error;
use regex::RegexBuilder;
use bio::io::fasta;
use std::time::Instant;
use log::*;
use crate::utils::*;
use crate::wrap::*;


pub fn grep_fasta(
file: &Option<&str>,
out: &Option<&str>,
pat: &str,
case: bool,
by_id: bool,
by_seq: bool,
line_width: usize,
compression_level: u32,
) -> Result<(),Error> {
let start = Instant::now();
if let Some(file) = file {
info!("reading from file: {}",file);
} else {
info!("reading from stdin");
}

let mut n = 0usize;
if by_id {
n += 1;
}
if by_seq {
n += 1;
}
match n {
1 => { () }
0 => {
error!("please specifiy one of the flags: -n or -s");
std::process::exit(1);
}
_ => {
error!("only one of the flags -n (--by-name) or -s (--by-seq), is allowed");
std::process::exit(1);
}
}
info!("regex pattern is: {}",pat);
let mut n = 0usize;

let re = RegexBuilder::new(pat)
.case_insensitive(case)
.unicode(true)
.build()?;

let mut fo = file_writer(out, compression_level).map(fasta::Writer::new)?;

if by_seq {
let fp = file_reader(file).map(fasta::Reader::new)?;
for rec in fp.records().flatten(){
let seq_str = std::str::from_utf8(rec.seq())?;
if let Some(_) = re.captures(seq_str) {
n += 1;
let seq_new = wrap_fasta(rec.seq(), line_width)?;
fo.write(rec.id(), rec.desc(), seq_new.as_slice())?;
}
}
}
if by_id {
let fp = file_reader(file).map(fasta::Reader::new)?;
for rec in fp.records().flatten(){
let name = if let Some(desc) = rec.desc() {
format!("{} {}",rec.id(), desc)
} else {
rec.id().to_owned()
};
if let Some(_) = re.captures(&name) {
n += 1;
let seq_new = wrap_fasta(rec.seq(), line_width)?;
fo.write(rec.id(), rec.desc(), seq_new.as_slice())?;
}
}
}
fo.flush()?;
info!("total match sequences number: {}",n);
info!("time elapsed is: {:?}",start.elapsed());
Ok(())
}
17 changes: 17 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ mod reverse;
use reverse::*;
mod search;
use search::*;
mod grep;
use grep::*;
mod seq;
use seq::*;
mod shuffle;
Expand Down Expand Up @@ -189,6 +191,21 @@ fn main() -> Result<(), Error> {
}
}
}
Subcli::grep { input, pat, name, seq, ignore, output } => {
    // `as_deref` borrows each Option<String> as the Option<&str> that
    // grep_fasta takes, so the Some/None combinations need no separate branches.
    grep_fasta(
        &input.as_deref(),
        &output.as_deref(),
        &pat,
        ignore,
        name,
        seq,
        args.width,
        args.compression_level,
    )?;
}
Subcli::seq { input, lower, upper, min, max, gc_min, gc_max, out } => {
if let Some(input) = input {
if let Some(out) = out {
Expand Down

0 comments on commit 44ded15

Please sign in to comment.