Skip to content

Commit

Permalink
add fasta file as optional input
Browse files Browse the repository at this point in the history
  • Loading branch information
angelovangel committed Dec 1, 2021
1 parent eda6749 commit 258c0d9
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 16 deletions.
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

# fastkmers

A simple program for getting k-mer counts from a fastq file, written in Rust.
A simple program for getting k-mer counts from a fastq/fasta file, written in Rust.

## Description

This command line program takes a fastq file as input (can be `.gz` also) and outputs the counts of [k-mers](https://en.wikipedia.org/wiki/K-mer) of a specified length. It is implemented using hash table and a simple algortihm but is still reasonably fast. The maximum supported k-mer size is 21.
This command line program takes a fastq/fasta file as input and outputs the counts of [k-mers](https://en.wikipedia.org/wiki/K-mer) of a specified length. It is implemented using hash tables and a simple algortihm but is still reasonably fast. The maximum supported k-mer size is 21.

## Install

Expand All @@ -26,15 +26,16 @@ The executable file `fastkmers` is now under `./target/release/`


```bash
# Make sure the executable is in your path
# check available options

# run it like this:
./target/release/fastkmers -k 4 /path/to/fastq/file.fastq.gz
fastkmers -h

# to get 4-mer counts and a summary
fastkmers -k 4 -s file.fastq.gz

# output json
fastkmers -k 4 -j file.fastq.gz
# output json, input fasta
fastkmers -k 4 -a -j file.fasta

```

Expand Down
48 changes: 40 additions & 8 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
use bio::io::fastq;
use bio::io::{fastq, fasta};
use flate2::bufread;
use std::{fs, str, io::BufReader, collections::HashMap, process};
use serde::{Deserialize, Serialize};

extern crate clap;
use clap::{App, Arg};


// own functions
// fastq reader, file as arg, decide based on extension
fn get_fastq_reader(path: &String) -> Box<dyn (::std::io::Read)> {
Expand All @@ -27,7 +29,7 @@ struct Kmer {
fn main() {
let matches = App::new("fastkmers")
.author("https://github.com/angelovangel")
.about("get k-mer counts from a fastq file")
.about("get k-mer counts and multiplicity frequency from a fastq file")

.arg(Arg::with_name("kmer")
.required(true)
Expand All @@ -41,7 +43,14 @@ fn main() {
.long("summary")
.short("s")
.takes_value(false)
.help("display fastq file summary information at the end of the output (optional)"))
.help("display fastq file summary information at the end of the output (optional)")).

arg(Arg::with_name("fasta")
.required(false)
.long("fasta")
.short("a")
.takes_value(false)
.help("input is fasta file (default is fastq)"))

.arg(Arg::with_name("json")
.conflicts_with("summary")
Expand All @@ -56,15 +65,15 @@ fn main() {
.short("f")
.required(false)
.takes_value(false)
.help("Output a frequency table of multiplicity versus number of occurence, for building k-mer spectra, see https://en.wikipedia.org/wiki/K-mer (optional)"))
.help("Output a frequency table (multiplicity versus occurence), for building k-mer spectra, see https://en.wikipedia.org/wiki/K-mer (optional)"))

.arg(Arg::with_name("INPUT")
.help("Path to a fastq file")
.help("Path to a fastq/fasta file")
.required(true)
.index(1)).get_matches();

let infile = matches.value_of("INPUT").unwrap().to_string();
let reader = fastq::Reader::new(get_fastq_reader(&infile));
// fastq or fasta
//let mut record = fastq::Record::new();
let mut kmer_counts: HashMap<String,i32> = HashMap::new();

Expand All @@ -77,6 +86,9 @@ fn main() {

let mut reads: i64 = 0;
let mut kmers: i64 = 0;

if matches.is_present("fasta") {
let reader = fasta::Reader::new(get_fastq_reader(&infile));

for result in reader.records(){
reads += 1;
Expand All @@ -94,6 +106,26 @@ fn main() {
}

}
} else {
let reader = fastq::Reader::new(get_fastq_reader(&infile));

for result in reader.records(){
reads += 1;

let record = result.expect("Error");
let seq = record.seq();
let seq_str = str::from_utf8(seq).unwrap().to_string();
//println!("{:?}", seq);
for c in 0..seq_str.len() - k + 1 {
let subseq = &seq_str[c..c + k];

kmers += subseq.len() as i64;

*kmer_counts.entry(subseq.to_string() ).or_insert(0) += 1;
}
}
}

if matches.is_present("json") {
let j = serde_json::to_string(&kmer_counts).unwrap();
println!("{}", j);
Expand Down Expand Up @@ -126,8 +158,8 @@ fn main() {
if matches.is_present("summary") {
let unique_kmers = kmer_counts.len();
println!("---");
println!("reads \t total_k-mers \t unique_k-mers");
println!("{} \t {} \t {}", reads, kmers, unique_kmers);
println!("reads\ttotal_k-mers\tunique_k-mers");
println!("{}\t{}\t{}", reads, kmers, unique_kmers);
println!("---");
}

Expand Down
18 changes: 16 additions & 2 deletions tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::process::Command; // Run programs


#[test]
fn test_kmer_output() -> Result<(), Box<dyn std::error::Error>> {
fn test_kmer_fastq() -> Result<(), Box<dyn std::error::Error>> {

let mut cmd = Command::cargo_bin("fastkmers")?;
cmd.arg("-k 3").arg("tests/test.fastq");
Expand All @@ -17,7 +17,21 @@ fn test_kmer_output() -> Result<(), Box<dyn std::error::Error>> {
}

#[test]
fn test_freq_output() -> Result<(), Box<dyn std::error::Error>> {
fn test_kmer_fasta() -> Result<(), Box<dyn std::error::Error>> {

let mut cmd = Command::cargo_bin("fastkmers")?;
cmd.arg("-k 3").arg("--fasta").arg("tests/test.fasta");

cmd.assert()
.success()
.stdout(predicate::str::contains("ATC\t10"));

Ok(())
}


#[test]
fn test_freq() -> Result<(), Box<dyn std::error::Error>> {

let mut cmd = Command::cargo_bin("fastkmers")?;

Expand Down
16 changes: 16 additions & 0 deletions tests/test.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
>NC_006347.1_0/1
AACGTTGGCTCGTCTGACAAGCAAAGGTAAGAACAAAGGTGCCAAATATCGTAAAGAAAA
ACGTGACATGGCGTCCAACCGTATGCAGGAACTGGAAGATCAGGAAATGGCAGAAAGCAA
GGTACT
>NC_006347.1_1/1
CGCAAAGCCAGCTTCGCGAATTGATCCGCCAGAGTTTCAATCATCCTTCCATTTATGTAT
GGGGGCTTCACAATGAAGTATATCAACCACATGAGTATACAGCTGCATTGACCCGTTCTC
TCCATG
>NC_006347.1_2/1
TTTGTTCGTTCGATCCGAAGACCGAAACATTTATTGATTTCGATCCGGATAATACAATAT
TGCCCAACCGGGTCATCTACTCCATTGAACAAGACAAGACCGGTGACTTCTGGATATCCA
GTAATG
>NC_006347.1_3/1
ATATTACATATAAACAAACGAAAAGTTCCGCAATTACAAAGATTTATGATTCAACTACCC
TTTCAACCCGGAAGACGAAGGCAAGCGATAGACGTACTTTTTCCTCTTTTCTCGCACAAA
GATATG

0 comments on commit 258c0d9

Please sign in to comment.