Commit: bug fix
dagou committed Jun 2, 2024
1 parent b9a7df4 commit 2b45f05
Showing 6 changed files with 58 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/rust.yml
@@ -9,7 +9,7 @@ on:
# On each push to the `release` branch it will create or update a GitHub release, build your app, and upload the artifacts to the release.
env:
CARGO_TERM_COLOR: always
-BINARIES_LIST: 'ncbi KunPeng'
+BINARIES_LIST: 'ncbi kun_peng'
PROJECT_PREFIX: 'kraken2-rust-'

jobs:
40 changes: 20 additions & 20 deletions README.md
@@ -105,8 +105,8 @@ To get started with Squid, you can invoke the tool with the -h or --help option


```bash
-./KunPeng -h
-Usage: KunPeng <COMMAND>
+./kun_peng -h
+Usage: kun_peng <COMMAND>

Commands:
estimate estimate capacity
@@ -128,7 +128,7 @@ This will provide you with an overview of all available commands and options. Fo


```bash
-./KunPeng <COMMAND> -h
+./kun_peng <COMMAND> -h
```
Replace <COMMAND> with the name of the subcommand for which you need detailed help, such as estimate, build, or classify.

@@ -140,9 +140,9 @@ The seqid2taxid tool is a utility within the kr2r package designed to facilitate
### Usage

```bash
-KunPeng seqid2taxid -h
+kun_peng seqid2taxid -h

-Usage: KunPeng seqid2taxid [OPTIONS] --source <SOURCE>
+Usage: kun_peng seqid2taxid [OPTIONS] --source <SOURCE>

Options:
--source <SOURCE> the database directory
@@ -155,7 +155,7 @@ Options:
To use the seqid2taxid tool, execute it with the required and optional arguments as follows:

```bash
-KunPeng seqid2taxid [OPTIONS] --source <SOURCE>
+kun_peng seqid2taxid [OPTIONS] --source <SOURCE>
```

### Required Options
@@ -170,13 +170,13 @@ KunPeng seqid2taxid [OPTIONS] --source <SOURCE>
To run the seqid2taxid tool with a specific source directory:

```bash
-KunPeng seqid2taxid --source /path/to/database
+kun_peng seqid2taxid --source /path/to/database
```

To specify a custom map file path:

```bash
-KunPeng seqid2taxid --source /path/to/database -f /path/to/custom/seqid2taxid.map
+kun_peng seqid2taxid --source /path/to/database -f /path/to/custom/seqid2taxid.map
```

## 2.2 Estimate Capacity Tool
@@ -188,7 +188,7 @@ The estimate_capacity tool is designed for estimating the capacity required for
To use the estimate_capacity tool, execute it from the command line with the desired options:

```bash
-KunPeng estimate_capacity [OPTIONS]
+kun_peng estimate_capacity [OPTIONS]
```

Options
@@ -207,7 +207,7 @@ Options
### Example

```bash
-KunPeng estimate_capacity -k 35 -l 31 --source /data/ncbi/path -p 10 --load-factor 0.7
+kun_peng estimate_capacity -k 35 -l 31 --source /data/ncbi/path -p 10 --load-factor 0.7
```

### Output
@@ -220,10 +220,10 @@ estimate count: 1213069985, required capacity: 1732968825.0, Estimated hash tabl
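The reported required capacity is close to the estimated distinct-minimizer count divided by the load factor. A minimal sketch of that arithmetic (an approximation inferred from the printed numbers, not the tool's internal formula; `required_capacity` is a hypothetical helper, and the exact figure kun_peng prints differs slightly, presumably due to rounding or padding):

```rust
// Approximation: required_capacity ~ estimate_count / load_factor.
// This is inferred from the output shown above, not taken from the
// kun_peng source; treat it as a back-of-the-envelope check only.
fn required_capacity(estimate_count: u64, load_factor: f64) -> f64 {
    (estimate_count as f64 / load_factor).ceil()
}

fn main() {
    let cap = required_capacity(1_213_069_985, 0.7);
    // Lands within about 0.001% of the reported 1732968825.0
    println!("required capacity ~ {cap}");
}
```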
## 2.3 build

```bash
-./KunPeng build -h
+./kun_peng build -h
build database

-Usage: KunPeng build [OPTIONS] --source <SOURCE> -H <HASHTABLE_FILENAME> -o <OPTIONS_FILENAME> -t <TAXONOMY_FILENAME> -m <ID_TO_TAXON_MAP_FILENAME> --ncbi-taxonomy-directory <NCBI_TAXONOMY_DIRECTORY> --required-capacity <REQUIRED_CAPACITY> --chunk-dir <CHUNK_DIR>
+Usage: kun_peng build [OPTIONS] --source <SOURCE> -H <HASHTABLE_FILENAME> -o <OPTIONS_FILENAME> -t <TAXONOMY_FILENAME> -m <ID_TO_TAXON_MAP_FILENAME> --ncbi-taxonomy-directory <NCBI_TAXONOMY_DIRECTORY> --required-capacity <REQUIRED_CAPACITY> --chunk-dir <CHUNK_DIR>

Options:
--source <SOURCE>
@@ -273,10 +273,10 @@ Options:
## 2.4 hashshard
```bash
-./KunPeng hashshard -h
+./kun_peng hashshard -h
split hash file

-Usage: KunPeng hashshard [OPTIONS] --db <DB>
+Usage: kun_peng hashshard [OPTIONS] --db <DB>

Options:
--db <DB> The database directory for the Kraken 2 index. contains index file(hash.k2d opts.k2d taxo.k2d)
@@ -289,10 +289,10 @@ Options:
## 2.5 splitr
```bash
-./KunPeng splitr -h
+./kun_peng splitr -h
Split fast(q/a) file into ranges

-Usage: KunPeng splitr [OPTIONS] --hash-dir <HASH_DIR> --chunk-dir <CHUNK_DIR> [INPUT_FILES]...
+Usage: kun_peng splitr [OPTIONS] --hash-dir <HASH_DIR> --chunk-dir <CHUNK_DIR> [INPUT_FILES]...

Arguments:
[INPUT_FILES]... A list of input file paths (FASTA/FASTQ) to be processed by the classify program
@@ -321,7 +321,7 @@ Options:
```bash
annotate a set of sequences

-Usage: KunPeng annotate [OPTIONS] --hash-dir <HASH_DIR> --chunk-dir <CHUNK_DIR>
+Usage: kun_peng annotate [OPTIONS] --hash-dir <HASH_DIR> --chunk-dir <CHUNK_DIR>

Options:
--hash-dir <HASH_DIR> database hash chunk directory and other files
@@ -337,7 +337,7 @@ Options:
```bash
resolve taxonomy tree

-Usage: KunPeng resolve [OPTIONS] --hash-dir <HASH_DIR> --chunk-dir <CHUNK_DIR>
+Usage: kun_peng resolve [OPTIONS] --hash-dir <HASH_DIR> --chunk-dir <CHUNK_DIR>

Options:
--hash-dir <HASH_DIR>
@@ -368,10 +368,10 @@ Options:
## 2.8 classify
```bash
-./KunPeng classify -h
+./kun_peng classify -h
Integrates 'splitr', 'annotate', and 'resolve' into a unified workflow for sequence classification. classify a set of sequences

-Usage: KunPeng classify [OPTIONS] --hash-dir <HASH_DIR> --chunk-dir <CHUNK_DIR> [INPUT_FILES]...
+Usage: kun_peng classify [OPTIONS] --hash-dir <HASH_DIR> --chunk-dir <CHUNK_DIR> [INPUT_FILES]...

Arguments:
[INPUT_FILES]... A list of input file paths (FASTA/FASTQ) to be processed by the classify program
2 changes: 1 addition & 1 deletion kr2r.sh
@@ -21,4 +21,4 @@ ${DIR}/ncbi --db $DATABASE gen -g bacteria,viral fna
# ${DIR}/Kun estimate_capacity --db $DATABASE -k 35 -l 31

# 4. build
-${DIR}/KunPeng build --db $DATABASE --chunk-dir ${DATABASE_CHUNK}
+${DIR}/kun_peng build --db $DATABASE --chunk-dir ${DATABASE_CHUNK}
2 changes: 1 addition & 1 deletion kr2r/Cargo.toml
@@ -11,7 +11,7 @@ path = "src/bin/inspect.rs"


[[bin]]
-name = "KunPeng"
+name = "kun_peng"
path = "src/bin/kun.rs"

[features]
22 changes: 22 additions & 0 deletions kr2r/src/bin/resolve.rs
@@ -410,13 +410,35 @@ pub fn run(args: Args) -> Result<()> {
});
}

let mut sample_taxon_counts: HashMap<
u64,
kr2r::readcounts::ReadCounts<hyperloglogplus::HyperLogLogPlus<u64, kr2r::KBuildHasher>>,
> = HashMap::new();
thread_taxon_counts.iter().for_each(|entry| {
total_taxon_counts
.entry(*entry.key())
.or_default()
.merge(&entry.value())
.unwrap();
sample_taxon_counts
.entry(*entry.key())
.or_default()
.merge(&entry.value())
.unwrap();
});
if let Some(output) = &args.kraken_output_dir {
let filename = output.join(format!("output_{}.kreport2", i + 1));
report_kraken_style(
filename,
args.report_zero_counts,
args.report_kmer_data,
&taxo,
&sample_taxon_counts,
thread_sequences as u64,
(thread_sequences - thread_classified) as u64,
)?;
}

total_seqs += thread_sequences;
total_unclassified += thread_sequences - thread_classified;
}
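The added code folds each worker's per-taxon counts into both a running global map and a fresh per-sample map via `entry().or_default().merge()`, so each sample's `.kreport2` reflects only its own reads. A simplified, self-contained sketch of that pattern (the real `kr2r::readcounts::ReadCounts` wraps a HyperLogLog sketch for distinct-k-mer estimates; this stand-in just sums counters, and `fold_sample` is a hypothetical helper name):

```rust
use std::collections::HashMap;

// Simplified stand-in for kr2r's ReadCounts: the real merge() unions
// HyperLogLog sketches, here we just add the two counters.
#[derive(Default, Clone, Debug, PartialEq)]
struct ReadCounts {
    n_reads: u64,
    n_kmers: u64,
}

impl ReadCounts {
    fn merge(&mut self, other: &ReadCounts) {
        self.n_reads += other.n_reads;
        self.n_kmers += other.n_kmers;
    }
}

/// Fold one thread's counts into the cumulative `total` map and
/// return a per-sample map covering only this batch, mirroring the
/// total/sample split introduced in the diff above.
fn fold_sample(
    total: &mut HashMap<u64, ReadCounts>,
    thread_counts: &HashMap<u64, ReadCounts>,
) -> HashMap<u64, ReadCounts> {
    let mut sample: HashMap<u64, ReadCounts> = HashMap::new();
    for (taxid, counts) in thread_counts {
        // Same entry().or_default().merge() shape as in resolve.rs:
        total.entry(*taxid).or_default().merge(counts);
        sample.entry(*taxid).or_default().merge(counts);
    }
    sample
}

fn main() {
    let mut total = HashMap::new();
    let t1 = HashMap::from([(9606u64, ReadCounts { n_reads: 3, n_kmers: 90 })]);
    let t2 = HashMap::from([(9606u64, ReadCounts { n_reads: 2, n_kmers: 60 })]);
    let s1 = fold_sample(&mut total, &t1);
    let s2 = fold_sample(&mut total, &t2);
    // s1 and s2 each hold one sample's counts; total accumulates both.
    println!("{:?} {:?} {:?}", s1[&9606], s2[&9606], total[&9606]);
}
```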
13 changes: 13 additions & 0 deletions kr2r/src/readcounts.rs
@@ -75,6 +75,19 @@ where
kmers: T,
}

// impl<T> Clone for ReadCounts<T>
// where
// T: Unionable + Clone,
// {
// fn clone(&self) -> Self {
// ReadCounts {
// n_reads: AtomicU64::new(self.n_reads.load(Ordering::Relaxed)),
// n_kmers: AtomicU64::new(self.n_kmers.load(Ordering::Relaxed)),
// kmers: self.kmers.clone(),
// }
// }
// }

impl<T> ReadCounts<T>
where
T: Unionable,
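The commented-out `Clone` impl above is hand-written because `AtomicU64` does not implement `Clone`, so `#[derive(Clone)]` fails on any struct holding one; the impl instead loads each counter and rebuilds fresh atomics. A minimal runnable illustration of the same technique (field names chosen to echo the snippet; this is not the actual kr2r type):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// AtomicU64 is not Clone, so this struct cannot derive Clone;
// we hand-roll it by snapshotting each counter, exactly the shape
// of the commented-out impl in readcounts.rs.
struct Counts {
    n_reads: AtomicU64,
    n_kmers: AtomicU64,
}

impl Clone for Counts {
    fn clone(&self) -> Self {
        Counts {
            n_reads: AtomicU64::new(self.n_reads.load(Ordering::Relaxed)),
            n_kmers: AtomicU64::new(self.n_kmers.load(Ordering::Relaxed)),
        }
    }
}

fn main() {
    let a = Counts { n_reads: AtomicU64::new(7), n_kmers: AtomicU64::new(42) };
    let b = a.clone();
    // The clone holds independent atomics with the snapshotted values.
    println!("{} {}", b.n_reads.load(Ordering::Relaxed), b.n_kmers.load(Ordering::Relaxed));
}
```

Note the snapshot is not atomic across both fields: a concurrent writer could update `n_kmers` between the two loads, which is usually acceptable for statistics counters.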
