优化代码结构

eric9n · Apr 24, 2024 · da60162 · da60162
1 parent ed94634
commit da60162
Show file tree

Hide file tree

Showing 12 changed files with 298 additions and 699 deletions.
diff --git a/kr2r/src/args.rs b/kr2r/src/args.rs
@@ -15,16 +15,11 @@ pub const ONEGB: u64 = 1073741824;
 pub struct Build {
     /// ncbi library fna database directory
     #[arg(long = "db", required = true)]
-    pub source: PathBuf,
-
-    /// Kraken 2 hash table filename, default = $database/hash.k2d
-    #[clap(short = 'H')]
-    pub hashtable_filename: Option<PathBuf>,
-
-    /// Kraken 2 options filename, default = $database/opts.k2d
-    #[clap(short = 'o')]
-    pub options_filename: Option<PathBuf>,
+    pub database: PathBuf,
 
+    // /// Kraken 2 options filename, default = $database/opts.k2d
+    // #[clap(short = 'o')]
+    // pub options_filename: Option<PathBuf>,
     /// 包含原始配置
     #[clap(flatten)]
     pub klmt: KLMTArgs,
@@ -41,9 +36,9 @@ pub struct Build {
 #[derive(Parser, Debug, Clone)]
 #[clap(version, about = "taxonomy")]
 pub struct Taxo {
-    /// Kraken 2 taxonomy filename, default = $database/taxo.k2d
-    #[clap(short = 't')]
-    pub taxonomy_filename: Option<PathBuf>,
+    // /// Kraken 2 taxonomy filename, default = $database/taxo.k2d
+    // #[clap(short = 't')]
+    // pub taxonomy_filename: Option<PathBuf>,
 
     // #[clap(short = 'm', required = true)]
     // pub id_to_taxon_map_filename: PathBuf,
@@ -75,7 +70,11 @@ const BATCH_SIZE: usize = 8 * 1024 * 1024;
 pub struct ClassifyArgs {
     /// database hash chunk directory and other files
     #[clap(long)]
-    pub hash_dir: PathBuf,
+    pub k2d_dir: PathBuf,
+
+    /// Enables use of a Kraken 2 compatible shared database. Default is false.
+    #[clap(long, default_value_t = false)]
+    pub kraken_db_type: bool,
 
     // /// The file path for the Kraken 2 options.
     // #[clap(short = 'o', long = "options-filename", value_parser, required = true)]

diff --git a/kr2r/src/bin/annotate.rs b/kr2r/src/bin/annotate.rs
@@ -1,4 +1,4 @@
-use clap::Parser;
+use clap::{Parser, ValueEnum};
 use kr2r::compact_hash::{CHPage, Compact, HashConfig, K2Compact, Row, Slot};
 use kr2r::utils::find_and_sort_files;
 // use std::collections::HashMap;
@@ -13,6 +13,11 @@ use std::time::Instant;
 // 定义每批次处理的 Slot 数量
 pub const BATCH_SIZE: usize = 8 * 1024 * 1024;
 
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
+enum DbType {
+    Kraken,
+    Squid,
+}
 /// Command line arguments for the splitr program.
 ///
 /// This structure defines the command line arguments that are accepted by the splitr program.
@@ -26,13 +31,11 @@ pub const BATCH_SIZE: usize = 8 * 1024 * 1024;
 pub struct Args {
     /// database hash chunk directory and other files
     #[clap(long)]
-    pub hash_dir: PathBuf,
-    // /// The file path for the Kraken 2 index.
-    // #[clap(short = 'H', long = "index-filename", value_parser, required = true)]
-    // index_filename: PathBuf,
-    /// The file path for the Kraken 2 options.
-    // #[clap(short = 'o', long = "options-filename", value_parser, required = true)]
-    // options_filename: String,
+    pub k2d_dir: PathBuf,
+
+    /// Enables use of a Kraken 2 compatible shared database. Default is false.
+    #[clap(long, default_value_t = false)]
+    pub kraken_db_type: bool,
 
     /// chunk directory
     #[clap(long)]
@@ -97,9 +100,10 @@ fn process_batch<R, K>(
     chtm: &K,
     chunk_dir: PathBuf,
     batch_size: usize,
+    kraken_db_type: bool,
 ) -> std::io::Result<()>
 where
-    K: K2Compact<u32> + Send,
+    K: K2Compact + Send,
     R: Read + Send,
 {
     let slot_size = std::mem::size_of::<Slot<u64>>();
@@ -129,7 +133,7 @@ where
             .into_par_iter()
             .filter_map(|slot| {
                 let indx = slot.idx & idx_mask;
-                let taxid = chtm.get_from_page(indx, slot.value);
+                let taxid = chtm.get_from_page(indx, slot.value, kraken_db_type);
 
                 if taxid > 0 {
                     let kmer_id = slot.idx >> idx_bits;
@@ -200,7 +204,7 @@ fn process_chunk_file<P: AsRef<Path>>(
 
     let start = Instant::now();
 
-    let config = HashConfig::<u32>::from_hash_header(&args.hash_dir.join("hash_config.k2d"))?;
+    let config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?;
     let parition = hash_files.len();
     let chtm = CHPage::from(
         config,
@@ -211,15 +215,21 @@ fn process_chunk_file<P: AsRef<Path>>(
     let duration = start.elapsed();
     // 打印运行时间
     println!("load table took: {:?}", duration);
-    process_batch(&mut reader, &chtm, args.chunk_dir.clone(), args.batch_size)?;
+    process_batch(
+        &mut reader,
+        &chtm,
+        args.chunk_dir.clone(),
+        args.batch_size,
+        args.kraken_db_type,
+    )?;
 
     Ok(())
 }
 
 pub fn run(args: Args) -> Result<()> {
     let chunk_files = find_and_sort_files(&args.chunk_dir, "sample", ".k2")?;
 
-    let hash_files = find_and_sort_files(&args.hash_dir, "hash", ".k2d")?;
+    let hash_files = find_and_sort_files(&args.k2d_dir, "hash", ".k2d")?;
 
     // 开始计时
     let start = Instant::now();

diff --git a/kr2r/src/bin/build_k2_db.rs b/kr2r/src/bin/build_k2_db.rs
@@ -1,13 +1,10 @@
 // 使用时需要引用模块路径
 use clap::Parser;
-use kr2r::args::{Build, Taxo, ONEGB, U32MAXPLUS};
-use kr2r::compact_hash::{CHTableMut, HashConfig};
+use kr2r::args::{parse_size, Build, Taxo};
+use kr2r::compact_hash::HashConfig;
 use kr2r::db::{
-    convert_fna_to_k2_format,
-    generate_taxonomy,
-    get_bits_for_taxid,
-    process_k2file,
-    // process_k2file1,
+    convert_fna_to_k2_format, generate_taxonomy, get_bits_for_taxid, process_k2file,
+    write_config_to_file,
 };
 use kr2r::utils::{
     create_partition_files, create_partition_writers, find_library_fna_files, format_bytes,
@@ -24,21 +21,19 @@ pub struct Args {
     #[clap(flatten)]
     pub build: Build,
 
+    /// database hash chunk directory and other files
+    #[clap(long)]
+    pub k2d_dir: Option<PathBuf>,
+
     #[clap(flatten)]
     pub taxo: Taxo,
 
-    // // /// Name of Kraken 2 database
-    // // #[arg(short, long = "db")]
-    // // database: PathBuf,
-    // #[arg(short = 'c', long, required = true)]
-    // pub required_capacity: u64,
-    /// chunk directory
+    #[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")]
+    pub hash_capacity: usize,
+
+    /// chunk temp directory
     #[clap(long)]
     pub chunk_dir: PathBuf,
-
-    /// chunk size 1-4(GB) [1073741824-4294967295] default: 1GB
-    #[clap(long, value_parser = clap::value_parser!(u64).range(ONEGB..U32MAXPLUS + 1), default_value_t = ONEGB)]
-    pub chunk_size: u64,
 }
 
 pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::error::Error>> {
@@ -48,19 +43,19 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro
     let id_to_taxon_map_filename = args
         .taxo
         .id_to_taxon_map_filename
-        .unwrap_or(args.build.source.join("seqid2taxid.map"));
+        .unwrap_or(args.build.database.join("seqid2taxid.map"));
 
     let id_to_taxon_map = read_id_to_taxon_map(&id_to_taxon_map_filename)?;
 
-    let taxonomy_filename = args
-        .taxo
-        .taxonomy_filename
-        .unwrap_or(args.build.source.join("taxo.k2d"));
+    let source: PathBuf = args.build.database.clone();
+    let k2d_dir = args.k2d_dir.unwrap_or(source.clone());
+
+    let taxonomy_filename = k2d_dir.join("taxo.k2d");
 
     let ncbi_taxonomy_directory = args
         .taxo
         .ncbi_taxonomy_directory
-        .unwrap_or(args.build.source.join("taxonomy"));
+        .unwrap_or(args.build.database.join("taxonomy"));
 
     let taxonomy = generate_taxonomy(
         &ncbi_taxonomy_directory,
@@ -75,14 +70,13 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro
     .expect("more bits required for storing taxid");
 
     let capacity = required_capacity;
-    let hash_config = HashConfig::<u32>::new(capacity, value_bits, 0, 0, 0);
+    let partition = (capacity + args.hash_capacity - 1) / args.hash_capacity;
+    let hash_config = HashConfig::new(capacity, value_bits, 0, partition, args.hash_capacity);
 
     // 开始计时
     let start = Instant::now();
 
-    let chunk_size = args.chunk_size as usize;
-
-    let partition = (capacity + chunk_size - 1) / chunk_size;
+    let chunk_size = args.hash_capacity as usize;
 
     if partition >= file_num_limit {
         panic!("Exceeds File Number Limit");
@@ -93,12 +87,7 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro
 
     println!("chunk_size {}", format_bytes(chunk_size as f64));
 
-    let source: PathBuf = args.build.source.clone();
-    let fna_files = if source.is_file() {
-        vec![source.to_string_lossy().to_string()]
-    } else {
-        find_library_fna_files(args.build.source)
-    };
+    let fna_files = find_library_fna_files(args.build.database);
 
     for fna_file in &fna_files {
         println!("convert fna file {:?}", fna_file);
@@ -114,32 +103,44 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro
         );
     }
 
-    let hash_filename = args
-        .build
-        .hashtable_filename
-        .unwrap_or(source.join("hash.k2d"))
-        .clone();
+    let hash_filename = source.join("hash_config.k2d");
     let partition = chunk_files.len();
-    for i in 0..partition {
+    let mut size: u64 = 0;
+
+    for i in 1..=partition {
         // 计算持续时间
-        // process_k2file1(hash_config, &chunk_files[i], &taxonomy, chunk_size, i)?;
+        let count = process_k2file(
+            hash_config,
+            &k2d_dir,
+            &chunk_files[i - 1],
+            &taxonomy,
+            chunk_size,
+            i,
+        )?;
+        size += count as u64;
         let duration = start.elapsed();
         println!(
             "process chunk file {:?}/{:}: duration: {:?}",
             i, partition, duration
         );
-        let mut chtm = CHTableMut::new(&hash_filename, hash_config, i, chunk_size)?;
-        process_k2file(&chunk_files[i], &mut chtm, &taxonomy)?;
     }
+
+    write_config_to_file(
+        &hash_filename,
+        partition as u64,
+        args.hash_capacity as u64,
+        capacity as u64,
+        size,
+        32 - hash_config.value_bits as u64,
+        hash_config.value_bits as u64,
+    )?;
+
     // 计算持续时间
     let duration = start.elapsed();
     // 打印运行时间
     println!("build k2 db took: {:?}", duration);
 
-    let options_filename = args
-        .build
-        .options_filename
-        .unwrap_or(source.clone().join("opts.k2d"));
+    let options_filename = k2d_dir.join("opts.k2d");
     let idx_opts = IndexOptions::from_meros(meros);
     idx_opts.write_to_file(options_filename)?;
 

diff --git a/kr2r/src/bin/estimate_capacity.rs b/kr2r/src/bin/estimate_capacity.rs
@@ -21,7 +21,7 @@ use std::path::{Path, PathBuf};
 pub struct Args {
     /// build database directory or file
     #[arg(long, default_value = "lib")]
-    pub source: PathBuf,
+    pub database: PathBuf,
 
     /// 包含原始配置
     #[clap(flatten)]
@@ -131,16 +131,16 @@ pub fn run(args: Args) -> usize {
     let mut hllp: HyperLogLogPlus<u64, KBuildHasher> =
         HyperLogLogPlus::new(16, KBuildHasher::default()).unwrap();
 
-    let source: PathBuf = args.source.clone();
+    let source: PathBuf = args.database.clone();
     let fna_files = if source.is_file() {
         vec![source.to_string_lossy().to_string()]
     } else {
-        find_library_fna_files(args.source)
+        find_library_fna_files(args.database)
     };
 
     for fna_file in fna_files {
         let args_clone = Args {
-            source: source.clone(),
+            database: source.clone(),
             ..args
         };
         let local_hllp = process_sequence(&fna_file, args_clone);