Skip to content

Commit

Permalink
优化代码结构
Browse files Browse the repository at this point in the history
  • Loading branch information
eric committed Apr 24, 2024
1 parent ed94634 commit da60162
Show file tree
Hide file tree
Showing 12 changed files with 298 additions and 699 deletions.
25 changes: 12 additions & 13 deletions kr2r/src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,11 @@ pub const ONEGB: u64 = 1073741824;
pub struct Build {
/// ncbi library fna database directory
#[arg(long = "db", required = true)]
pub source: PathBuf,

/// Kraken 2 hash table filename, default = $database/hash.k2d
#[clap(short = 'H')]
pub hashtable_filename: Option<PathBuf>,

/// Kraken 2 options filename, default = $database/opts.k2d
#[clap(short = 'o')]
pub options_filename: Option<PathBuf>,
pub database: PathBuf,

// /// Kraken 2 options filename, default = $database/opts.k2d
// #[clap(short = 'o')]
// pub options_filename: Option<PathBuf>,
/// 包含原始配置
#[clap(flatten)]
pub klmt: KLMTArgs,
Expand All @@ -41,9 +36,9 @@ pub struct Build {
#[derive(Parser, Debug, Clone)]
#[clap(version, about = "taxonomy")]
pub struct Taxo {
/// Kraken 2 taxonomy filename, default = $database/taxo.k2d
#[clap(short = 't')]
pub taxonomy_filename: Option<PathBuf>,
// /// Kraken 2 taxonomy filename, default = $database/taxo.k2d
// #[clap(short = 't')]
// pub taxonomy_filename: Option<PathBuf>,

// #[clap(short = 'm', required = true)]
// pub id_to_taxon_map_filename: PathBuf,
Expand Down Expand Up @@ -75,7 +70,11 @@ const BATCH_SIZE: usize = 8 * 1024 * 1024;
pub struct ClassifyArgs {
/// database hash chunk directory and other files
#[clap(long)]
pub hash_dir: PathBuf,
pub k2d_dir: PathBuf,

/// Enables use of a Kraken 2 compatible shared database. Default is false.
#[clap(long, default_value_t = false)]
pub kraken_db_type: bool,

// /// The file path for the Kraken 2 options.
// #[clap(short = 'o', long = "options-filename", value_parser, required = true)]
Expand Down
36 changes: 23 additions & 13 deletions kr2r/src/bin/annotate.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use clap::Parser;
use clap::{Parser, ValueEnum};
use kr2r::compact_hash::{CHPage, Compact, HashConfig, K2Compact, Row, Slot};
use kr2r::utils::find_and_sort_files;
// use std::collections::HashMap;
Expand All @@ -13,6 +13,11 @@ use std::time::Instant;
// 定义每批次处理的 Slot 数量
pub const BATCH_SIZE: usize = 8 * 1024 * 1024;

/// Database flavor that can be parsed from a command-line value (derives clap's `ValueEnum`).
///
/// NOTE(review): nothing in the visible hunks references this enum; the argument
/// structs expose a `kraken_db_type: bool` flag instead. Confirm it is still needed.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
enum DbType {
/// Presumably the Kraken 2 compatible database layout; TODO confirm.
Kraken,
/// Presumably this project's own database layout; TODO confirm.
Squid,
}
/// Command line arguments for the splitr program.
///
/// This structure defines the command line arguments that are accepted by the splitr program.
Expand All @@ -26,13 +31,11 @@ pub const BATCH_SIZE: usize = 8 * 1024 * 1024;
pub struct Args {
/// database hash chunk directory and other files
#[clap(long)]
pub hash_dir: PathBuf,
// /// The file path for the Kraken 2 index.
// #[clap(short = 'H', long = "index-filename", value_parser, required = true)]
// index_filename: PathBuf,
/// The file path for the Kraken 2 options.
// #[clap(short = 'o', long = "options-filename", value_parser, required = true)]
// options_filename: String,
pub k2d_dir: PathBuf,

/// Enables use of a Kraken 2 compatible shared database. Default is false.
#[clap(long, default_value_t = false)]
pub kraken_db_type: bool,

/// chunk directory
#[clap(long)]
Expand Down Expand Up @@ -97,9 +100,10 @@ fn process_batch<R, K>(
chtm: &K,
chunk_dir: PathBuf,
batch_size: usize,
kraken_db_type: bool,
) -> std::io::Result<()>
where
K: K2Compact<u32> + Send,
K: K2Compact + Send,
R: Read + Send,
{
let slot_size = std::mem::size_of::<Slot<u64>>();
Expand Down Expand Up @@ -129,7 +133,7 @@ where
.into_par_iter()
.filter_map(|slot| {
let indx = slot.idx & idx_mask;
let taxid = chtm.get_from_page(indx, slot.value);
let taxid = chtm.get_from_page(indx, slot.value, kraken_db_type);

if taxid > 0 {
let kmer_id = slot.idx >> idx_bits;
Expand Down Expand Up @@ -200,7 +204,7 @@ fn process_chunk_file<P: AsRef<Path>>(

let start = Instant::now();

let config = HashConfig::<u32>::from_hash_header(&args.hash_dir.join("hash_config.k2d"))?;
let config = HashConfig::from_hash_header(&args.k2d_dir.join("hash_config.k2d"))?;
let parition = hash_files.len();
let chtm = CHPage::from(
config,
Expand All @@ -211,15 +215,21 @@ fn process_chunk_file<P: AsRef<Path>>(
let duration = start.elapsed();
// 打印运行时间
println!("load table took: {:?}", duration);
process_batch(&mut reader, &chtm, args.chunk_dir.clone(), args.batch_size)?;
process_batch(
&mut reader,
&chtm,
args.chunk_dir.clone(),
args.batch_size,
args.kraken_db_type,
)?;

Ok(())
}

pub fn run(args: Args) -> Result<()> {
let chunk_files = find_and_sort_files(&args.chunk_dir, "sample", ".k2")?;

let hash_files = find_and_sort_files(&args.hash_dir, "hash", ".k2d")?;
let hash_files = find_and_sort_files(&args.k2d_dir, "hash", ".k2d")?;

// 开始计时
let start = Instant::now();
Expand Down
93 changes: 47 additions & 46 deletions kr2r/src/bin/build_k2_db.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
// 使用时需要引用模块路径
use clap::Parser;
use kr2r::args::{Build, Taxo, ONEGB, U32MAXPLUS};
use kr2r::compact_hash::{CHTableMut, HashConfig};
use kr2r::args::{parse_size, Build, Taxo};
use kr2r::compact_hash::HashConfig;
use kr2r::db::{
convert_fna_to_k2_format,
generate_taxonomy,
get_bits_for_taxid,
process_k2file,
// process_k2file1,
convert_fna_to_k2_format, generate_taxonomy, get_bits_for_taxid, process_k2file,
write_config_to_file,
};
use kr2r::utils::{
create_partition_files, create_partition_writers, find_library_fna_files, format_bytes,
Expand All @@ -24,21 +21,19 @@ pub struct Args {
#[clap(flatten)]
pub build: Build,

/// database hash chunk directory and other files
#[clap(long)]
pub k2d_dir: Option<PathBuf>,

#[clap(flatten)]
pub taxo: Taxo,

// // /// Name of Kraken 2 database
// // #[arg(short, long = "db")]
// // database: PathBuf,
// #[arg(short = 'c', long, required = true)]
// pub required_capacity: u64,
/// hash file capacity (the CLI help text is supplied by the `help` attribute below)
#[clap(long, value_parser = parse_size, default_value = "1G", help = "Specifies the hash file capacity.\nAcceptable formats include numeric values followed by 'K', 'M', or 'G' (e.g., '1.5G', '250M', '1024K').\nNote: The specified capacity affects the index size, with a factor of 4 applied.\nFor example, specifying '1G' results in an index size of '4G'.\nDefault: 1G (capacity 1G = file size 4G)")]
pub hash_capacity: usize,

/// chunk temp directory
#[clap(long)]
pub chunk_dir: PathBuf,

/// chunk size 1-4(GB) [1073741824-4294967295] default: 1GB
#[clap(long, value_parser = clap::value_parser!(u64).range(ONEGB..U32MAXPLUS + 1), default_value_t = ONEGB)]
pub chunk_size: u64,
}

pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::error::Error>> {
Expand All @@ -48,19 +43,19 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro
let id_to_taxon_map_filename = args
.taxo
.id_to_taxon_map_filename
.unwrap_or(args.build.source.join("seqid2taxid.map"));
.unwrap_or(args.build.database.join("seqid2taxid.map"));

let id_to_taxon_map = read_id_to_taxon_map(&id_to_taxon_map_filename)?;

let taxonomy_filename = args
.taxo
.taxonomy_filename
.unwrap_or(args.build.source.join("taxo.k2d"));
let source: PathBuf = args.build.database.clone();
let k2d_dir = args.k2d_dir.unwrap_or(source.clone());

let taxonomy_filename = k2d_dir.join("taxo.k2d");

let ncbi_taxonomy_directory = args
.taxo
.ncbi_taxonomy_directory
.unwrap_or(args.build.source.join("taxonomy"));
.unwrap_or(args.build.database.join("taxonomy"));

let taxonomy = generate_taxonomy(
&ncbi_taxonomy_directory,
Expand All @@ -75,14 +70,13 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro
.expect("more bits required for storing taxid");

let capacity = required_capacity;
let hash_config = HashConfig::<u32>::new(capacity, value_bits, 0, 0, 0);
let partition = (capacity + args.hash_capacity - 1) / args.hash_capacity;
let hash_config = HashConfig::new(capacity, value_bits, 0, partition, args.hash_capacity);

// 开始计时
let start = Instant::now();

let chunk_size = args.chunk_size as usize;

let partition = (capacity + chunk_size - 1) / chunk_size;
let chunk_size = args.hash_capacity as usize;

if partition >= file_num_limit {
panic!("Exceeds File Number Limit");
Expand All @@ -93,12 +87,7 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro

println!("chunk_size {}", format_bytes(chunk_size as f64));

let source: PathBuf = args.build.source.clone();
let fna_files = if source.is_file() {
vec![source.to_string_lossy().to_string()]
} else {
find_library_fna_files(args.build.source)
};
let fna_files = find_library_fna_files(args.build.database);

for fna_file in &fna_files {
println!("convert fna file {:?}", fna_file);
Expand All @@ -114,32 +103,44 @@ pub fn run(args: Args, required_capacity: usize) -> Result<(), Box<dyn std::erro
);
}

let hash_filename = args
.build
.hashtable_filename
.unwrap_or(source.join("hash.k2d"))
.clone();
let hash_filename = source.join("hash_config.k2d");
let partition = chunk_files.len();
for i in 0..partition {
let mut size: u64 = 0;

for i in 1..=partition {
// 计算持续时间
// process_k2file1(hash_config, &chunk_files[i], &taxonomy, chunk_size, i)?;
let count = process_k2file(
hash_config,
&k2d_dir,
&chunk_files[i - 1],
&taxonomy,
chunk_size,
i,
)?;
size += count as u64;
let duration = start.elapsed();
println!(
"process chunk file {:?}/{:}: duration: {:?}",
i, partition, duration
);
let mut chtm = CHTableMut::new(&hash_filename, hash_config, i, chunk_size)?;
process_k2file(&chunk_files[i], &mut chtm, &taxonomy)?;
}

write_config_to_file(
&hash_filename,
partition as u64,
args.hash_capacity as u64,
capacity as u64,
size,
32 - hash_config.value_bits as u64,
hash_config.value_bits as u64,
)?;

// 计算持续时间
let duration = start.elapsed();
// 打印运行时间
println!("build k2 db took: {:?}", duration);

let options_filename = args
.build
.options_filename
.unwrap_or(source.clone().join("opts.k2d"));
let options_filename = k2d_dir.join("opts.k2d");
let idx_opts = IndexOptions::from_meros(meros);
idx_opts.write_to_file(options_filename)?;

Expand Down
8 changes: 4 additions & 4 deletions kr2r/src/bin/estimate_capacity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use std::path::{Path, PathBuf};
pub struct Args {
/// build database directory or file
#[arg(long, default_value = "lib")]
pub source: PathBuf,
pub database: PathBuf,

/// 包含原始配置
#[clap(flatten)]
Expand Down Expand Up @@ -131,16 +131,16 @@ pub fn run(args: Args) -> usize {
let mut hllp: HyperLogLogPlus<u64, KBuildHasher> =
HyperLogLogPlus::new(16, KBuildHasher::default()).unwrap();

let source: PathBuf = args.source.clone();
let source: PathBuf = args.database.clone();
let fna_files = if source.is_file() {
vec![source.to_string_lossy().to_string()]
} else {
find_library_fna_files(args.source)
find_library_fna_files(args.database)
};

for fna_file in fna_files {
let args_clone = Args {
source: source.clone(),
database: source.clone(),
..args
};
let local_hllp = process_sequence(&fna_file, args_clone);
Expand Down
Loading

0 comments on commit da60162

Please sign in to comment.