 use clap::Parser;
-use dashmap::DashMap;
-use kr2r::compact_hash::{Compact, HashConfig};
+use dashmap::{DashMap, DashSet};
+use kr2r::compact_hash::{Compact, HashConfig, Row};
 use kr2r::iclassify::{resolve_tree, trim_pair_info};
 use kr2r::readcounts::{TaxonCounters, TaxonCountersDash};
 use kr2r::report::report_kraken_style;
 use kr2r::taxonomy::Taxonomy;
 use kr2r::utils::find_and_sort_files;
 use rayon::prelude::*;
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use std::fs::File;
 use std::io::{self, BufRead, BufReader, BufWriter, Read, Result, Write};
 use std::path::{Path, PathBuf};
@@ -16,57 +16,130 @@ use std::sync::Mutex;
 
 const BATCH_SIZE: usize = 8 * 1024 * 1024;
 
-pub fn read_id_to_seq_map<P: AsRef<Path>>(filename: P) -> Result<DashMap<u32, (String, usize)>> {
+pub fn read_id_to_seq_map<P: AsRef<Path>>(
+    filename: P,
+) -> Result<DashMap<u32, (String, String, u32, Option<u32>)>> {
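+    // Map value: (sequence id, sequence size field, k-mer count of read 1,
+    // k-mer count of read 2 when the input is paired).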
     let file = File::open(filename)?;
     let reader = BufReader::new(file);
     let id_map = DashMap::new();
 
     reader.lines().par_bridge().for_each(|line| {
         let line = line.expect("Could not read line");
         let parts: Vec<&str> = line.trim().split_whitespace().collect();
-        if parts.len() >= 3 {
+        if parts.len() >= 4 {
             // Parse the first column into a u32 key.
             if let Ok(id) = parts[0].parse::<u32>() {
                 // The second column is the sequence identifier; keep it as a string.
                 let seq_id = parts[1].to_string();
-                if let Ok(count) = parts[2].parse::<usize>() {
-                    // Insert into the DashMap.
-                    id_map.insert(id, (seq_id, count));
-                }
+                let seq_size = parts[2].to_string();
+                let count_parts: Vec<&str> = parts[3].split('|').collect();
+                let kmer_count1 = count_parts[0].parse::<u32>().unwrap();
+                // Tolerate single-end rows that carry no second count.
+                let kmer_count2 = count_parts.get(1).and_then(|s| s.parse::<u32>().ok());
+                id_map.insert(id, (seq_id, seq_size, kmer_count1, kmer_count2));
             }
         }
     });
 
     Ok(id_map)
 }
 
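+// Render one read segment's hits as Kraken2-style "taxid:count" pairs,
+// padding k-mer positions with no hit using "0:n" runs.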
+fn generate_hit_string(
+    count: u32,
+    rows: &Vec<Row>,
+    taxonomy: &Taxonomy,
+    value_mask: usize,
+    offset: u32,
+) -> String {
+    let mut result = String::new();
+    let mut last_pos = 0;
+    let mut has_key = false; // whether any hit position has been emitted yet
+
+    for row in rows {
+        let value = row.value;
+        let key = value.right(value_mask);
+        let ext_code = taxonomy.nodes[key as usize].external_id;
+
+        // Ignore positions that are not in the current segment.
+        if row.kmer_id < offset || row.kmer_id >= offset + count {
+            continue;
+        }
+        // Adjust the position to be relative to the start of the segment.
+        let adjusted_pos = row.kmer_id - offset;
+        // Pad leading zeros.
+        if adjusted_pos > last_pos {
+            if has_key {
+                result.push_str(&format!("0:{} ", adjusted_pos - last_pos - 1));
+            } else {
+                result.push_str(&format!("0:{} ", adjusted_pos));
+            }
+        }
+        // Append the count for the current key.
+        result.push_str(&format!("{}:1 ", ext_code));
+        last_pos = adjusted_pos;
+        has_key = true;
+    }
+
+    // Pad trailing zeros.
+    if last_pos < count - 1 {
+        if has_key {
+            result.push_str(&format!("0:{} ", count - last_pos - 1));
+        } else {
+            result.push_str(&format!("0:{} ", count));
+        }
+    }
+
+    result.trim_end().to_string()
+}
+
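+// Build the full hitlist column; for paired reads the two segments are
+// joined with Kraken2's " |:| " separator.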
+pub fn add_hitlist_string(
+    rows: &Vec<Row>,
+    value_mask: usize,
+    kmer_count1: u32,
+    kmer_count2: Option<u32>,
+    taxonomy: &Taxonomy,
+) -> String {
+    let result1 = generate_hit_string(kmer_count1, rows, taxonomy, value_mask, 0);
+    if let Some(count) = kmer_count2 {
+        let result2 = generate_hit_string(count, rows, taxonomy, value_mask, kmer_count1);
+        format!("{} |:| {}", result1, result2)
+    } else {
+        result1
+    }
+}
+
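+// Tally taxon hits per read. A run of identical values at consecutive
+// k-mer positions counts as a single hit group.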
 pub fn count_values(
-    vec: Vec<u32>,
+    rows: &Vec<Row>,
     value_mask: usize,
+    kmer_count1: u32,
 ) -> (HashMap<u32, u64>, TaxonCountersDash, usize) {
     let mut counts = HashMap::new();
 
-    let mut unique_elements = HashSet::new();
+    let mut hit_count: usize = 0;
 
+    let mut last_row: Row = Row::new(0, 0, 0);
     let cur_taxon_counts = TaxonCountersDash::new();
 
-    for value in vec {
-        // Count with the entry API: `entry` returns an Entry enum for a value
-        // that may or may not be present; `or_insert` inserts the default (0)
-        // when the key is absent, and either way the counter is incremented.
+    for row in rows {
+        let value = row.value;
         let key = value.right(value_mask);
         *counts.entry(key).or_insert(0) += 1;
-        if !unique_elements.contains(&value) {
+
+        // Reset the run tracker when crossing into the second read of a pair.
+        if last_row.kmer_id < kmer_count1 && row.kmer_id > kmer_count1 {
+            last_row = Row::new(0, 0, 0);
+        }
+        if !(last_row.value == value && row.kmer_id - last_row.kmer_id == 1) {
             cur_taxon_counts
                 .entry(key as u64)
                 .or_default()
                 .add_kmer(value as u64);
+            hit_count += 1;
         }
-        unique_elements.insert(value);
+
+        last_row = *row;
     }
 
-    (counts, cur_taxon_counts, unique_elements.len())
+    (counts, cur_taxon_counts, hit_count)
 }
 
 #[derive(Parser, Debug, Clone)]
@@ -84,13 +157,8 @@ pub struct Args {
     #[clap(long, value_parser, required = true)]
     pub chunk_dir: PathBuf,
 
-    // /// The file path for the Kraken 2 index.
-    // #[clap(short = 'H', long = "index-filename", value_parser, required = true)]
-    // index_filename: PathBuf,
-
-    // /// The file path for the Kraken 2 taxonomy.
-    // #[clap(short = 't', long = "taxonomy-filename", value_parser, required = true)]
-    // taxonomy_filename: String,
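+    /// Also emit output lines for reads without any k-mer hits.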
+    #[clap(long, value_parser, default_value_t = false)]
+    pub full_output: bool,
     /// Confidence score threshold, default is 0.0.
     #[clap(
         short = 'T',
@@ -126,20 +194,21 @@ pub struct Args {
     pub kraken_output_dir: Option<PathBuf>,
 }
 
-fn process_batch<P: AsRef<Path>, B: Compact>(
+fn process_batch<P: AsRef<Path>>(
     sample_file: P,
     args: &Args,
     taxonomy: &Taxonomy,
-    id_map: DashMap<u32, (String, usize)>,
-    writer: Box<dyn Write + Send>,
+    id_map: &DashMap<u32, (String, String, u32, Option<u32>)>,
+    writer: &Mutex<Box<dyn Write + Send>>,
     value_mask: usize,
-) -> Result<(TaxonCountersDash, usize)> {
+) -> Result<(TaxonCountersDash, usize, DashSet<u32>)> {
     let file = File::open(sample_file)?;
     let mut reader = BufReader::new(file);
-    let size = std::mem::size_of::<B>();
+    let size = std::mem::size_of::<Row>();
     let mut batch_buffer = vec![0u8; size * BATCH_SIZE];
 
     let hit_counts = DashMap::new();
+    let hit_seq_id_set = DashSet::new();
     let confidence_threshold = args.confidence_threshold;
     let minimum_hit_groups = args.minimum_hit_groups;
 
@@ -150,31 +219,36 @@ fn process_batch<P: AsRef<Path>, B: Compact>(
 
         // Process the batch of data that was just read.
         let slots_in_batch = bytes_read / size;
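+        // Reinterpret the raw bytes as a slice of fixed-size Row records;
+        // assumes the file was written with this build's Row layout.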
-
         let slots = unsafe {
-            std::slice::from_raw_parts(batch_buffer.as_ptr() as *const B, slots_in_batch)
+            std::slice::from_raw_parts(batch_buffer.as_ptr() as *const Row, slots_in_batch)
         };
 
         slots.into_par_iter().for_each(|item| {
-            let cell = item.left(0).to_u32();
-            let seq_id = item.right(0).to_u32();
-            hit_counts.entry(seq_id).or_insert_with(Vec::new).push(cell)
+            let seq_id = item.seq_id;
+            hit_seq_id_set.insert(seq_id);
+            hit_counts
+                .entry(seq_id)
+                .or_insert_with(Vec::new)
+                .push(*item)
         });
     }
 
-    let writer = Mutex::new(writer);
+    // The writer is now wrapped in a Mutex by the caller.
     let classify_counter = AtomicUsize::new(0);
     let cur_taxon_counts = TaxonCountersDash::new();
 
-    hit_counts.into_par_iter().for_each(|(k, cells)| {
+    hit_counts.into_par_iter().for_each(|(k, mut rows)| {
         if let Some(item) = id_map.get(&k) {
-            let total_kmers: usize = item.1;
+            rows.sort_unstable();
+            let total_kmers: usize = item.2 as usize + item.3.unwrap_or(0) as usize;
             let dna_id = trim_pair_info(&item.0);
-            let (counts, cur_counts, hit_groups) = count_values(cells, value_mask);
+            let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, item.2);
+            let hit_string = add_hitlist_string(&rows, value_mask, item.2, item.3, taxonomy);
             let mut call = resolve_tree(&counts, taxonomy, total_kmers, confidence_threshold);
             if call > 0 && hit_groups < minimum_hit_groups {
                 call = 0;
             };
+
             cur_counts.iter().for_each(|entry| {
                 cur_taxon_counts
                     .entry(*entry.key())
@@ -184,20 +258,31 @@ fn process_batch<P: AsRef<Path>, B: Compact>(
             });
 
             let ext_call = taxonomy.nodes[call as usize].external_id;
-            if call > 0 {
-                let output_line = format!("C\t{}\t{}\n", dna_id, ext_call);
-                // Synchronize writes across threads with the lock.
-                let mut file = writer.lock().unwrap();
-                file.write_all(output_line.as_bytes()).unwrap();
+            let classify = if call > 0 {
                 classify_counter.fetch_add(1, Ordering::SeqCst);
                 cur_taxon_counts
                     .entry(call as u64)
                     .or_default()
                     .increment_read_count();
-            }
+
+                "C"
+            } else {
+                "U"
+            };
+            // Synchronize writes across threads with the lock.
+            let output_line = format!(
+                "{}\t{}\t{}\t{}\t{}\n",
+                classify, dna_id, ext_call, item.1, hit_string
+            );
+            let mut file = writer.lock().unwrap();
+            file.write_all(output_line.as_bytes()).unwrap();
         }
     });
-    Ok((cur_taxon_counts, classify_counter.load(Ordering::SeqCst)))
+    Ok((
+        cur_taxon_counts,
+        classify_counter.load(Ordering::SeqCst),
+        hit_seq_id_set,
+    ))
 }
 
 pub fn run(args: Args) -> Result<()> {
@@ -228,14 +313,29 @@ pub fn run(args: Args) -> Result<()> {
         }
         None => Box::new(io::stdout()) as Box<dyn Write + Send>,
     };
-    let (thread_taxon_counts, thread_classified) = process_batch::<&PathBuf, u64>(
+    let writer = Mutex::new(writer);
+    let (thread_taxon_counts, thread_classified, hit_seq_set) = process_batch::<&PathBuf>(
         sample_file,
         &args,
         &taxo,
-        sample_id_map,
-        writer,
+        &sample_id_map,
+        &writer,
         value_mask,
     )?;
+
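+    // With --full-output, also emit a "U" record for every read that had no
+    // k-mer hits at all (such reads never appear in the hit file).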
+    if args.full_output {
+        sample_id_map
+            .iter()
+            .filter(|item| !hit_seq_set.contains(item.key()))
+            .for_each(|item| {
+                let dna_id = trim_pair_info(&item.0);
+                let hit_string = add_hitlist_string(&vec![], value_mask, item.2, item.3, &taxo);
+                let output_line = format!("U\t{}\t0\t{}\t{}\n", dna_id, item.1, hit_string);
+                let mut file = writer.lock().unwrap();
+                file.write_all(output_line.as_bytes()).unwrap();
+            });
+    }
+
     thread_taxon_counts.iter().for_each(|entry| {
         total_taxon_counts
             .entry(*entry.key())