From ea1005f57cb54b635c8a1dc32bbf2964920b5028 Mon Sep 17 00:00:00 2001 From: Vikram Shivakumar Date: Tue, 27 May 2025 17:11:08 -0400 Subject: [PATCH 1/4] added initial rust bindings --- README.md | 50 ++++++++++++++++++-------- digest-rs/Cargo.toml | 24 +++++++++++++ digest-rs/build.rs | 27 ++++++++++++++ digest-rs/examples/demo.rs | 57 +++++++++++++++++++++++++++++ digest-rs/src/bindings.cpp | 60 +++++++++++++++++++++++++++++++ digest-rs/src/lib.rs | 74 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 278 insertions(+), 14 deletions(-) create mode 100644 digest-rs/Cargo.toml create mode 100644 digest-rs/build.rs create mode 100644 digest-rs/examples/demo.rs create mode 100644 digest-rs/src/bindings.cpp create mode 100644 digest-rs/src/lib.rs diff --git a/README.md b/README.md index b345a0b..300e069 100644 --- a/README.md +++ b/README.md @@ -112,20 +112,6 @@ Once installed, you can import and use the Digest library in Python: >>> [seq[p:p+5] for p in window_minimizer(seq, k=5, w=11)] ['ACGTA', 'CGTAG', 'AGCTA', 'TAGCT', 'GCTGA', 'TTACA', 'TACAT', 'GTATG', 'GCAAG', 'TGATC', 'CGTAG', 'TAGTG', 'ATGCT'] ``` -## CLI -A basic cli can be found [here](https://github.com/BenLangmead/gester/tree/main) - -## Benchmark / Tests -```bash -cd build -meson configure -Dbuildtype=debug -meson compile -``` -This will set the build flage from release to debug allowing you to generate proper executables for benchmark/testing. The executables will be located in the build folder and can be run directly from there. You can look at the meson.build file for more details. - -## Contributing -Use clang format version 17. -run `ninja clang-format` before submitting a PR. We have also implemented parallel execution in the python library: ``` @@ -158,6 +144,42 @@ meson compile ``` This will set the build flage from release to debug allowing you to generate proper executables for benchmark/testing. The executables will be located in the build folder and can be run directly from there. You can look at the meson.build file for more details. +## Rust Bindings + +We include Rust bindings for the digest library. To use the Rust bindings, add the following to your `Cargo.toml`: + +```toml +[dependencies] +digest-rs = "0.1.0" +``` + +### Example Usage + +```rust +use digest_rs; + +// Window minimizer example +let sequence = "ACGTACGT"; +let k = 4; +let window = 2; +let minimizers = digest_rs::window_minimizer_rs(sequence, k, window)?; + +// Modimizer example +let mod_val = 3; +let modimizers = digest_rs::modimizer_rs(sequence, k, mod_val)?; + +// Syncmer example +let syncmers = digest_rs::syncmer_rs(sequence, k, window)?; +``` + +The three primary functions are available as bindings to the C++ library (similar to the python bindings). All functions return a `Result, DigestError>` containing the positions of the minimizers in the sequence. + +> [!NOTE] +> These are only Rust bindings of the C++ implementation, not a full Rust implementation. As such, they contain unsafe code blocks. + + + + ## Contributing Use clang format version 17. run `ninja clang-format` before submitting a PR. diff --git a/digest-rs/Cargo.toml b/digest-rs/Cargo.toml new file mode 100644 index 0000000..2d70832 --- /dev/null +++ b/digest-rs/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "digest-rs" +version = "0.1.0" +edition = "2021" +description = "Rust bindings for the digest library" +license = "MIT" +repository = "https://github.com/VeryAmazed/digest" +readme = "README.md" +keywords = ["bioinformatics", "kmer", "minimizer", "syncmer"] +categories = ["science", "bioinformatics"] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[lib] +name = "digest" +crate-type = ["cdylib", "rlib"] + +[dependencies] +libc = "0.2" +thiserror = "1.0" + +[build-dependencies] +cc = "1.0" + diff --git a/digest-rs/build.rs b/digest-rs/build.rs new file mode 100644 index 0000000..4e92696 --- /dev/null +++ b/digest-rs/build.rs @@ -0,0 +1,27 @@ +use std::env; +use std::path::PathBuf; + +fn main() { + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + let digest_dir = PathBuf::from(env::var("HOME").unwrap()).join("vast/digest"); + + // Build the C++ bindings + cc::Build::new() + .cpp(true) + .file("src/bindings.cpp") + .include(digest_dir.join("include")) + .include(digest_dir.join("extern/nthash/include")) + .flag("-std=c++17") + .flag("-fPIC") // Add position-independent code flag + .compile("bindings"); // This creates libbindings.a + + // Link against nthash library + println!("cargo:rustc-link-search=native={}/lib", digest_dir.display()); + println!("cargo:rustc-link-lib=static=nthash"); + + // Link against our bindings library + println!("cargo:rustc-link-lib=static=bindings"); + + // Link against C++ standard library + println!("cargo:rustc-link-lib=dylib=stdc++"); +} \ No newline at end of file diff --git a/digest-rs/examples/demo.rs b/digest-rs/examples/demo.rs new file mode 100644 index 0000000..12df2ac --- /dev/null +++ b/digest-rs/examples/demo.rs @@ -0,0 +1,57 @@ +use digest::{window_minimizer_rs, modimizer_rs, syncmer_rs}; + +fn main() { + // Longer test sequences + let sequences = vec![ + "ACTGCTGACTACTAGCTAGTCGATGACTGCTGACTACTAGCTAGTCGATGAC", + "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG", + "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACA", + ]; + + // Test parameters + let k_values = vec![4, 5]; + let window_sizes = vec![5, 10]; + let mod_values = vec![3, 5]; + + println!("=== Window Minimizer Tests ==="); + for seq in &sequences { + println!("\nSequence: {}", seq); + for k in &k_values { + for window in &window_sizes { + println!("k={}, window={}", k, window); + match window_minimizer_rs(seq, *k, *window) { + Ok(positions) => println!(" Positions: {:?}", positions), + Err(e) => println!(" Error: {}", e), + } + } + } + } + + println!("\n=== Modimizer Tests ==="); + for seq in &sequences { + println!("\nSequence: {}", seq); + for k in &k_values { + for mod_val in &mod_values { + println!("k={}, mod={}", k, mod_val); + match modimizer_rs(seq, *k, *mod_val) { + Ok(positions) => println!(" Positions: {:?}", positions), + Err(e) => println!(" Error: {}", e), + } + } + } + } + + println!("\n=== Syncmer Tests ==="); + for seq in &sequences { + println!("\nSequence: {}", seq); + for k in &k_values { + for window in &window_sizes { + println!("k={}, window={}", k, window); + match syncmer_rs(seq, *k, *window) { + Ok(positions) => println!(" Positions: {:?}", positions), + Err(e) => println!(" Error: {}", e), + } + } + } + } +} \ No newline at end of file diff --git a/digest-rs/src/bindings.cpp b/digest-rs/src/bindings.cpp new file mode 100644 index 0000000..fab7c48 --- /dev/null +++ b/digest-rs/src/bindings.cpp @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +extern "C" { + using namespace digest; + + // Window minimizer wrapper functions + size_t window_minimizer(const char* seq, size_t len, unsigned k, unsigned large_window, uint32_t* out) { + try { + std::string sequence(seq, len); + WindowMin digester(sequence, k, large_window); + std::vector output; + digester.roll_minimizer(sequence.length(), output); + if (!output.empty()) { + std::memcpy(out, output.data(), output.size() * sizeof(uint32_t)); + } + return output.size(); + } catch (...) { + return 0; + } + } + + // Modimizer wrapper function + size_t modimizer(const char* seq, size_t len, unsigned k, uint32_t mod_val, uint32_t* out) { + try { + std::string sequence(seq, len); + ModMin digester(sequence, k, mod_val); + std::vector output; + digester.roll_minimizer(sequence.length(), output); + if (!output.empty()) { + std::memcpy(out, output.data(), output.size() * sizeof(uint32_t)); + } + return output.size(); + } catch (...) { + return 0; + } + } + + // Syncmer wrapper function + size_t syncmer(const char* seq, size_t len, unsigned k, unsigned large_window, uint32_t* out) { + try { + std::string sequence(seq, len); + Syncmer digester(sequence, k, large_window); + std::vector output; + digester.roll_minimizer(sequence.length(), output); + if (!output.empty()) { + std::memcpy(out, output.data(), output.size() * sizeof(uint32_t)); + } + return output.size(); + } catch (...) { + return 0; + } + } +} \ No newline at end of file diff --git a/digest-rs/src/lib.rs b/digest-rs/src/lib.rs new file mode 100644 index 0000000..0c0bb32 --- /dev/null +++ b/digest-rs/src/lib.rs @@ -0,0 +1,74 @@ +use std::ffi::{c_char, CString}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum DigestError { + #[error("Invalid k-mer size: must be greater than 3")] + InvalidKmerSize, + #[error("Invalid window size: must be greater than 0")] + InvalidWindowSize, + #[error("Invalid mod value: must be greater than 0")] + InvalidModValue, + #[error("C++ library error: {0}")] + LibraryError(String), +} + +extern "C" { + fn window_minimizer(seq: *const c_char, len: usize, k: u32, large_window: u32, out: *mut u32) -> usize; + fn modimizer(seq: *const c_char, len: usize, k: u32, mod_val: u32, out: *mut u32) -> usize; + fn syncmer(seq: *const c_char, len: usize, k: u32, large_window: u32, out: *mut u32) -> usize; +} + +pub fn window_minimizer_rs(seq: &str, k: u32, window: u32) -> Result, DigestError> { + if k < 4 { + return Err(DigestError::InvalidKmerSize); + } + if window == 0 { + return Err(DigestError::InvalidWindowSize); + } + + unsafe { + let c_seq = CString::new(seq).unwrap(); + let char_count = seq.chars().count(); + let mut result = Vec::with_capacity(char_count); + let size = window_minimizer(c_seq.as_ptr(), char_count, k, window, result.as_mut_ptr()); + result.set_len(size); + Ok(result) + } +} + +pub fn modimizer_rs(seq: &str, k: u32, mod_val: u32) -> Result, DigestError> { + if k < 4 { + return Err(DigestError::InvalidKmerSize); + } + if mod_val == 0 { + return Err(DigestError::InvalidModValue); + } + + unsafe { + let c_seq = CString::new(seq).unwrap(); + let char_count = seq.chars().count(); + let mut result = Vec::with_capacity(char_count); + let size = modimizer(c_seq.as_ptr(), char_count, k, mod_val, result.as_mut_ptr()); + result.set_len(size); + Ok(result) + } +} + +pub fn syncmer_rs(seq: &str, k: u32, window: u32) -> Result, DigestError> { + if k < 4 { + return Err(DigestError::InvalidKmerSize); + } + if window == 0 { + return Err(DigestError::InvalidWindowSize); + } + + unsafe { + let c_seq = CString::new(seq).unwrap(); + let char_count = seq.chars().count(); + let mut result = Vec::with_capacity(char_count); + let size = syncmer(c_seq.as_ptr(), char_count, k, window, result.as_mut_ptr()); + result.set_len(size); + Ok(result) + } +} \ No newline at end of file From a35124504e962f9811cec3911240dd067902d124 Mon Sep 17 00:00:00 2001 From: Vikram Shivakumar Date: Tue, 27 May 2025 17:56:54 -0400 Subject: [PATCH 2/4] changed installation to be flexible --- README.md | 2 ++ digest-rs/build.rs | 41 ++++++++++++++++++++++++++++++++++------- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 300e069..99ff692 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,8 @@ We include Rust bindings for the digest library. To use the Rust bindings, add t digest-rs = "0.1.0" ``` +When compiling any package that uses `digest-seq`, set the env variable DIGEST_DIR to the build directory after building with meson (or, alternatively, install with Conda). + ### Example Usage ```rust diff --git a/digest-rs/build.rs b/digest-rs/build.rs index 4e92696..7523fa7 100644 --- a/digest-rs/build.rs +++ b/digest-rs/build.rs @@ -2,9 +2,36 @@ use std::env; use std::path::PathBuf; fn main() { - let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); - let digest_dir = PathBuf::from(env::var("HOME").unwrap()).join("vast/digest"); - + // 1. Try to get DIGEST_DIR from environment + let digest_dir = env::var("DIGEST_DIR") + .map(PathBuf::from) + // 2. Fallback: try conda env or system install + .or_else(|_| { + // Try conda environment + if let Ok(prefix) = env::var("CONDA_PREFIX") { + let candidate = PathBuf::from(&prefix).join("include/digest"); + if candidate.exists() { + return Ok(PathBuf::from(prefix)); + } + } + // Try /usr/local or /usr + for prefix in ["/usr/local", "/usr"] { + let candidate = PathBuf::from(prefix).join("include/digest"); + if candidate.exists() { + return Ok(PathBuf::from(prefix)); + } + } + Err(env::VarError::NotPresent) + }) + .unwrap_or_else(|_| { + eprintln!( + "Could not find Digest C++ library. \ + Please set the DIGEST_DIR environment variable to the install prefix \ + (containing 'include/digest' and 'lib/libnthash.a')." + ); + std::process::exit(1); + }); + // Build the C++ bindings cc::Build::new() .cpp(true) @@ -12,16 +39,16 @@ fn main() { .include(digest_dir.join("include")) .include(digest_dir.join("extern/nthash/include")) .flag("-std=c++17") - .flag("-fPIC") // Add position-independent code flag - .compile("bindings"); // This creates libbindings.a + .flag("-fPIC") + .compile("bindings"); // Link against nthash library println!("cargo:rustc-link-search=native={}/lib", digest_dir.display()); println!("cargo:rustc-link-lib=static=nthash"); - + // Link against our bindings library println!("cargo:rustc-link-lib=static=bindings"); - + // Link against C++ standard library println!("cargo:rustc-link-lib=dylib=stdc++"); } \ No newline at end of file From 589e636e16db9f4fc4e3de96bfe8ef7800a6282d Mon Sep 17 00:00:00 2001 From: Vikram Shivakumar Date: Tue, 27 May 2025 17:57:48 -0400 Subject: [PATCH 3/4] added tests --- digest-rs/tests/minimizer_tests.rs | 465 +++++++++++++++++++++++++++++ 1 file changed, 465 insertions(+) create mode 100644 digest-rs/tests/minimizer_tests.rs diff --git a/digest-rs/tests/minimizer_tests.rs b/digest-rs/tests/minimizer_tests.rs new file mode 100644 index 0000000..bf5bb03 --- /dev/null +++ b/digest-rs/tests/minimizer_tests.rs @@ -0,0 +1,465 @@ +use digest::{window_minimizer_rs, modimizer_rs, syncmer_rs}; + +const TEST_SEQUENCES: &[(&str, &str)] = &[ + ("basic", "ACTGCTGACTACTAGCTAGTCGATGACTGCTGACTACTAGCTAGTCGATGACACTGCTGACTACTAGCTAGTCGATGAC"), + ("homopolymer", "AAAAAAAAAAAAAAAAAAAAAAAACCCCCTTTTTTAAAAAAAAAAAAAAAAAAAGGGGGAAAAAAAAAAAAAAAAAAAA"), + ("repeating", "ACGTACGTACGTACGTACGTACGTACGTACGTATATATATATCGCGCGCGACGTACGTACGTACGTACGTACGTACGT"), + ("with_n", "NACTGCTGACTACTAGNATCGATCGATCGANTCGATCGATCGANACTGCTGACTACTAGNATCGATCGATCGANTCGA"), + ("complex", "ATCGATCGATCGATCGGGGGGATCGATCGATCGATCGAAAAATCGATCGATCGATCGTTTTTATCGATCGATCGATCG"), + ("gc_rich", "GCGCGCGCGCGCGCGCGCATGCATGCGCGCGCGCGCGCGCGCTATATGCGCGCGCGCGCGCGCGCATGCATGCGCGC"), + ("at_rich", "ATATATATATATATATATAGCTAGCTATATATATATATATAGCTAGCTATATATATATATATATATAGCTAGCTATA"), +]; + +#[test] +fn test_window_minimizer_basic() { + let seq = TEST_SEQUENCES[0].1; + + // k=4, w=5 + let result = window_minimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![4, 6, 8, 12, 14, 17, 21, 24, 29, 31, 33, 37, 39, 42, 46, 51, 52, 56, 58, 60, 64, 66, 69, 73]); + + // k=4, w=10 + let result = window_minimizer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![8, 12, 14, 21, 31, 33, 37, 39, 46, 56, 58, 60, 64, 66, 73]); + + // k=5, w=5 + let result = window_minimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![2, 3, 8, 10, 15, 17, 21, 25, 27, 28, 33, 35, 40, 42, 46, 48, 52, 54, 55, 60, 62, 67, 69, 73]); + + // k=5, w=10 + let result = window_minimizer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![2, 3, 10, 15, 17, 27, 28, 35, 40, 42, 52, 54, 55, 62, 67, 69]); +} + +#[test] +fn test_window_minimizer_homopolymer() { + let seq = TEST_SEQUENCES[1].1; + + // k=4, w=5 + let result = window_minimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 26, 28, 32, 34, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 57, 58, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75]); + + // k=4, w=10 + let result = window_minimizer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 32, 34, 44, 45, 46, 47, 48, 49, 50, 51, 53, 57, 58, 68, 69, 70, 71, 72, 73, 74, 75]); + + // k=5, w=5 + let result = window_minimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 28, 29, 30, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 55, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]); + + // k=5, w=10 + let result = window_minimizer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 29, 30, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]); +} + +#[test] +fn test_modimizer_basic() { + let seq = TEST_SEQUENCES[0].1; + + // k=4, m=3 + let result = modimizer_rs(seq, 4, 3).unwrap(); + assert_eq!(result, vec![0, 1, 6, 13, 17, 24, 25, 26, 31, 38, 42, 50, 51, 52, 53, 58, 65, 69]); + + // k=4, m=5 + let result = modimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![3, 7, 8, 10, 11, 12, 14, 15, 16, 19, 28, 32, 33, 35, 36, 37, 39, 40, 41, 44, 55, 59, 60, 62, 63, 64, 66, 67, 68, 71]); + + // k=5, m=3 + let result = modimizer_rs(seq, 5, 3).unwrap(); + assert_eq!(result, vec![1, 2, 7, 9, 10, 15, 17, 26, 27, 32, 34, 35, 40, 42, 50, 53, 54, 59, 61, 62, 67, 69]); + + // k=5, m=5 + let result = modimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![10, 15, 20, 35, 40, 45, 48, 49, 62, 67, 72]); +} + +#[test] +fn test_modimizer_homopolymer() { + let seq = TEST_SEQUENCES[1].1; + + // k=4, m=3 + let result = modimizer_rs(seq, 4, 3).unwrap(); + assert_eq!(result, vec![21, 28, 32, 34, 51, 58]); + + // k=4, m=5 + let result = modimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![]); + + // k=5, m=3 + let result = modimizer_rs(seq, 5, 3).unwrap(); + assert_eq!(result, vec![20, 21, 23, 25, 26, 52, 53, 55, 57]); + + // k=5, m=5 + let result = modimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![21, 22, 24, 26, 27, 28, 50, 51, 52, 54]); +} + +#[test] +fn test_syncmer_basic() { + let seq = TEST_SEQUENCES[0].1; + + // k=4, w=5 + let result = syncmer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![0, 2, 4, 8, 10, 12, 14, 17, 21, 24, 25, 27, 29, 33, 35, 37, 39, 42, 46, 47, 48, 52, 54, 56, 60, 62, 64, 66, 69]); + + // k=4, w=10 + let result = syncmer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![3, 5, 12, 21, 22, 24, 28, 30, 37, 46, 47, 49, 51, 55, 57, 64]); + + // k=5, w=5 + let result = syncmer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![2, 3, 4, 6, 10, 11, 13, 17, 21, 23, 27, 28, 29, 31, 35, 36, 38, 42, 46, 48, 50, 54, 55, 56, 58, 62, 63, 65, 69]); + + // k=5, w=10 + let result = syncmer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![2, 3, 6, 8, 17, 18, 27, 28, 31, 33, 42, 43, 45, 54, 55, 58, 60]); +} + +#[test] +fn test_syncmer_homopolymer() { + let seq = TEST_SEQUENCES[1].1; + + // k=4, w=5 + let result = syncmer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 22, 26, 28, 30, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49, 53, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]); + + // k=4, w=10 + let result = syncmer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 22, 23, 25, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 53, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66]); + + // k=5, w=5 + let result = syncmer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 25, 26, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70]); + + // k=5, w=10 + let result = syncmer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65]); +} + +#[test] +fn test_error_cases() { + // Test invalid k-mer size + assert!(window_minimizer_rs("ACGT", 2, 5).is_err()); + assert!(modimizer_rs("ACGT", 2, 3).is_err()); + assert!(syncmer_rs("ACGT", 2, 5).is_err()); + + // Test invalid window/mod size + assert!(window_minimizer_rs("ACGT", 4, 0).is_err()); + assert!(modimizer_rs("ACGT", 4, 0).is_err()); + assert!(syncmer_rs("ACGT", 4, 0).is_err()); +} + +#[test] +fn test_window_minimizer_repeating() { + let seq = TEST_SEQUENCES[2].1; + + // k=4, w=5 + let result = window_minimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 30, 31, 33, 35, 37, 40, 44, 46, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73]); + + // k=4, w=10 + let result = window_minimizer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 30, 31, 33, 35, 37, 40, 46, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73]); + + // k=5, w=5 + let result = window_minimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 29, 30, 35, 36, 37, 38, 40, 41, 46, 47, 49, 54, 57, 58, 61, 62, 65, 66, 69, 70, 73]); + + // k=5, w=10 + let result = window_minimizer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 38, 40, 41, 46, 47, 49, 58, 61, 62, 65, 66, 69, 70, 73]); +} + +#[test] +fn test_window_minimizer_with_n() { + let seq = TEST_SEQUENCES[3].1; + + // k=4, w=5 + let result = window_minimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![5, 7, 9, 12, 21, 23, 25, 32, 34, 36, 38, 44, 48, 50, 52, 55, 64, 66, 68]); + + // k=4, w=10 + let result = window_minimizer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![9, 12, 25, 32, 34, 36, 38, 44, 48, 50, 52, 55, 68]); + + // k=5, w=5 + let result = window_minimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![3, 4, 9, 11, 20, 23, 24, 32, 33, 36, 37, 44, 46, 47, 52, 54, 63, 66, 67]); + + // k=5, w=10 + let result = window_minimizer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![3, 4, 11, 24, 32, 33, 36, 37, 44, 46, 47, 54]); +} + +#[test] +fn test_window_minimizer_complex() { + let seq = TEST_SEQUENCES[4].1; + + // k=4, w=5 + let result = window_minimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![4, 6, 8, 10, 12, 13, 14, 19, 20, 21, 23, 25, 27, 29, 31, 33, 36, 41, 43, 45, 47, 49, 51, 53, 54, 59, 64, 66, 68, 70, 72, 74]); + + // k=4, w=10 + let result = window_minimizer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![8, 10, 12, 21, 23, 25, 27, 29, 31, 33, 41, 43, 45, 47, 49, 51, 53, 59, 68, 70, 72, 74]); + + // k=5, w=5 + let result = window_minimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![3, 6, 7, 10, 11, 12, 13, 17, 20, 23, 24, 27, 28, 31, 32, 34, 37, 40, 44, 47, 48, 51, 52, 54, 57, 60, 65, 68, 69, 72, 73]); + + // k=5, w=10 + let result = window_minimizer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![7, 10, 11, 12, 13, 17, 27, 28, 31, 32, 34, 37, 40, 48, 51, 52, 54, 57, 60, 69, 72, 73]); +} + +#[test] +fn test_window_minimizer_gc_rich() { + let seq = TEST_SEQUENCES[5].1; + + // k=4, w=5 + let result = window_minimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![3, 5, 7, 9, 11, 13, 15, 19, 23, 27, 29, 31, 33, 35, 37, 39, 40, 42, 44, 46, 50, 52, 54, 56, 58, 60, 62, 66, 70]); + + // k=4, w=10 + let result = window_minimizer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![9, 11, 13, 15, 19, 23, 33, 35, 37, 39, 40, 42, 44, 46, 56, 58, 60, 62, 66]); + + // k=5, w=5 + let result = window_minimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 20, 21, 22, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 43, 45, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 67, 68, 69]); + + // k=5, w=10 + let result = window_minimizer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![9, 10, 11, 12, 13, 15, 22, 32, 33, 34, 35, 36, 37, 39, 43, 45, 55, 56, 57, 58, 59, 60, 62, 69]); +} + +#[test] +fn test_window_minimizer_at_rich() { + let seq = TEST_SEQUENCES[6].1; + + // k=4, w=5 + let result = window_minimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71]); + + // k=4, w=10 + let result = window_minimizer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![9, 11, 13, 15, 17, 19, 21, 23, 33, 35, 37, 39, 41, 43, 45, 55, 57, 59, 61, 63, 65, 67, 69, 71]); + + // k=5, w=5 + let result = window_minimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 43, 44, 45, 46, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 69, 70]); + + // k=5, w=10 + let result = window_minimizer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 34, 35, 36, 37, 38, 39, 40, 43, 44, 45, 46, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 69, 70]); +} + +#[test] +fn test_modimizer_repeating() { + let seq = TEST_SEQUENCES[2].1; + + // k=4, m=3 + let result = modimizer_rs(seq, 4, 3).unwrap(); + assert_eq!(result, vec![1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 17, 18, 19, 21, 22, 23, 25, 26, 27, 29, 39, 49, 51, 52, 53, 55, 56, 57, 59, 60, 61, 63, 64, 65, 67, 68, 69, 71, 72, 73]); + + // k=4, m=5 + let result = modimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![31, 32, 33, 34, 35, 36, 37, 38]); + + // k=5, m=3 + let result = modimizer_rs(seq, 5, 3).unwrap(); + assert_eq!(result, vec![29, 31, 32, 33, 34, 35, 36, 37, 42, 43, 44, 45, 48]); + + // k=5, m=5 + let result = modimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![]); +} + +#[test] +fn test_modimizer_with_n() { + let seq = TEST_SEQUENCES[3].1; + + // k=4, m=3 + let result = modimizer_rs(seq, 4, 3).unwrap(); + assert_eq!(result, vec![1, 2, 7, 20, 24, 33, 37, 44, 45, 50, 63, 67]); + + // k=4, m=5 + let result = modimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![4, 8, 9, 11, 12, 18, 22, 26, 31, 35, 39, 47, 51, 52, 54, 55, 61, 65, 69, 74]); + + // k=5, m=3 + let result = modimizer_rs(seq, 5, 3).unwrap(); + assert_eq!(result, vec![2, 3, 8, 10, 11, 45, 46, 51, 53, 54]); + + // k=5, m=5 + let result = modimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![11, 54]); +} + +#[test] +fn test_modimizer_complex() { + let seq = TEST_SEQUENCES[4].1; + + // k=4, m=3 + let result = modimizer_rs(seq, 4, 3).unwrap(); + assert_eq!(result, vec![3, 7, 11, 13, 19, 20, 24, 28, 32, 35, 36, 44, 48, 52, 55, 56, 59, 60, 61, 65, 69, 73]); + + // k=4, m=5 + let result = modimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![1, 5, 9, 22, 26, 30, 34, 35, 40, 42, 46, 50, 63, 67, 71]); + + // k=5, m=3 + let result = modimizer_rs(seq, 5, 3).unwrap(); + assert_eq!(result, vec![12, 14, 17, 19, 38, 39, 53, 54, 56, 60]); + + // k=5, m=5 + let result = modimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![12, 15, 16, 18, 54, 60]); +} + +#[test] +fn test_modimizer_gc_rich() { + let seq = TEST_SEQUENCES[5].1; + + // k=4, m=3 + let result = modimizer_rs(seq, 4, 3).unwrap(); + assert_eq!(result, vec![15, 23, 46, 62, 70]); + + // k=4, m=5 + let result = modimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![15, 23, 40, 42, 43, 46, 62, 70]); + + // k=5, m=3 + let result = modimizer_rs(seq, 5, 3).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 42, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 65, 66, 71, 72]); + + // k=5, m=5 + let result = modimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![40]); +} + +#[test] +fn test_modimizer_at_rich() { + let seq = TEST_SEQUENCES[6].1; + + // k=4, m=3 + let result = modimizer_rs(seq, 4, 3).unwrap(); + assert_eq!(result, vec![18, 22, 40, 44, 66, 70]); + + // k=4, m=5 + let result = modimizer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 19, 20, 21, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39, 41, 42, 43, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 67, 68, 69, 71, 73]); + + // k=5, m=3 + let result = modimizer_rs(seq, 5, 3).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62]); + + // k=5, m=5 + let result = modimizer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![16, 23, 38, 45, 64, 71]); +} + +#[test] +fn test_syncmer_repeating() { + let seq = TEST_SEQUENCES[2].1; + + // k=4, w=5 + let result = syncmer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 26, 27, 29, 31, 33, 35, 37, 40, 42, 44, 46, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69]); + + // k=4, w=10 + let result = syncmer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 26, 28, 31, 33, 35, 37, 40, 42, 44, 46, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65]); + + // k=5, w=5 + let result = syncmer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 29, 30, 31, 32, 33, 34, 36, 37, 41, 42, 43, 45, 49, 50, 53, 54, 57, 58, 61, 62, 65, 66, 69]); + + // k=5, w=10 + let result = syncmer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![0, 2, 3, 4, 6, 7, 8, 10, 11, 12, 14, 15, 16, 18, 19, 20, 23, 24, 27, 28, 29, 31, 32, 37, 38, 40, 49, 50, 52, 53, 54, 56, 57, 58, 60, 61, 62, 64]); +} + +#[test] +fn test_syncmer_with_n() { + let seq = TEST_SEQUENCES[3].1; + + // k=4, w=5 + let result = syncmer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![1, 3, 5, 9, 12, 17, 19, 21, 23, 24, 25, 26, 32, 34, 36, 44, 46, 48, 52, 55, 60, 62, 64, 66]); + + // k=4, w=10 + let result = syncmer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![9, 12, 17, 19, 21, 23, 25, 31, 35, 37, 39, 52, 55, 60]); + + // k=5, w=5 + let result = syncmer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![3, 4, 5, 7, 11, 19, 20, 23, 24, 32, 33, 35, 37, 46, 47, 48, 50, 54, 62, 63]); + + // k=5, w=10 + let result = syncmer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![3, 4, 11, 18, 19, 20, 22, 23, 24, 25, 32, 46, 47, 54]); +} + +#[test] +fn test_syncmer_complex() { + let seq = TEST_SEQUENCES[4].1; + + // k=4, w=5 + let result = syncmer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![0, 2, 4, 6, 8, 10, 12, 13, 14, 15, 16, 17, 19, 21, 23, 25, 27, 29, 31, 33, 36, 37, 39, 41, 43, 45, 47, 49, 51, 53, 54, 55, 59, 60, 62, 64, 66, 68, 70]); + + // k=4, w=10 + let result = syncmer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 21, 22, 23, 24, 25, 27, 29, 31, 32, 33, 34, 36, 38, 40, 41, 42, 43, 44, 45, 47, 49, 50, 59, 61, 62, 63, 64, 65]); + + // k=5, w=5 + let result = syncmer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![2, 3, 6, 7, 8, 9, 13, 17, 19, 20, 23, 24, 27, 28, 30, 34, 37, 40, 43, 44, 47, 48, 50, 53, 57, 60, 61, 64, 65, 68, 69]); + + // k=5, w=10 + let result = syncmer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![1, 2, 3, 4, 8, 17, 18, 19, 20, 22, 23, 24, 25, 34, 37, 40, 42, 43, 44, 45, 48, 57, 60, 63, 64]); +} + +#[test] +fn test_syncmer_gc_rich() { + let seq = TEST_SEQUENCES[5].1; + + // k=4, w=5 + let result = syncmer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![1, 3, 5, 7, 9, 11, 15, 19, 23, 25, 27, 29, 31, 33, 35, 36, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 62, 66]); + + // k=4, w=10 + let result = syncmer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 10, 19, 23, 24, 25, 26, 27, 28, 29, 30, 31, 40, 42, 44, 46, 47, 48, 49, 50, 51, 52, 53, 57]); + + // k=5, w=5 + let result = syncmer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 39, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 62, 63, 64, 65]); + + // k=5, w=10 + let result = syncmer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 13, 15, 22, 23, 24, 25, 26, 27, 28, 29, 30, 34, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 60, 62]); +} + +#[test] +fn test_syncmer_at_rich() { + let seq = TEST_SEQUENCES[6].1; + + // k=4, w=5 + let result = syncmer_rs(seq, 4, 5).unwrap(); + assert_eq!(result, vec![1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69]); + + // k=4, w=10 + let result = syncmer_rs(seq, 4, 10).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 17, 19, 21, 23, 24, 25, 26, 27, 28, 29, 30, 32, 34, 36, 39, 41, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 58, 60, 62]); + + // k=5, w=5 + let result = syncmer_rs(seq, 5, 5).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 39, 40, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 65, 66]); + + // k=5, w=10 + let result = syncmer_rs(seq, 5, 10).unwrap(); + assert_eq!(result, vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 34, 35, 39, 40, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 60, 61]); +} \ No newline at end of file From 1cf03997b6d5200169f13f0ae770138055081b2d Mon Sep 17 00:00:00 2001 From: Vikram Shivakumar Date: Tue, 27 May 2025 18:02:12 -0400 Subject: [PATCH 4/4] rust readme --- digest-rs/README.md | 47 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 digest-rs/README.md diff --git a/digest-rs/README.md b/digest-rs/README.md new file mode 100644 index 0000000..ad87e42 --- /dev/null +++ b/digest-rs/README.md @@ -0,0 +1,47 @@ +# digest-rs + +Rust bindings for the [digest](https://github.com/VeryAmazed/digest) C++ library, providing efficient kmer minimizer and syncmer digestion for DNA sequences. + +## Requirements + +- The C++ `digest` library must be available at `$DIGEST_DIR` (see `build.rs`) or installed via conda. + +## Usage + +Add to your `Cargo.toml`: + +```toml +[dependencies] +digest-rs = "0.1.0" +``` + +### Example + +```rust +use digest_rs::{window_minimizer_rs, modimizer_rs, syncmer_rs}; + +let sequence = "ACGTACGT"; +let k = 4; +let window = 2; +let mod_val = 3; + +// Window minimizer +let minimizers = window_minimizer_rs(sequence, k, window)?; + +// Modimizer +let modimizers = modimizer_rs(sequence, k, mod_val)?; + +// Syncmer +let syncmers = syncmer_rs(sequence, k, window)?; +``` + +Each function returns a `Result, DigestError>` with the minimizer positions. + +## Building + +This crate uses a `build.rs` script to compile the C++ bindings and link against the C++ `digest` library. +Make sure the C++ library and its dependencies are built and available at the expected paths. + +## License + +MIT \ No newline at end of file