Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ Once installed, you can import and use the Digest library in Python:
>>> [seq[p:p+5] for p in window_minimizer(seq, k=5, w=11)]
['ACGTA', 'CGTAG', 'AGCTA', 'TAGCT', 'GCTGA', 'TTACA', 'TACAT', 'GTATG', 'GCAAG', 'TGATC', 'CGTAG', 'TAGTG', 'ATGCT']
```

We have also implemented parallel execution in the python library:
```
>>> import timeit
Expand Down Expand Up @@ -143,6 +144,44 @@ meson compile
```
This will switch the build type from release to debug, allowing you to generate proper executables for benchmarking and testing. The executables will be located in the build folder and can be run directly from there. See the `meson.build` file for more details.

## Rust Bindings

We include Rust bindings for the digest library. To use the Rust bindings, add the following to your `Cargo.toml`:

```toml
[dependencies]
digest-rs = "0.1.0"
```

When compiling any package that uses `digest-rs`, set the `DIGEST_DIR` environment variable to the build directory after building with meson (or, alternatively, install with Conda).

### Example Usage

```rust
// The package is called `digest-rs`, but the library crate it exposes is named `digest`.
use digest;

// Window minimizer example
let sequence = "ACGTACGT";
let k = 4;
let window = 2;
let minimizers = digest::window_minimizer_rs(sequence, k, window)?;

// Modimizer example
let mod_val = 3;
let modimizers = digest::modimizer_rs(sequence, k, mod_val)?;

// Syncmer example
let syncmers = digest::syncmer_rs(sequence, k, window)?;
```

The three primary functions are available as bindings to the C++ library (similar to the python bindings). All functions return a `Result<Vec<u32>, DigestError>` containing the positions of the minimizers in the sequence.

> [!NOTE]
> These are only Rust bindings of the C++ implementation, not a full Rust implementation. As such, they contain unsafe code blocks.




## Contributing
Use clang format version 17.
run `ninja clang-format` before submitting a PR.
Expand Down
24 changes: 24 additions & 0 deletions digest-rs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[package]
name = "digest-rs"
version = "0.1.0"
edition = "2021"
description = "Rust bindings for the digest library"
license = "MIT"
repository = "https://github.com/VeryAmazed/digest"
readme = "README.md"
keywords = ["bioinformatics", "kmer", "minimizer", "syncmer"]
categories = ["science", "bioinformatics"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[lib]
name = "digest"
crate-type = ["cdylib", "rlib"]

[dependencies]
libc = "0.2"
thiserror = "1.0"

[build-dependencies]
cc = "1.0"

47 changes: 47 additions & 0 deletions digest-rs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# digest-rs

Rust bindings for the [digest](https://github.com/VeryAmazed/digest) C++ library, providing efficient kmer minimizer and syncmer digestion for DNA sequences.

## Requirements

- The C++ `digest` library must be available at `$DIGEST_DIR` (see `build.rs`) or installed via conda.

## Usage

Add to your `Cargo.toml`:

```toml
[dependencies]
digest-rs = "0.1.0"
```

### Example

```rust
// The package is called `digest-rs`, but the library crate it exposes is named `digest`.
use digest::{window_minimizer_rs, modimizer_rs, syncmer_rs};

let sequence = "ACGTACGT";
let k = 4;
let window = 2;
let mod_val = 3;

// Window minimizer
let minimizers = window_minimizer_rs(sequence, k, window)?;

// Modimizer
let modimizers = modimizer_rs(sequence, k, mod_val)?;

// Syncmer
let syncmers = syncmer_rs(sequence, k, window)?;
```

Each function returns a `Result<Vec<u32>, DigestError>` with the minimizer positions.

## Building

This crate uses a `build.rs` script to compile the C++ bindings and link against the C++ `digest` library.
Make sure the C++ library and its dependencies are built and available at the expected paths.

## License

MIT
54 changes: 54 additions & 0 deletions digest-rs/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
use std::env;
use std::path::PathBuf;

/// Locates the Digest install prefix.
///
/// `DIGEST_DIR` is used verbatim when it is set. Otherwise the first of `$CONDA_PREFIX`,
/// `/usr/local` and `/usr` that contains an `include/digest` directory is returned.
fn locate_digest_dir() -> Option<PathBuf> {
    if let Ok(dir) = env::var("DIGEST_DIR") {
        return Some(PathBuf::from(dir));
    }

    if let Ok(prefix) = env::var("CONDA_PREFIX") {
        let prefix = PathBuf::from(prefix);
        if prefix.join("include/digest").exists() {
            return Some(prefix);
        }
    }

    ["/usr/local", "/usr"]
        .iter()
        .map(|prefix| PathBuf::from(*prefix))
        .find(|prefix| prefix.join("include/digest").exists())
}

/// Build script: compiles `src/bindings.cpp` against the Digest headers and emits the link
/// directives for the bindings, `nthash`, and the C++ runtime.
fn main() {
    // Tell Cargo exactly which inputs matter. Without the env-var directives, pointing
    // DIGEST_DIR (or CONDA_PREFIX) at a different install would not re-run this script.
    println!("cargo:rerun-if-changed=build.rs");
    println!("cargo:rerun-if-changed=src/bindings.cpp");
    println!("cargo:rerun-if-env-changed=DIGEST_DIR");
    println!("cargo:rerun-if-env-changed=CONDA_PREFIX");

    let digest_dir = locate_digest_dir().unwrap_or_else(|| {
        eprintln!(
            "Could not find Digest C++ library. \
             Please set the DIGEST_DIR environment variable to the install prefix \
             (containing 'include/digest' and 'lib/libnthash.a')."
        );
        std::process::exit(1);
    });

    // Build the C++ bindings
    cc::Build::new()
        .cpp(true)
        .file("src/bindings.cpp")
        .include(digest_dir.join("include"))
        .include(digest_dir.join("extern/nthash/include"))
        .flag("-std=c++17")
        .flag("-fPIC")
        .compile("bindings");

    // Link against nthash library
    println!("cargo:rustc-link-search=native={}/lib", digest_dir.display());
    println!("cargo:rustc-link-lib=static=nthash");

    // Link against our bindings library
    println!("cargo:rustc-link-lib=static=bindings");

    // Link against the C++ standard library. Apple and BSD toolchains ship libc++ rather
    // than libstdc++, so the name has to follow the *target*, not be hard-coded.
    let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
    let cxx_runtime = match target_os.as_str() {
        "macos" | "ios" | "freebsd" | "openbsd" => "c++",
        _ => "stdc++",
    };
    println!("cargo:rustc-link-lib=dylib={}", cxx_runtime);
}
57 changes: 57 additions & 0 deletions digest-rs/examples/demo.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use digest::{window_minimizer_rs, modimizer_rs, syncmer_rs};

/// Prints the outcome of a single digest call in the demo's fixed output format.
fn report<E: std::fmt::Display>(outcome: Result<Vec<u32>, E>) {
    match outcome {
        Ok(positions) => println!(" Positions: {:?}", positions),
        Err(e) => println!(" Error: {}", e),
    }
}

/// Runs every digest scheme over a few sample sequences and parameter combinations,
/// printing the resulting positions (or the error) for each combination.
fn main() {
    // Sample inputs, long enough for every k / window combination below.
    let sequences = [
        "ACTGCTGACTACTAGCTAGTCGATGACTGCTGACTACTAGCTAGTCGATGAC",
        "ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG",
        "GATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACAGATTACA",
    ];

    // Parameter grid
    let k_values = [4u32, 5];
    let window_sizes = [5u32, 10];
    let mod_values = [3u32, 5];

    println!("=== Window Minimizer Tests ===");
    for seq in sequences {
        println!("\nSequence: {}", seq);
        for k in k_values {
            for window in window_sizes {
                println!("k={}, window={}", k, window);
                report(window_minimizer_rs(seq, k, window));
            }
        }
    }

    println!("\n=== Modimizer Tests ===");
    for seq in sequences {
        println!("\nSequence: {}", seq);
        for k in k_values {
            for mod_val in mod_values {
                println!("k={}, mod={}", k, mod_val);
                report(modimizer_rs(seq, k, mod_val));
            }
        }
    }

    println!("\n=== Syncmer Tests ===");
    for seq in sequences {
        println!("\nSequence: {}", seq);
        for k in k_values {
            for window in window_sizes {
                println!("k={}, window={}", k, window);
                report(syncmer_rs(seq, k, window));
            }
        }
    }
}
60 changes: 60 additions & 0 deletions digest-rs/src/bindings.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#include <digest/window_minimizer.hpp>
#include <digest/syncmer.hpp>
#include <digest/mod_minimizer.hpp>
#include <digest/data_structure.hpp>
#include <cstring>
#include <memory>
#include <vector>
#include <string>

extern "C" {
using namespace digest;

// Window minimizer wrapper functions
size_t window_minimizer(const char* seq, size_t len, unsigned k, unsigned large_window, uint32_t* out) {
try {
std::string sequence(seq, len);
WindowMin<BadCharPolicy::SKIPOVER, ds::Adaptive> digester(sequence, k, large_window);
std::vector<uint32_t> output;
digester.roll_minimizer(sequence.length(), output);
if (!output.empty()) {
std::memcpy(out, output.data(), output.size() * sizeof(uint32_t));
}
return output.size();
} catch (...) {
return 0;
}
}

// Modimizer wrapper function
size_t modimizer(const char* seq, size_t len, unsigned k, uint32_t mod_val, uint32_t* out) {
try {
std::string sequence(seq, len);
ModMin<BadCharPolicy::SKIPOVER> digester(sequence, k, mod_val);
std::vector<uint32_t> output;
digester.roll_minimizer(sequence.length(), output);
if (!output.empty()) {
std::memcpy(out, output.data(), output.size() * sizeof(uint32_t));
}
return output.size();
} catch (...) {
return 0;
}
}

// Syncmer wrapper function
size_t syncmer(const char* seq, size_t len, unsigned k, unsigned large_window, uint32_t* out) {
try {
std::string sequence(seq, len);
Syncmer<BadCharPolicy::SKIPOVER, ds::Adaptive> digester(sequence, k, large_window);
std::vector<uint32_t> output;
digester.roll_minimizer(sequence.length(), output);
if (!output.empty()) {
std::memcpy(out, output.data(), output.size() * sizeof(uint32_t));
}
return output.size();
} catch (...) {
return 0;
}
}
}
74 changes: 74 additions & 0 deletions digest-rs/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
use std::ffi::{c_char, CString};
use thiserror::Error;

/// Errors returned by the safe wrappers in this crate.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
#[derive(Error, Debug)]
pub enum DigestError {
/// The requested k-mer size `k` was smaller than 4.
#[error("Invalid k-mer size: must be greater than 3")]
InvalidKmerSize,
/// The requested window size was 0.
#[error("Invalid window size: must be greater than 0")]
InvalidWindowSize,
/// The requested modulus was 0.
#[error("Invalid mod value: must be greater than 0")]
InvalidModValue,
/// A failure attributed to the underlying C++ library; the payload is a
/// human-readable description that is appended to the error message.
#[error("C++ library error: {0}")]
LibraryError(String),
}

// Raw declarations of the C-ABI wrappers defined in `src/bindings.cpp`.
//
// Each function reads `len` bytes from `seq`, writes the resulting positions to `out`
// and returns how many positions it wrote. No output capacity is passed, so the caller
// must size `out` itself. The C++ side returns 0 when it catches an exception, which
// is indistinguishable from "no positions found".
//
// NOTE(review): the C++ signatures use `unsigned` for `k` / `large_window`; declaring
// them as `u32` here assumes `unsigned` is 32 bits on the target — confirm.
extern "C" {
fn window_minimizer(seq: *const c_char, len: usize, k: u32, large_window: u32, out: *mut u32) -> usize;
fn modimizer(seq: *const c_char, len: usize, k: u32, mod_val: u32, out: *mut u32) -> usize;
fn syncmer(seq: *const c_char, len: usize, k: u32, large_window: u32, out: *mut u32) -> usize;
}

/// Computes the window-minimizer positions of `seq`.
///
/// `k` is the k-mer size and `window` the large-window size handed to the C++
/// `WindowMin` digester.
///
/// # Errors
///
/// * [`DigestError::InvalidKmerSize`] if `k < 4`.
/// * [`DigestError::InvalidWindowSize`] if `window == 0`.
/// * [`DigestError::LibraryError`] if the C++ side reports more positions than the
///   output buffer can hold.
pub fn window_minimizer_rs(seq: &str, k: u32, window: u32) -> Result<Vec<u32>, DigestError> {
    if k < 4 {
        return Err(DigestError::InvalidKmerSize);
    }
    if window == 0 {
        return Err(DigestError::InvalidWindowSize);
    }

    // The C++ wrapper takes an explicit length, so the bytes can be passed as-is: no
    // NUL-terminated copy is needed (and an interior NUL can no longer panic). The
    // length must be in *bytes*; a `char` count would truncate multi-byte input.
    let len = seq.len();
    let mut positions: Vec<u32> = Vec::with_capacity(len);

    // SAFETY: `seq` is valid for reads of `len` bytes for the whole call, and
    // `positions` has room for `len` values. The callee reports k-mer start positions,
    // which are expected to number at most one per input byte.
    let written = unsafe {
        window_minimizer(seq.as_ptr().cast::<c_char>(), len, k, window, positions.as_mut_ptr())
    };

    let capacity = positions.capacity();
    if written > capacity {
        return Err(DigestError::LibraryError(format!(
            "window_minimizer reported {} positions but the buffer holds only {}",
            written, capacity
        )));
    }

    // SAFETY: `written <= capacity` was checked above, and the callee initialised the
    // first `written` elements before returning.
    unsafe { positions.set_len(written) };
    Ok(positions)
}

/// Computes the mod-minimizer (modimizer) positions of `seq`.
///
/// `k` is the k-mer size and `mod_val` the modulus handed to the C++ `ModMin` digester.
///
/// # Errors
///
/// * [`DigestError::InvalidKmerSize`] if `k < 4`.
/// * [`DigestError::InvalidModValue`] if `mod_val == 0`.
/// * [`DigestError::LibraryError`] if the C++ side reports more positions than the
///   output buffer can hold.
pub fn modimizer_rs(seq: &str, k: u32, mod_val: u32) -> Result<Vec<u32>, DigestError> {
    if k < 4 {
        return Err(DigestError::InvalidKmerSize);
    }
    if mod_val == 0 {
        return Err(DigestError::InvalidModValue);
    }

    // The C++ wrapper takes an explicit length, so the bytes can be passed as-is: no
    // NUL-terminated copy is needed (and an interior NUL can no longer panic). The
    // length must be in *bytes*; a `char` count would truncate multi-byte input.
    let len = seq.len();
    let mut positions: Vec<u32> = Vec::with_capacity(len);

    // SAFETY: `seq` is valid for reads of `len` bytes for the whole call, and
    // `positions` has room for `len` values. The callee reports k-mer start positions,
    // which are expected to number at most one per input byte.
    let written = unsafe {
        modimizer(seq.as_ptr().cast::<c_char>(), len, k, mod_val, positions.as_mut_ptr())
    };

    let capacity = positions.capacity();
    if written > capacity {
        return Err(DigestError::LibraryError(format!(
            "modimizer reported {} positions but the buffer holds only {}",
            written, capacity
        )));
    }

    // SAFETY: `written <= capacity` was checked above, and the callee initialised the
    // first `written` elements before returning.
    unsafe { positions.set_len(written) };
    Ok(positions)
}

/// Computes the syncmer positions of `seq`.
///
/// `k` is the k-mer size and `window` the large-window size handed to the C++
/// `Syncmer` digester.
///
/// # Errors
///
/// * [`DigestError::InvalidKmerSize`] if `k < 4`.
/// * [`DigestError::InvalidWindowSize`] if `window == 0`.
/// * [`DigestError::LibraryError`] if the C++ side reports more positions than the
///   output buffer can hold.
pub fn syncmer_rs(seq: &str, k: u32, window: u32) -> Result<Vec<u32>, DigestError> {
    if k < 4 {
        return Err(DigestError::InvalidKmerSize);
    }
    if window == 0 {
        return Err(DigestError::InvalidWindowSize);
    }

    // The C++ wrapper takes an explicit length, so the bytes can be passed as-is: no
    // NUL-terminated copy is needed (and an interior NUL can no longer panic). The
    // length must be in *bytes*; a `char` count would truncate multi-byte input.
    let len = seq.len();
    let mut positions: Vec<u32> = Vec::with_capacity(len);

    // SAFETY: `seq` is valid for reads of `len` bytes for the whole call, and
    // `positions` has room for `len` values. The callee reports k-mer start positions,
    // which are expected to number at most one per input byte.
    let written = unsafe {
        syncmer(seq.as_ptr().cast::<c_char>(), len, k, window, positions.as_mut_ptr())
    };

    let capacity = positions.capacity();
    if written > capacity {
        return Err(DigestError::LibraryError(format!(
            "syncmer reported {} positions but the buffer holds only {}",
            written, capacity
        )));
    }

    // SAFETY: `written <= capacity` was checked above, and the callee initialised the
    // first `written` elements before returning.
    unsafe { positions.set_len(written) };
    Ok(positions)
}
Loading
Loading