diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 15a1bced..1d20979d 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -19,7 +19,7 @@ jobs: # - {os: windows-latest, r: 'release', rust-version: 'stable-msvc', rust-target: 'x86_64-pc-windows-gnu'} - {os: macOS-latest, r: 'release', rust-version: 'stable'} - {os: ubuntu-latest, r: 'release', rust-version: 'stable'} - - {os: ubuntu-latest, r: 'devel', rust-version: 'stable'} + #- {os: ubuntu-latest, r: 'devel', rust-version: 'stable'} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true steps: diff --git a/.gitignore b/.gitignore index 21fc1384..d2227469 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ bin/ .DS_Store .Rhistory +/gtars/tests/data/out/region_scoring_count.csv.gz diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..d6dcf2ff --- /dev/null +++ b/LICENSE @@ -0,0 +1,9 @@ +Copyright 2024 gtars authors + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000..d6dcf2ff --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,9 @@ +Copyright 2024 gtars authors + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/README.md b/README.md index 3a71fc27..d486b88d 100644 --- a/README.md +++ b/README.md @@ -7,52 +7,65 @@ `gtars` is a rust crate that provides a set of tools for working with genomic interval data. Its primary goal is to provide processors for our python package, [`geniml`](https://github.com/databio/geniml), a library for machine learning on genomic intervals. However, it can be used as a standalone library for working with genomic intervals as well. -`gtars` provides three things: +`gtars` provides four things: 1. A rust library crate. 2. A command-line interface, written in rust. -3. A Python package that provides bindings to the rust library. +3. A Python package that provides Python bindings to the rust library. +4. An R package that provides R bindings to the rust library. ## Repository organization (for developers) This repo is organized like so: -1. A rust library crate (`/gtars/lib.rs`) that provides functions, traits, and structs for working with genomic interval data. -2. A rust binary crate (in `/gtars/main.rs`), a small, wrapper command-line interface for the library crate. -3. A rust crate (in `/bindings`) that provides Python bindings, and a resulting Python package, so that it can be used within Python. +1. The main gtars rust package in `/gtars`, which contains two crates: + 1a. A rust library crate (`/gtars/lib.rs`) that provides functions, traits, and structs for working with genomic interval data. + 1b. A rust binary crate (in `/gtars/main.rs`), a small wrapper command-line interface for the library crate. +2. Python bindings (in `/bindings/python`), which consist of a rust package with a library crate (no binary crate) and a Python package. +3. R bindings (in `/bindings/r`), which consist of an R package. This repository is a work in progress, and still in early development. ## Installation + To install `gtars`, you must have the rust toolchain installed. You can install it by following the instructions [here](https://www.rust-lang.org/tools/install). You may build the binary locally using `cargo build --release`. This will create a binary in `target/release/gtars`. You can then add this to your path, or run it directly. ## Usage + `gtars` is very early in development, and as such, it does not have a lot of functionality yet. However, it does have a few useful tools. To see the available tools, run `gtars --help`. To see the help for a specific tool, run `gtars <tool> --help`. Alternatively, you can link `gtars` as a library in your rust project. To do so, add the following to your `Cargo.toml` file: + ```toml [dependencies] gtars = { git = "https://github.com/databio/gtars" } ``` ## Testing + To run the tests, run `cargo test`. ## Contributing + ### New internal library crate tools + If you'd like to add a new tool, you can do so by creating a new module within the src folder. ### New public library crate tools + If you want this to be available to users of `gtars`, you can add it to the `gtars` library crate as well.
To do so, add the following to `src/lib.rs`: ```rust pub mod <tool_name>; ``` ### New binary crate tools + Finally, if you want to have command-line functionality, you can add it to the `gtars` binary crate. This requires two steps: + 1. Create a new `cli` using `clap` inside the `interfaces` module of `src/cli.rs`: + ```rust pub fn make_new_tool_cli() -> Command { @@ -60,6 +73,7 @@ pub fn make_new_tool_cli() -> Command { ``` 2. Write your logic in a wrapper function. This will live inside the `functions` module of `src/cli.rs`: + ```rust // top of file: use tool_name::{ ... } @@ -73,6 +87,7 @@ pub fn new_tool_wrapper() -> Result<(), Box<dyn Error>> { Please make sure you update the changelog and bump the version number in `Cargo.toml` when you add a new tool. ### VSCode users + If you are using VSCode, make sure you link to the `Cargo.toml` inside the `.vscode` folder, so that `rust-analyzer` can link it all together: ```json { diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 5c9fcc55..f34a403c 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gtars-py" -version = "0.1.1" +version = "0.2.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/bindings/python/README.md b/bindings/python/README.md index 267eab85..f3fff89a 100644 --- a/bindings/python/README.md +++ b/bindings/python/README.md @@ -1,18 +1,23 @@ # gtars -This is a python wrapper around the `gtars` crate. It provides an easy interface for using `gtars` in python. It is currently in early development, and as such, it does not have a lot of functionality yet, but new tools are being worked on right now. -## Installation -You can get `gtars` from PyPI: -```bash -pip install gtars -``` +This is a Python package that wraps the `gtars` crate so you can call gtars code from Python. + +Documentation for Python bindings is hosted at: https://docs.bedbase.org/gtars/ + +## Brief instructions -## Usage -Import the package, and use the tools: -```python -import gtars as gt +To install the development version, you'll have to build it locally. Build Python bindings like this: -gt.prune_universe(...) +```console +cd bindings/python +maturin build --interpreter 3.11 --release ``` -## Developer docs -Write the develop docs here... \ No newline at end of file + +Then install the local wheel that was just built: + +```console +gtars_version=`grep '^version =' Cargo.toml | cut -d '"' -f 2` +python_version=$(python --version | awk '{print $2}' | cut -d '.' -f1-2 | tr -d '.') +wheel_path=$(find target/wheels/gtars-${gtars_version}-cp${python_version}-cp${python_version}-*.whl) +pip install --force-reinstall ${wheel_path} +``` \ No newline at end of file diff --git a/bindings/python/gtars/digests/__init__.py b/bindings/python/gtars/digests/__init__.py new file mode 100644 index 00000000..d21d6228 --- /dev/null +++ b/bindings/python/gtars/digests/__init__.py @@ -0,0 +1 @@ +from .gtars.digests import * # noqa: F403 diff --git a/bindings/python/src/digests/mod.rs b/bindings/python/src/digests/mod.rs new file mode 100644 index 00000000..f51ef963 --- /dev/null +++ b/bindings/python/src/digests/mod.rs @@ -0,0 +1,71 @@ +// This is intended to provide minimal Python bindings to functions in the `digests` module of the `gtars` crate. 
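+//
+// An illustrative sketch of the expected Python-side usage (a hedged example;
+// it assumes the built wheel is installed and uses only the names registered
+// by the bindings defined below):
+//
+//     from gtars.digests import sha512t24u_digest, digest_fasta
+//
+//     print(sha512t24u_digest("hello world"))  # MJ7MSJwS1utMxA9QyQLytNDtd-5RGnx6
+//     for result in digest_fasta("tests/data/base.fa"):
+//         print(result.id, result.length, result.sha512t24u, result.md5)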
+ +use pyo3::prelude::*; +use gtars::digests::{sha512t24u, md5, DigestResult}; + +#[pyfunction] +pub fn sha512t24u_digest(readable: &str) -> String { + return sha512t24u(readable); +} + +#[pyfunction] +pub fn md5_digest(readable: &str) -> String { + return md5(readable); +} + +#[pyfunction] +pub fn digest_fasta(fasta: &str) -> PyResult<Vec<PyDigestResult>> { + match gtars::digests::digest_fasta(fasta) { + Ok(digest_results) => { + let py_digest_results: Vec<PyDigestResult> = digest_results.into_iter().map(PyDigestResult::from).collect(); + Ok(py_digest_results) + }, + Err(e) => Err(PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("Error processing FASTA file: {}", e))), + } +} + +#[pyclass] +#[pyo3(name="DigestResult")] +pub struct PyDigestResult { + #[pyo3(get,set)] + pub id: String, + #[pyo3(get,set)] + pub length: usize, + #[pyo3(get,set)] + pub sha512t24u: String, + #[pyo3(get,set)] + pub md5: String +} + +#[pymethods] +impl PyDigestResult { + fn __repr__(&self) -> String { + format!("<DigestResult for {}>", self.id) + } + + fn __str__(&self) -> PyResult<String> { + Ok(format!("DigestResult for sequence {}\n length: {}\n sha512t24u: {}\n md5: {}", self.id, self.length, self.sha512t24u, self.md5)) + } +} + +impl From<DigestResult> for PyDigestResult { + fn from(value: DigestResult) -> Self { + PyDigestResult { + id: value.id, + length: value.length, + sha512t24u: value.sha512t24u, + md5: value.md5 + } + } +} + +// This represents the Python module to be created +#[pymodule] +pub fn digests(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(sha512t24u_digest, m)?)?; + m.add_function(wrap_pyfunction!(md5_digest, m)?)?; + m.add_function(wrap_pyfunction!(digest_fasta, m)?)?; + m.add_class::<PyDigestResult>()?; + Ok(()) +} + diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index 207ab55b..52d0e790 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -5,6 +5,7 @@ mod ailist; mod models; mod tokenizers; mod utils; +mod digests; pub const VERSION: &str = env!("CARGO_PKG_VERSION"); @@ -14,11 +15,13 @@ fn gtars(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { let ailist_module = pyo3::wrap_pymodule!(ailist::ailist); let utils_module = pyo3::wrap_pymodule!(utils::utils); let models_module = pyo3::wrap_pymodule!(models::models); + let digests_module = pyo3::wrap_pymodule!(digests::digests); m.add_wrapped(tokenize_module)?; m.add_wrapped(ailist_module)?; m.add_wrapped(utils_module)?; m.add_wrapped(models_module)?; + m.add_wrapped(digests_module)?; let sys = PyModule::import_bound(py, "sys")?; let binding = sys.getattr("modules")?; @@ -29,6 +32,7 @@ fn gtars(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { sys_modules.set_item("gtars.ailist", m.getattr("ailist")?)?; sys_modules.set_item("gtars.utils", m.getattr("utils")?)?; sys_modules.set_item("gtars.models", m.getattr("models")?)?; + sys_modules.set_item("gtars.digests", m.getattr("digests")?)?; // add constants m.add("__version__", VERSION)?; diff --git a/bindings/r/DESCRIPTION b/bindings/r/DESCRIPTION index 9a777c52..8758bf36 100644 --- a/bindings/r/DESCRIPTION +++ b/bindings/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: gtars Title: Performance critical genomic interval analysis using Rust, in R -Version: 0.0.0.9000 +Version: 0.0.1 Authors@R: person("Nathan", "LeRoy", , "nleroy917@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-7354-7213")) diff --git a/bindings/r/R/igd.R b/bindings/r/R/igd.R index f9a7a869..b56cdf05 100644 --- 
a/bindings/r/R/igd.R +++ b/bindings/r/R/igd.R @@ -18,7 +18,7 @@ NULL #' @examples #' \dontrun{ #' # Create database with default name -#' igd_create("path/to/output", "path/to/bed/files") +#' r_igd_create("path/to/output", "path/to/bed/files") #' } #' #' @export @@ -49,6 +49,8 @@ r_igd_create <- function(output_path, filelist, db_name = "igd_database") { #' #' @examples #' \dontrun{ +#' # Search database with default name +#' r_igd_search("path/to/database", "path/to/query/file") #' } #' #' @export diff --git a/bindings/r/README.md b/bindings/r/README.md new file mode 100644 index 00000000..95550e4e --- /dev/null +++ b/bindings/r/README.md @@ -0,0 +1,18 @@ +# gtars + +This is an R package that wraps the `gtars` Rust crate so you can call gtars code from R. + +## Brief instructions + +To install the development version, you'll have to build it locally. Build R bindings like this: + +```console +cd bindings +R CMD build r +``` + +Then install the package that was just built: + +```console +R CMD INSTALL gtars_0.0.1.tar.gz +``` \ No newline at end of file diff --git a/bindings/r/man/r_igd_create.Rd b/bindings/r/man/r_igd_create.Rd index 377324e6..2878ca7c 100644 --- a/bindings/r/man/r_igd_create.Rd +++ b/bindings/r/man/r_igd_create.Rd @@ -28,7 +28,7 @@ Creates an IGD (Indexed Genomic Data) database from a collection of BED files. \examples{ \dontrun{ # Create database with default name -igd_create("path/to/output", "path/to/bed/files") +r_igd_create("path/to/output", "path/to/bed/files") } } diff --git a/bindings/r/man/r_igd_search.Rd b/bindings/r/man/r_igd_search.Rd index 5dd5dc1b..c017b141 100644 --- a/bindings/r/man/r_igd_search.Rd +++ b/bindings/r/man/r_igd_search.Rd @@ -19,6 +19,8 @@ Searches an IGD database for region overlaps with an input BED file } \examples{ \dontrun{ +# Search database with default name +r_igd_search("path/to/database", "path/to/query/file") } } diff --git a/bindings/r/src/rust/Cargo.toml b/bindings/r/src/rust/Cargo.toml index 78db82a6..4ed6d070 100644 --- a/bindings/r/src/rust/Cargo.toml +++ b/bindings/r/src/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = 'gtars-r' -version = '0.1.0' +version = '0.2.0' edition = '2021' [lib] diff --git a/bindings/r/tests/set_A.bed b/bindings/r/tests/set_A.bed deleted file mode 100644 index 667474af..00000000 --- a/bindings/r/tests/set_A.bed +++ /dev/null @@ -1,7 +0,0 @@ -chr1 0 3 . 0 . -chr1 3 6 . 0 . -chr1 7 10 . 0 . -chr1 11 14 . 0 . -chr1 14 17 . 0 . -chr1 19 22 . 0 . -chr1 24 27 . 0 . diff --git a/bindings/r/tests/set_AA.bed b/bindings/r/tests/set_AA.bed deleted file mode 100644 index 9b4dd815..00000000 --- a/bindings/r/tests/set_AA.bed +++ /dev/null @@ -1,3 +0,0 @@ -chr1 1 3 . 0 . -chr1 3 6 . 0 . -chr1 7 10 . 0 . 
diff --git a/bindings/r/tests/test.R b/bindings/r/tests/test.R deleted file mode 100644 index a921118b..00000000 --- a/bindings/r/tests/test.R +++ /dev/null @@ -1,66 +0,0 @@ -# library(GenomicRanges) -# library(rtracklayer) - -# # First create our GRanges objects -# set_A <- GRanges( -# seqnames = "chr1", -# ranges = IRanges( -# start = c(1, 4, 8, 12, 15, 20, 25), -# end = c(3, 6, 10, 14, 17, 22, 27) -# ) -# ) - -# set_AA <- GRanges( -# seqnames = "chr1", -# ranges = IRanges( -# start = c(2, 4, 8), -# end = c(3, 6, 10) -# ) -# ) - - -# set_B <- GRangesList( -# group1 = GRanges( -# seqnames = "chr1", -# ranges = IRanges( -# start = c(2, 7, 12, 16, 21), -# end = c(4, 9, 15, 18, 23) -# ) -# ), -# group2 = GRanges( -# seqnames = "chr1", -# ranges = IRanges( -# start = c(5, 11, 16, 19, 24), -# end = c(7, 13, 18, 21, 26) -# ) -# ), -# group3 = GRanges( -# seqnames = "chr1", -# ranges = IRanges( -# start = c(3, 8, 13, 17, 22), -# end = c(5, 10, 15, 19, 24) -# ) -# ) -# ) - - -# export(set_A, '/Users/sam/Documents/Work/gtars/bindings/r/tests/set_A.bed', format="BED") -# export(set_AA, '/Users/sam/Documents/Work/gtars/bindings/r/tests/set_AA.bed', format="BED" ) - -# # rextendr::document() - -# gtars_create <- gtars::r_igd_create('/Users/sam/Documents/Work/episcope/.test/igd/', '/Users/sam/Documents/Work/episcope/.test/test_paths.txt') -# gtars_count <- gtars::r_igd_search(database_path = '/Users/sam/Documents/Work/episcope/.test/igd/igd_database.igd', query_path = '/Users/sam/Documents/Work/episcope/.test/set_A.bed') - -# userSets_beds <- c('/Users/sam/Documents/Work/episcope/.test/set_A.bed', '/Users/sam/Documents/Work/episcope/.test/set_AA.bed') -# db_path <- '/Users/sam/Documents/Work/episcope/.test/igd/igd_database.igd' - - -# ## test lapply -# r_igd_search_rev <- function(query_path = query_path, database_path = database_path) { -# gtars::r_igd_search(database_path = database_path, query_path = query_path) -# } - -# geneSetDatabaseOverlaps <- lapply(userSets_beds, r_igd_search_rev, db_path) -# geneSetDatabaseOverlapsHits <- lapply(geneSetDatabaseOverlaps, function(x) as.numeric(as.character(x[," number of hits"]))) - \ No newline at end of file diff --git a/gtars/Cargo.toml b/gtars/Cargo.toml index faca48fc..c114fba1 100644 --- a/gtars/Cargo.toml +++ b/gtars/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gtars" -version = "0.1.1" +version = "0.2.0" edition = "2021" description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package." license = "MIT" @@ -28,10 +28,14 @@ bigtools = "0.5.4" tokio = "1.40.0" os_pipe = "1.2.1" glob = "0.3.1" - +base64-url = "2.0.0" +sha2 = "0.10.7" +md-5 = "0.10.5" +seq_io = "0.3.2" +serde_json = "1.0.135" [dev-dependencies] -rstest = "0.18.2" +rstest = "0.23.0" tempfile = "3.8.1" pretty_assertions = "1.4.0" diff --git a/gtars/docs/changelog.md b/gtars/docs/changelog.md index cd94a894..4749db0d 100644 --- a/gtars/docs/changelog.md +++ b/gtars/docs/changelog.md @@ -4,6 +4,30 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.2.0] +- add position shift workflow for bam to bw (was previously added for bam to bed) +- add scaling argument for bam to bw workflow [#53](https://github.com/databio/gtars/issues/53) +- fix accumulation issue for bam workflow [#56](https://github.com/databio/gtars/issues/56) +- fix wiggle file (core) beginning at 0 [#43](https://github.com/databio/gtars/issues/43) +- fix npy file (end) using start instead of end [#61](https://github.com/databio/gtars/issues/61) +- force zoom to 1 for bed/narrowPeak to bw [#34](https://github.com/databio/gtars/issues/34) +- fix IGD overlap issue [#45](https://github.com/databio/gtars/issues/45) +- add ga4gh refget digest functionality [#58](https://github.com/databio/gtars/issues/58) +- fix wig and npy inconsistency [#64](https://github.com/databio/gtars/issues/64) +- fix narrowPeak to bw zoom [#34](https://github.com/databio/gtars/issues/34) +- fix bed to bw fileheader consistency issue [#52](https://github.com/databio/gtars/issues/52) +- change npy metadata file structure [#65](https://github.com/databio/gtars/issues/65) + +## [0.1.2] +- add position shift workflow for `bam` to `bw` (was previously added for `bam` to `bed`) +- add scaling argument for `bam` to `bw` workflow [#53](https://github.com/databio/gtars/issues/53) +- fix accumulation issue for `bam` workflow [#56](https://github.com/databio/gtars/issues/56) +- fix wiggle file (core) beginning at 0 [#43](https://github.com/databio/gtars/issues/43) +- fix npy file (end) using start instead of end [#61](https://github.com/databio/gtars/issues/61) +- force zoom to 1 for bed/narrowPeak to bw [#34](https://github.com/databio/gtars/issues/34) +- fix IGD overlap issue [#45](https://github.com/databio/gtars/issues/45) +- add ga4gh refget digest functionality [#58](https://github.com/databio/gtars/pull/58) ## [0.1.1] - hot fix for broken python bindings; remove IGD from the python bindings for now diff --git a/gtars/src/common/utils.rs b/gtars/src/common/utils.rs index 93b8c837..4a4bec09 100644 --- a/gtars/src/common/utils.rs +++ b/gtars/src/common/utils.rs @@ -6,16 +6,17 @@ use std::io::{BufRead, BufReader}; use std::path::Path; use anyhow::{Context, Result}; -use flate2::read::GzDecoder; +use flate2::read::MultiGzDecoder; use rust_lapper::{Interval, Lapper}; use crate::common::models::region::Region; use crate::common::models::universe::Universe; /// -/// Function to return a reader for either a gzip'd or non-gzip'd file. +/// Get a reader for either a gzip'd or non-gzip'd file. /// /// # Arguments +/// /// - path: path to the file to read /// pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> { @@ -23,7 +24,7 @@ pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> { let file = File::open(path).with_context(|| "Failed to open bed file.")?; let file: Box<dyn Read> = match is_gzipped { - true => Box::new(GzDecoder::new(file)), + true => Box::new(MultiGzDecoder::new(file)), false => Box::new(file), }; @@ -32,6 +33,24 @@ pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> { Ok(reader) } +/// Get a reader for a gzipped file, a non-gzipped file, or stdin +/// +/// # Arguments +/// +/// - file_path: path to the file to read, or '-' for stdin +/// +/// # Returns +/// +/// A `BufReader` object for a given file path or stdin.
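+///
+/// # Examples
+///
+/// An illustrative sketch (the path is hypothetical; pass "-" to read from
+/// stdin):
+///
+/// ```ignore
+/// use gtars::common::utils::get_dynamic_reader_w_stdin;
+///
+/// let from_file = get_dynamic_reader_w_stdin("regions.bed.gz")?;
+/// let from_stdin = get_dynamic_reader_w_stdin("-")?;
+/// ```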
+pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<dyn Read>>> { + if file_path_str == "-" { + Ok(BufReader::new(Box::new(std::io::stdin()) as Box<dyn Read>)) + } else { + let file_path = Path::new(file_path_str); + return get_dynamic_reader(&file_path); + } +} + /// /// Create a region-to-id hash-map from a list of regions /// diff --git a/gtars/src/digests/mod.rs b/gtars/src/digests/mod.rs new file mode 100644 index 00000000..2b68aed4 --- /dev/null +++ b/gtars/src/digests/mod.rs @@ -0,0 +1,178 @@ +//! # Fast digest computations for genomic sequences +//! +//! This module provides functions for computing digests of strings. +//! +//! # Functions +//! +//! The following functions are available: +//! +//! * `sha512t24u` - Processes a given string to compute its GA4GH sha512t24u checksum. +//! * `md5` - Processes a given string to compute its MD5 checksum. +//! * `digest_fasta` - Processes a FASTA file to compute the digests of each sequence in the file. +//! +//! # Usage +//! +//! The `sha512t24u` function can be used to compute the GA4GH sha512t24u checksum of a string. +//! +//! ```rust +//! use gtars::digests::sha512t24u; +//! +//! let digest = sha512t24u("hello world"); +//! ``` +use std::fs::File; +use std::io; +use std::io::prelude::{Read, Write}; +use std::path::Path; + +use anyhow::Result; +use md5::Md5; +use seq_io::fasta::{Reader, Record, RefRecord}; +use sha2::{Digest, Sha512}; + +use crate::common::utils::get_dynamic_reader; + +/// A struct representing the digest of a given string. +#[derive(Debug)] +pub struct DigestResult { + pub id: String, + pub length: usize, + pub sha512t24u: String, + pub md5: String, +} + +/// Processes a given string to compute its GA4GH sha512t24u digest. +/// +/// # Arguments +/// +/// * `string` - The input string to be processed. +/// +/// # Returns +/// +/// The sha512t24u digest of the input string: the first 24 bytes of its SHA-512 hash, base64url-encoded. +pub fn sha512t24u(string: &str) -> String { + let mut sha512_hasher_box = Box::new(Sha512::new()); + for s in string.as_bytes().chunks(800) { + sha512_hasher_box.as_mut().update(s); + } + base64_url::encode(&sha512_hasher_box.as_mut().finalize_reset()[0..24]) +} + +/// Process a string to compute its md5 digest +/// +/// # Arguments +/// +/// * `string` - The input string to be processed. +/// +/// # Returns +/// +/// A string MD5 digest of the input string. +pub fn md5(string: &str) -> String { + let mut hasher = Md5::new(); + for s in string.as_bytes().chunks(800) { + hasher.update(s); + } + let result = hasher.finalize(); + format!("{:x}", result) +} + +/// Processes a FASTA file to compute the digests of each sequence in the file. +/// +/// This function reads a FASTA file, computes the SHA-512 and MD5 digests for each sequence, +/// and returns a vector of `DigestResult` structs containing the results. It can also handle +/// gzipped FASTA files (ending in `.gz`). +/// +/// # Arguments +/// +/// * `file_path` - A string slice that holds the path to the FASTA file to be processed. +/// +/// # Returns +/// +/// A vector of `DigestResult` structs, each containing the ID, length, sha512t24u digest, and MD5 digest +/// of a sequence in the FASTA file. +/// +/// # Panics +/// +/// This function returns an error if the file cannot be opened, and panics if a record in the file cannot be read or parsed.
+/// +/// # Examples +/// +/// +pub fn digest_fasta(file_path: &str) -> Result<Vec<DigestResult>> { + let path = Path::new(&file_path); + let file_reader = get_dynamic_reader(&path)?; + let mut fasta_reader = Reader::new(file_reader); + let mut results = Vec::new(); + while let Some(record) = fasta_reader.next() { + // returns a RefRecord object + let record = record.expect("Error found when retrieving next record."); + let id = record.id().expect("No ID found for the FASTA record"); + let mut sha512_hasher = Sha512::new(); + let mut md5_hasher = Md5::new(); + let mut length = 0; + // let result = process_sequence(record, verbose); + for seq_line in record.seq_lines() { + // let seq_line = seq_line.expect("Error found when retrieving next sequence line."); + sha512_hasher.update(seq_line.to_ascii_uppercase()); + md5_hasher.update(seq_line.to_ascii_uppercase()); + length += seq_line.len(); + } + // let result = sha512_hasher.finalize(); + let sha512 = base64_url::encode(&sha512_hasher.finalize_reset()[0..24]); + let md5 = format!("{:x}", md5_hasher.finalize_reset()); + results.push(DigestResult { + id: id.to_string(), + length: length, + sha512t24u: sha512, + md5: md5, + }); + } + Ok(results) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_sha512t24u() { + let digest = sha512t24u("hello world"); + assert_eq!(digest, "MJ7MSJwS1utMxA9QyQLytNDtd-5RGnx6"); + } + + #[test] + fn test_md5() { + let digest = md5("hello world"); + assert_eq!(digest, "5eb63bbbe01eeed093cb22bb8f5acdc3"); + } + + #[test] + fn test_digest_fasta() { + let results = digest_fasta("tests/data/base.fa").expect("Can't open test fasta file"); + println!("{:?}", results); + assert_eq!(results.len(), 3); + assert_eq!(results[0].length, 8); + assert_eq!(results[0].sha512t24u, "iYtREV555dUFKg2_agSJW6suquUyPpMw"); + assert_eq!(results[0].md5, "5f63cfaa3ef61f88c9635fb9d18ec945"); + assert_eq!(results[1].length, 4); + assert_eq!(results[1].sha512t24u, "YBbVX0dLKG1ieEDCiMmkrTZFt_Z5Vdaj"); + assert_eq!(results[1].md5, "31fc6ca291a32fb9df82b85e5f077e31"); + assert_eq!(results[2].length, 4); + assert_eq!(results[2].sha512t24u, "AcLxtBuKEPk_7PGE_H4dGElwZHCujwH6"); + assert_eq!(results[2].md5, "92c6a56c9e9459d8a42b96f7884710bc"); + } + + #[test] + fn test_digest_gzipped_fasta() { + let results = digest_fasta("tests/data/base.fa.gz").expect("Can't open test fasta file"); + println!("{:?}", results); + assert_eq!(results[0].length, 8); + assert_eq!(results[0].sha512t24u, "iYtREV555dUFKg2_agSJW6suquUyPpMw"); + assert_eq!(results[0].md5, "5f63cfaa3ef61f88c9635fb9d18ec945"); + } + + #[test] + fn bogus_fasta_file() { + let result = digest_fasta("tests/data/bogus.fa"); + assert!(result.is_err(), "Expected an error for a bogus fasta file"); + } +} diff --git a/gtars/src/fragsplit/map.rs b/gtars/src/fragsplit/map.rs index aebe0805..c5a92a87 100644 --- a/gtars/src/fragsplit/map.rs +++ b/gtars/src/fragsplit/map.rs @@ -57,7 +57,6 @@ impl BarcodeToClusterMap { } if let (Some(barcode), Some(cluster_id)) = (barcode, cluster_id) { - map.insert(barcode.to_string(), cluster_id.to_string()); if !cluster_labels.contains(cluster_id) { cluster_labels.insert(cluster_id.to_string()); diff --git a/gtars/src/igd/create.rs b/gtars/src/igd/create.rs index 1bceea3e..eea1cf3e 100644 --- a/gtars/src/igd/create.rs +++ b/gtars/src/igd/create.rs @@ -69,6 +69,9 @@ pub struct igd_t { pub mctg: i32, //data type: 0, 1, 2 etc; size differs pub total: i64, // total region in each ctg pub ctg: Vec<ctg_t>, // this is the list of contigs (of size n-ctg) // 
this might need to be a reference + pub total_regions: i32, + pub total_average: f32, + pub average_length: f32, } impl igd_t { @@ -100,11 +103,11 @@ pub fn igd_get_create_matches(matches: &ArgMatches) { .get_one::<String>("dbname") .expect("File list path is required"); - create_igd_f(output_path, filelist, db_output_name); + let _igd = create_igd_f(output_path, filelist, db_output_name); } /// Creates IGD database from a directory of bed files. -pub fn create_igd_f(output_path: &String, filelist: &String, db_output_name: &String) { +pub fn create_igd_f(output_path: &String, filelist: &String, db_output_name: &String) -> igd_t { //println!("{}",db_output_name); //Initialize IGD into Memory let mut igd = igd_t::new(); @@ -373,14 +376,19 @@ pub fn create_igd_f(output_path: &String, filelist: &String, db_output_name: &St // Sort tile data and save into single files per ctg igd_save_db(&mut igd, output_path, db_output_name); + igd.total_regions = total_regions; + igd.total_average = total_avg_size; + igd.average_length = total_avg_size / total_regions as f32; + let save_path = format!("{}{}{}", output_path, db_output_name, ".igd"); println!("IGD saved to: {}", save_path); println!( "Total Intervals: {}, l_avg: {}", - total_regions, - total_avg_size / total_regions as f32 + igd.total_regions, igd.average_length ); println!("nctg:{} nbp:{}", igd.nctg, igd.nbp); + + igd // return for testing purposes } /// Saves the primary .igd database file by reading the temp_tiles, sorting them, and then writing the sorted tiles to disk. @@ -560,7 +568,7 @@ pub fn igd_save_db(igd: &mut igd_t, output_path: &String, db_output_name: &Strin let _ = main_db_file.write_all(&temp_buffer); } - q.nCnts = 0; + //q.nCnts = 0; } } @@ -631,7 +639,7 @@ pub fn igd_saveT(igd: &mut igd_t, output_file_path: &String) { } file.write_all(&buffer).unwrap(); - current_tile.nCnts = current_tile.ncnts + 1; + current_tile.nCnts = current_tile.nCnts + current_tile.ncnts; if current_tile.ncnts > 8 { current_tile.mcnts = 8; @@ -811,6 +819,7 @@ pub fn igd_add( gdata.start = start; gdata.end = end; gdata.value = v; + //println!("Adding to igd, start {}, idx {}", start,idx); gdata.idx = idx as i32; igd.total += 1; diff --git a/gtars/src/igd/search.rs b/gtars/src/igd/search.rs index 7426706b..0090c9e2 100644 --- a/gtars/src/igd/search.rs +++ b/gtars/src/igd/search.rs @@ -143,8 +143,10 @@ pub fn igd_search(database_path: &String, query_file_path: &String) -> Result<Ve "{}\t{}\t{}\t{}", i, IGD.file_info[i].nr, hit, IGD.file_info[i].fileName ); - let format_string = format!("{}\t{}\t{}\t{}", - i, IGD.file_info[i].nr, hit, IGD.file_info[i].fileName); + let format_string = format!( + "{}\t{}\t{}\t{}", + i, IGD.file_info[i].nr, hit, IGD.file_info[i].fileName + ); final_string_vec.push(format_string); } total += hit; @@ -281,11 +283,7 @@ fn get_overlaps( } // Min between n2 and mTile - n2 = if n2 < mTile { - n2 - } else { - mTile - }; + n2 = if n2 < mTile { n2 } else { mTile }; tmpi = IGD.nCnt[ichr as usize][n1 as usize]; tmpi1 = tmpi - 1; @@ -296,89 +294,89 @@ fn get_overlaps( // ); if tmpi > 0 { - if n1 != *preIdx || ichr != *preChr { - // println!( - // "n1 != *preIdx || ichr!= *preChr {} vs {} {} vs {} \n", - // n1, preIdx, ichr, preChr - // ); + // println!( + // "n1 != *preIdx || ichr!= *preChr {} vs {} {} vs {} \n", + // n1, preIdx, ichr, preChr + // ); + + //println!("Seek start here: {}",IGD.tIdx[ichr as usize][n1 as usize]); + //let ichr = 1; + db_reader + .seek(SeekFrom::Start(IGD.tIdx[ichr as usize][n1 as usize] as u64)) + .unwrap(); + 
+ let mut gData: Vec<gdata_t> = Vec::new(); + for j in 0..tmpi { + gData.push(gdata_t::default()) + } + //let mut gData: Vec<gdata_t> = Vec::with_capacity(tmpi as usize); - //println!("Seek start here: {}",IGD.tIdx[ichr as usize][n1 as usize]); + for i in 0..tmpi { + let mut buf = [0u8; 16]; - db_reader - .seek(SeekFrom::Start(IGD.tIdx[ichr as usize][n1 as usize] as u64)) - .unwrap(); + let n = db_reader.read(&mut buf).unwrap(); - let mut gData: Vec<gdata_t> = Vec::new(); - for j in 0..tmpi { - gData.push(gdata_t::default()) + if n == 0 { + //println!("Breaking loop while reading tempfile"); + break; + } else if n != 16 { + //panic!("Cannot read temp file."); + break; } - //let mut gData: Vec<gdata_t> = Vec::with_capacity(tmpi as usize); - - for i in 0..tmpi { - let mut buf = [0u8; 16]; - let n = db_reader.read(&mut buf).unwrap(); + let mut rdr = &buf[..] as &[u8]; + let idx = rdr.read_i32::<LittleEndian>().unwrap(); + let start = rdr.read_i32::<LittleEndian>().unwrap(); + let end = rdr.read_i32::<LittleEndian>().unwrap(); + let value = rdr.read_i32::<LittleEndian>().unwrap(); + + //println!("for tmpi>0 where tmpi = {}", tmpi); + //println!("Looping through g_datat in temp files\n"); + //println!("idx: {} start: {} end: {}\n", idx,start,end); + + gData[i as usize] = gdata_t { + idx: idx, + start, + end, + value, + }; + + *preIdx = n1; + *preChr = ichr; + } - if n == 0 { - //println!("Breaking loop while reading tempfile"); - break; - } else if n != 16 { - //panic!("Cannot read temp file."); - break; + // check this code block. original code has outside this first check but that would potentially cause access to wrong + // object in memory if it was not de-allocated? + + if query_end > gData[0].start { + // sorted by start + //println!("n1 != *preIdx || ichr != *preChr query_end > gData[0].start: {} > {}", query_end,gData[0].start); + // find the 1st rs<qe + tL = 0; + tR = tmpi1; + + while tL < tR - 1 { + tM = (tL + tR) / 2; //result: tR=tL+1, tL.s<qe + //println!("What is tM? {}", tM); + if gData[tM as usize].start < query_end { + tL = tM; //right side + } else { + tR = tM; //left side } - - let mut rdr = &buf[..] as &[u8]; - let idx = rdr.read_i32::<LittleEndian>().unwrap(); - let start = rdr.read_i32::<LittleEndian>().unwrap(); - let end = rdr.read_i32::<LittleEndian>().unwrap(); - let value = rdr.read_i32::<LittleEndian>().unwrap(); - - //println!("Looping through g_datat in temp files\n"); - // println!("idx: {} start: {} end: {}\n", idx,start,end); - - gData[i as usize] = gdata_t { - idx: idx, - start, - end, - value, - }; - - *preIdx = n1; - *preChr = ichr; } - - // check this code block. original code has outside this first check but that would potentially cause access to wrong - // object in memory if it was not de-allocated? - - if query_end > gData[0].start { - // sorted by start - //println!("query_end > gData[0].start: {} > {}", query_end,gData[0].start); - // find the 1st rs<qe - tL = 0; - tR = tmpi1; - - while tL < tR - 1 { - tM = (tL + tR) / 2; //result: tR=tL+1, tL.s<qe - //println!("What is tM? 
{}", tM); - if gData[tM as usize].start < query_end { - tL = tM; //right side - } else { - tR = tM; //left side - } - } - if gData[tR as usize].start < query_end { - tL = tR; - } - //-------------------------- - for i in (0..=tL).rev() { - // count down from tL (inclusive to tL) - //println!("iterate over i: {} ", i); - //println!("gdata[i].end {} vs query start {}",gData[i as usize].end,query_start); - if gData[i as usize].end > query_start { - //println!(" > gData[i].end > query_start {} > {}", gData[i as usize].end, query_start); - hits[gData[i as usize].idx as usize] = - hits[gData[i as usize].idx as usize] + 1; - } + if gData[tR as usize].start < query_end { + tL = tR; + } + //-------------------------- + for i in (0..=tL).rev() { + //println!("Countdownfrom TL"); + // count down from tL (inclusive to tL) + //println!("iterate over i: {} from tL {}", i, tL); + //println!("gdata[i].end {} vs query start {}",gData[i as usize].end,query_start); + if gData[i as usize].end > query_start { + //println!("ADDING TO HITS"); + //println!(" > gData[i].end > query_start {} > {}", gData[i as usize].end, query_start); + hits[gData[i as usize].idx as usize] = hits[gData[i as usize].idx as usize] + 1; } } } @@ -424,7 +422,7 @@ fn get_overlaps( let value = rdr.read_i32::<LittleEndian>().unwrap(); //println!("Looping through g_datat in temp files\n"); - //println!("idx: {} start: {} end: {}\n", idx,start,end); + // println!("idx: {} start: {} end: {}\n", idx,start,end); gData.push(gdata_t { idx: idx, @@ -439,6 +437,7 @@ fn get_overlaps( } if query_end > gData[0].start { + //println!("n2>n1 query_end > gData[0].start: {} > {}", query_end,gData[0].start); tS = 0; while tS < tmpi && gData[tS as usize].start < bd { @@ -478,6 +477,7 @@ fn get_overlaps( } } } + //println!("here are the hits {:?}", hits); return nols; //TODO this is from the original code but its not actually being used for anything. hits vec IS the main thing. } @@ -567,7 +567,7 @@ pub fn get_igd_info( reader.read_exact(&mut buffer)?; let nCtg = i32::from_le_bytes(buffer); - //println!("Found:\n nbp:{} gtype: {} nCtg: {}", nbp,gType,nCtg); + println!("Found:\n nbp:{} gtype: {} nCtg: {}", nbp, gType, nCtg); igd.nbp = nbp; igd.gType = gType; diff --git a/gtars/src/lib.rs b/gtars/src/lib.rs index f7bb97fc..822a4d8c 100644 --- a/gtars/src/lib.rs +++ b/gtars/src/lib.rs @@ -35,6 +35,7 @@ //! 
``` pub mod ailist; pub mod common; +pub mod digests; pub mod fragsplit; pub mod igd; pub mod io; diff --git a/gtars/src/scoring/cli.rs b/gtars/src/scoring/cli.rs index 3b620b4e..fef43113 100644 --- a/gtars/src/scoring/cli.rs +++ b/gtars/src/scoring/cli.rs @@ -41,9 +41,9 @@ pub mod handlers { let supplied_mode = ScoringMode::from_str(mode); match supplied_mode { Ok(mode) => mode, - Err(_err) => anyhow::bail!("Unknown scoring mode supplied: {}", mode) + Err(_err) => anyhow::bail!("Unknown scoring mode supplied: {}", mode), } - }, + } None => DEFAULT_SCORING_MODE, }; @@ -52,11 +52,7 @@ pub mod handlers { let consensus = PathBuf::from(consensus); let consensus = ConsensusSet::new(consensus)?; - let count_mat = region_scoring_from_fragments( - &mut fragments, - &consensus, - mode, - )?; + let count_mat = region_scoring_from_fragments(&mut fragments, &consensus, mode)?; count_mat.write_to_file(output)?; diff --git a/gtars/src/scoring/files.rs b/gtars/src/scoring/files.rs index 9db20343..a9120a06 100644 --- a/gtars/src/scoring/files.rs +++ b/gtars/src/scoring/files.rs @@ -126,5 +126,4 @@ impl FindOverlaps for ConsensusSet { Some(olaps) } } - } diff --git a/gtars/src/scoring/fragment_scoring.rs b/gtars/src/scoring/fragment_scoring.rs index 05333380..900b83a5 100644 --- a/gtars/src/scoring/fragment_scoring.rs +++ b/gtars/src/scoring/fragment_scoring.rs @@ -18,7 +18,6 @@ pub fn region_scoring_from_fragments( consensus: &ConsensusSet, scoring_mode: ScoringMode, ) -> Result<CountMatrix<u32>> { - let rows = fragments.len(); let cols = consensus.len(); @@ -116,7 +115,6 @@ mod tests { use super::*; use pretty_assertions::assert_eq; use rstest::*; - #[fixture] fn path_to_fragment_files() -> &'static str { @@ -132,12 +130,12 @@ mod tests { fn output_file() -> &'static str { "tests/data/out/region_scoring_count.csv.gz" } - + #[rstest] fn test_region_scoring_from_fragments_atac( path_to_fragment_files: &str, consensus_set: &str, - output_file: &str + output_file: &str, ) { let mut fragments = FragmentFileGlob::new(path_to_fragment_files).unwrap(); let consensus = ConsensusSet::new(consensus_set.into()).unwrap(); @@ -165,6 +163,5 @@ mod tests { let res = count_mat.write_to_file(output_file); assert_eq!(res.is_ok(), true); - } } diff --git a/gtars/src/scoring/mod.rs b/gtars/src/scoring/mod.rs index 10b15e0b..6497a108 100644 --- a/gtars/src/scoring/mod.rs +++ b/gtars/src/scoring/mod.rs @@ -9,4 +9,4 @@ pub mod scoring_modes; pub use counts::*; pub use files::*; pub use fragment_scoring::*; -pub use scoring_modes::*; \ No newline at end of file +pub use scoring_modes::*; diff --git a/gtars/src/uniwig/README.md b/gtars/src/uniwig/README.md index 663fceea..9c67091d 100644 --- a/gtars/src/uniwig/README.md +++ b/gtars/src/uniwig/README.md @@ -51,14 +51,14 @@ Options: -s, --stepsize <stepsize> Integer value for stepsize -l, --fileheader <fileheader> Name of the file -y, --outputtype <outputtype> Output as wiggle or npy - -u, --counttype <counttype> Select to only output start, end, or core. Defaults to all. [default: all] + -u, --counttype <counttype> Select to only output start, end, or core. Select `shift` for bam workflows. Defaults to all. [default: all] -p, --threads <threads> Number of rayon threads to use for parallel processing [default: 6] -o, --score Count via score (narrowPeak only!) - -z, --zoom <zoom> Number of zoom levels (for bw file output only [default: 5] + -a, --no-bamshift Set bam shift to False, i.e. uniwig will count raw reads without considering read direction. 
+ -z, --zoom <zoom> Number of zoom levels (for bw file output only) [default: 1] -d, --debug Print more verbose debug messages? -h, --help Print help - ``` ### Processing bam files to bw diff --git a/gtars/src/uniwig/cli.rs b/gtars/src/uniwig/cli.rs index ab00d889..e21ca7e9 100644 --- a/gtars/src/uniwig/cli.rs +++ b/gtars/src/uniwig/cli.rs @@ -47,6 +47,15 @@ pub fn create_uniwig_cli() -> Command { .help("Integer value for stepsize") .required(true), ) + .arg( + Arg::new("bamscale") + .long("bamscale") + .short('e') + .default_value("1.0") + .value_parser(clap::value_parser!(f32)) + .help("Float for scaling bam read values, default is 1.0") + .required(false), + ) .arg( Arg::new("fileheader") .long("fileheader") @@ -66,7 +75,7 @@ .long("counttype") .short('u') .default_value("all") - .help("Select to only output start, end, or core. Defaults to all.") + .help("Select to only output start, end, or core. Select `shift` for bam workflows. Defaults to all.") .required(false), ) .arg( @@ -85,11 +94,18 @@ .help("Count via score (narrowPeak only!)") .action(ArgAction::SetTrue), ) + .arg( + Arg::new("no-bamshift") + .long("no-bamshift") + .short('a') + .help("Set bam shift to False, i.e. uniwig will count raw reads without considering read direction.") + .action(ArgAction::SetFalse), + ) .arg( Arg::new("zoom") .long("zoom") .short('z') - .default_value("5") + .default_value("1") .value_parser(clap::value_parser!(i32)) .help("Number of zoom levels (for bw file output only)") .required(false), diff --git a/gtars/src/uniwig/counting.rs b/gtars/src/uniwig/counting.rs index 7d322277..4b3415d8 100644 --- a/gtars/src/uniwig/counting.rs +++ b/gtars/src/uniwig/counting.rs @@ -7,6 +7,7 @@ use std::fs::{create_dir_all, OpenOptions}; use std::io; use std::io::{BufWriter, Write}; +use noodles::sam::alignment::record::Flags; use std::sync::{Arc, Mutex}; #[derive(Debug)] @@ -34,8 +35,6 @@ pub fn start_end_counts( smoothsize: i32, stepsize: i32, ) -> (Vec<u32>, Vec<i32>) { - //let vin_iter = starts_vector.iter(); - let mut v_coordinate_positions: Vec<i32> = Vec::new(); // these are the final coordinates after any adjustments let mut v_coord_counts: Vec<u32> = Vec::new(); // u8 stores 0:255 This may be insufficient.
u16 max is 65535 @@ -181,6 +180,8 @@ pub fn core_counts( current_start_site = starts_vector[0]; // get first coordinate position current_end_site = ends_vector[0]; + current_start_site.0 = current_start_site.0; + if current_start_site.0 < 1 { current_start_site.0 = 1; } @@ -196,6 +197,8 @@ pub fn core_counts( current_start_site = coordinate_value; + current_start_site.0 = current_start_site.0; + let current_score = current_start_site.1; count += current_score; @@ -1160,15 +1163,113 @@ pub fn bam_to_bed_no_counts( let mut writer = BufWriter::new(&mut *write_lock); // TODO Use PEEK INSTEAD + // let first_record_option = records.next(); + + // let _first_record = match first_record_option { + // Some(Ok(record)) => record, // Extract the record + // Some(Err(err)) => { + // // Handle the error + // eprintln!( + // "Error reading the first record for core chrom: {} {:?} Skipping...", + // chromosome_name, err + // ); + // writer.write_all(b"\n").unwrap(); + // writer.flush().unwrap(); + // drop(writer); + // return Err(BAMRecordError::NoFirstRecord); // Example error handling + // } + // None => { + // // Handle no records + // eprintln!("No records for core chrom: {} Skipping...", chromosome_name); + // writer.write_all(b"\n").unwrap(); + // writer.flush().unwrap(); + // drop(writer); + // return Err(BAMRecordError::NoFirstRecord); + // } + // }; + + // let mut current_start_site = first_record.alignment_start().unwrap().unwrap().get() as i32; + // let mut current_end_site = first_record.alignment_end().unwrap().unwrap().get() as i32; + + for coord in records { + let unwrapped_coord = coord.unwrap().clone(); + + let strand = match unwrapped_coord.flags().is_reverse_complemented() { + true => "-", + false => "+", + }; + + //println!("processing records bam to bed"); + + let flags = unwrapped_coord.flags(); + + //let shifted_pos: i32; + + let start_site = unwrapped_coord.alignment_start().unwrap().unwrap().get() as i32; + + let end_site = unwrapped_coord.alignment_end().unwrap().unwrap().get() as i32; + + // we must shift the start position by -1 to convert bam/sam 1 based position to bed 0 based pos + let shifted_pos = get_shifted_pos(&flags, start_site - 1, end_site); + + // Relevant comment from original bamSitesToWig.py: + // The bed file needs 6 columns (even though some are dummy) + // because MACS says so. + let single_line = format!( + "{}\t{}\t{}\t{}\t{}\t{}\n", + chromosome_name, + shifted_pos - smoothsize, + shifted_pos + smoothsize, + "N", + "0", + strand, + ); + + //eprintln!("here is shifted with smoothing: {} {}", shifted_pos - smoothsize, shifted_pos + smoothsize); + + writer.write_all(single_line.as_bytes())?; + writer.flush()?; + } + + drop(writer); + + Ok(()) +} + +pub fn variable_shifted_bam_to_bw( + records: &mut Box<Query<noodles::bgzf::reader::Reader<std::fs::File>>>, + chrom_size: i32, + smoothsize: i32, + stepsize: i32, + chromosome_name: &String, + out_sel: &str, + write_fd: Arc<Mutex<PipeWriter>>, + bam_scale: f32, +) -> Result<(), BAMRecordError> { + let mut write_lock = write_fd.lock().unwrap(); // Acquire lock for writing + let mut writer = BufWriter::new(&mut *write_lock); + + let mut coordinate_position = 0; + + let mut prev_count: f32 = 0.0; + let mut count: f32 = 0.0; + + let mut prev_coordinate_value = 0; + + let mut current_end_site: i32; + let mut bg_prev_coord: i32 = 0; // keep track of which coordinate had a switch in count. 
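+
+    // Explanatory sketch of the approach below: each read is shifted with
+    // get_shifted_pos (ATAC-style +4/-5 adjustment), then smoothed to the
+    // window [shifted_pos - smoothsize, shifted_pos + smoothsize]. Whenever
+    // the running count changes, the interval since bg_prev_coord is written
+    // as a bedGraph line with the count divided by bam_scale.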
+ + let mut collected_end_sites: Vec<i32> = Vec::new(); + let first_record_option = records.next(); - let _first_record = match first_record_option { + let first_record = match first_record_option { Some(Ok(record)) => record, // Extract the record Some(Err(err)) => { // Handle the error eprintln!( - "Error reading the first record for core chrom: {} {:?} Skipping...", - chromosome_name, err + "Error reading the first record for {} chrom: {} {:?} Skipping...", + out_sel, chromosome_name, err ); writer.write_all(b"\n").unwrap(); writer.flush().unwrap(); @@ -1177,7 +1278,10 @@ pub fn bam_to_bed_no_counts( } None => { // Handle no records - eprintln!("No records for core chrom: {} Skipping...", chromosome_name); + eprintln!( + "No records for {} chrom: {} Skipping...", + out_sel, chromosome_name + ); writer.write_all(b"\n").unwrap(); writer.flush().unwrap(); drop(writer); @@ -1185,81 +1289,143 @@ pub fn bam_to_bed_no_counts( } }; - // let mut current_start_site = first_record.alignment_start().unwrap().unwrap().get() as i32; - // let mut current_end_site = first_record.alignment_end().unwrap().unwrap().get() as i32; + let flags = first_record.flags(); - for coord in records { - let unwrapped_coord = coord.unwrap().clone(); + let start_site = first_record.alignment_start().unwrap().unwrap().get() as i32; - let strand = match unwrapped_coord.flags().is_reverse_complemented() { - true => "-", - false => "+", - }; + let end_site = first_record.alignment_end().unwrap().unwrap().get() as i32; - //println!("processing records bam to bed"); + let shifted_pos = get_shifted_pos(&flags, start_site - 1, end_site); // we must shift the start position by -1 to convert bam/sam 1 based position to bedgraph 0 based pos + + let mut adjusted_start_site = shifted_pos - smoothsize; + + //current_end_site = adjusted_start_site; + current_end_site = adjusted_start_site + 1 + smoothsize * 2; + + if adjusted_start_site < 0 { + adjusted_start_site = 0; // must ensure we start at 0 for bedGraph 0 position + } - let flag = unwrapped_coord.flags(); + while coordinate_position < adjusted_start_site { + // Just skip until we reach the initial adjusted start position + // Note that this function will not return 0s at locations before the initial start site + coordinate_position = coordinate_position + stepsize; + } - let shifted_pos: i32; + for coord in records { + let unwrapped_coord = coord.unwrap().clone(); + let flags = unwrapped_coord.flags().clone(); let start_site = unwrapped_coord.alignment_start().unwrap().unwrap().get() as i32; let end_site = unwrapped_coord.alignment_end().unwrap().unwrap().get() as i32; - // GET shifted pos and Strand - // TODO ONLY ATAC SHIFTING IS SUPPORTED - //shift_factor = {"+":4, "-":-5} # ATAC - // TODO this assumes tail_edge is false, which is default on PEPATAC pipeline, should add tail_edge=true workflow - if flag.bits() & 1 != 0 { - // Paired-end read - //println!("found, flag bits {} and flagbits &64 {}", flag.bits(), flag.bits() & 64); - if flag.bits() & 64 != 0 { - // First in pair - if flag.bits() & 16 != 0 { - // Reverse complement - //println!("found, flag bits {} and flagbits &16 {}", flag.bits(), flag.bits() & 16); - shifted_pos = end_site + -5; - } else { - //println!("found, flag bits {} and flagbits &16 {}", flag.bits(), flag.bits() & 16); - shifted_pos = start_site + 4; + let shifted_pos = get_shifted_pos(&flags, start_site - 1, end_site); + + adjusted_start_site = shifted_pos - smoothsize; + + if adjusted_start_site < 0 { + adjusted_start_site = 0; + } + + let 
new_end_site = adjusted_start_site + 1 + smoothsize * 2; + //println!("adjusted start site for new coord: {}", adjusted_start_site); + //println!("new endsite for new coord: {}", new_end_site); + + if new_end_site < current_end_site || coordinate_position > adjusted_start_site { + continue; + } else { + collected_end_sites.push(new_end_site); + } + + count += 1.0; + //println!("here is all endsites: {:?}", collected_end_sites); + + if adjusted_start_site == prev_coordinate_value { + continue; + } + + while coordinate_position < adjusted_start_site { + //println!("coordinate_position< adjusted_start_site: {} < {} . here is current endsite: {} ", coordinate_position, adjusted_start_site, current_end_site); + while current_end_site == coordinate_position { + //println!("current_end_site == coordinate_position {} = {} adjusted start site: {}", current_end_site, coordinate_position, adjusted_start_site); + count = count - 1.0; + + //prev_end_site = current_end_site; + + if count < 0.0 { + count = 0.0; } - } else { - // Second in pair - if flag.bits() & 16 != 0 { - // Reverse complement - //println!("found, flag bits {} and flagbits &16 {}", flag.bits(), flag.bits() & 16); - shifted_pos = end_site + -5; + + if collected_end_sites.last() == None { + current_end_site = 0; } else { - //println!("found, flag bits {} and flagbits &16 {}", flag.bits(), flag.bits() & 16); - shifted_pos = start_site + 4; + current_end_site = collected_end_sites.remove(0); + //println!("new endsite deom deque: {}", current_end_site); } } - } else { - // Single-end read - //println!("Single end read {}" flag.bits()); - if flag.bits() & 16 != 0 { - // Reverse complement - shifted_pos = end_site + -5; + + if count != prev_count { + let single_line = format!( + "{}\t{}\t{}\t{}\n", + chromosome_name, + bg_prev_coord, + coordinate_position, + prev_count / bam_scale + ); + writer.write_all(single_line.as_bytes())?; + writer.flush()?; + //eprintln!("{}\n",single_line); + //eprintln!("count {} Current Endsite {} adjusted Start {} Coordnate pos {} prev end site {}, bg_prev_coord {}\n", count,current_end_site,adjusted_start_site,coordinate_position, prev_end_site, bg_prev_coord); + + prev_count = count; + bg_prev_coord = coordinate_position; + } + + coordinate_position = coordinate_position + 1; + } + + prev_coordinate_value = adjusted_start_site; + } + + count = count + 1.0; // We must add 1 extra value here so that our calculation during the tail as we close out the end sites does not go negative. + // this is because the code above subtracts twice during the INITIAL end site closure. So we are missing one count and need to make it up else we go negative. + + while coordinate_position < chrom_size { + // Apply a bound to push the final coordinates otherwise it will become truncated. + + while current_end_site == coordinate_position { + count = count - 1.0; + //prev_end_site = current_end_site; + if count < 0.0 { + count = 0.0; + } + + if collected_end_sites.last() == None { + current_end_site = 0; } else { - shifted_pos = start_site + 4; + current_end_site = collected_end_sites.remove(0) } } - // Relevant comment from original bamSitesToWig.py: - // The bed file needs 6 columns (even though some are dummy) - // because MACS says so. 
- let single_line = format!( - "{}\t{}\t{}\t{}\t{}\t{}\n", - chromosome_name, - shifted_pos - smoothsize, - shifted_pos + smoothsize, - "N", - "O", - strand, - ); + if count != prev_count { + let single_line = format!( + "{}\t{}\t{}\t{}\n", + chromosome_name, + bg_prev_coord, + coordinate_position, + prev_count / bam_scale + ); + writer.write_all(single_line.as_bytes())?; + writer.flush()?; + //eprintln!("{}",single_line); + //eprintln!("count {} Current Endsite {} adjusted Start {} Coordnate pos {} prev end site {}, bg_prev_coord {}\n", count,current_end_site,adjusted_start_site,coordinate_position, prev_end_site, bg_prev_coord); - writer.write_all(single_line.as_bytes())?; - writer.flush()?; + prev_count = count; + bg_prev_coord = coordinate_position; + } + + coordinate_position = coordinate_position + 1; } drop(writer); @@ -1319,3 +1485,50 @@ fn set_up_file_output( // write to std_out, this will be useful for sending input to bigtools to create bw files } } + +pub fn get_shifted_pos(flags: &Flags, start_site: i32, end_site: i32) -> i32 { + let shifted_pos: i32; + // GET shifted pos and Strand + // TODO ONLY ATAC SHIFTING IS SUPPORTED + //shift_factor = {"+":4, "-":-5} # ATAC + // TODO this assumes tail_edge is false, which is default on PEPATAC pipeline, should add tail_edge=true workflow + if flags.bits() & 1 != 0 { + // Paired-end read + //println!("found, flag bits {} and flagbits &64 {}", flag.bits(), flag.bits() & 64); + if flags.bits() & 64 != 0 { + // First in pair + if flags.bits() & 16 != 0 { + // Reverse complement + //println!("found, flag bits {} and flagbits &16 {}", flag.bits(), flag.bits() & 16); + shifted_pos = end_site + -5; + } else { + //println!("found, flag bits {} and flagbits &16 {}", flag.bits(), flag.bits() & 16); + shifted_pos = start_site + 4; + } + } else { + // Second in pair + if flags.bits() & 16 != 0 { + // Reverse complement + //println!("found, flag bits {} and flagbits &16 {}", flag.bits(), flag.bits() & 16); + shifted_pos = end_site + -5; + } else { + //println!("found, flag bits {} and flagbits &16 {}", flag.bits(), flag.bits() & 16); + shifted_pos = start_site + 4; + } + } + } else { + // Single-end read + //println!("Single end read {}" flag.bits()); + if flags.bits() & 16 != 0 { + // Reverse complement + shifted_pos = end_site + -5; + } else { + shifted_pos = start_site + 4; + } + } + + //println!("Here is read.reference_start {} and read.reference_end {}", start_site, end_site); + //println!("here is shifted_pos -> {shifted_pos}"); + + shifted_pos +} diff --git a/gtars/src/uniwig/mod.rs b/gtars/src/uniwig/mod.rs index afd496b0..7b364cc7 100644 --- a/gtars/src/uniwig/mod.rs +++ b/gtars/src/uniwig/mod.rs @@ -5,12 +5,12 @@ use indicatif::ProgressBar; use rayon::prelude::*; use std::error::Error; -use std::fs::File; +use std::fs::{remove_file, File}; use std::io::{BufRead, BufReader, BufWriter, Write}; use crate::uniwig::counting::{ bam_to_bed_no_counts, core_counts, start_end_counts, variable_core_counts_bam_to_bw, - variable_start_end_counts_bam_to_bw, BAMRecordError, + variable_shifted_bam_to_bw, variable_start_end_counts_bam_to_bw, BAMRecordError, }; use crate::uniwig::reading::read_chromosome_sizes; use crate::uniwig::utils::{compress_counts, get_final_chromosomes}; @@ -137,6 +137,9 @@ pub fn run_uniwig(matches: &ArgMatches) { "core" => { vec!["core"] } + "shift" => { + vec!["shift"] + } _ => { vec!["start", "end", "core"] @@ -149,7 +152,14 @@ pub fn run_uniwig(matches: &ArgMatches) { .get_one::<i32>("threads") .expect("requires integer value"); 
+    let bam_scale = matches
+        .get_one::<f32>("bamscale")
+        .expect("requires float value");
+    let score = matches.get_one::<bool>("score").unwrap_or_else(|| &false);
+    let bam_shift = matches
+        .get_one::<bool>("no-bamshift")
+        .unwrap_or_else(|| &true);

     let debug = matches.get_one::<bool>("debug").unwrap_or_else(|| &false);
@@ -174,13 +184,19 @@ pub fn run_uniwig(matches: &ArgMatches) {
         *stepsize,
         *zoom,
         *debug,
+        *bam_shift,
+        *bam_scale,
     )
     .expect("Uniwig failed.");
 }

-/// Ensures that the start position for every wiggle file is at a minimum equal to `1`
-fn clamped_start_position(start: i32, smoothsize: i32) -> i32 {
-    std::cmp::max(1, start - smoothsize)
+/// Ensures that the start position is at a minimum equal to `1`
+fn clamped_start_position(start: i32, smoothsize: i32, wig_shift: i32) -> i32 {
+    std::cmp::max(1, start - smoothsize + wig_shift)
+}
+/// Ensures that the start position is at a minimum equal to `0`
+fn clamped_start_position_zero_pos(start: i32, smoothsize: i32) -> i32 {
+    std::cmp::max(0, start - smoothsize)
 }

 /// Main function
@@ -197,6 +213,8 @@ pub fn uniwig_main(
     stepsize: i32,
     zoom: i32,
     debug: bool,
+    bam_shift: bool,
+    bam_scale: f32,
 ) -> Result<(), Box<dyn Error>> {
     // Must create a Rayon thread pool in which to run our iterators
     let pool = rayon::ThreadPoolBuilder::new()
@@ -204,8 +222,8 @@ pub fn uniwig_main(
         .build()
         .unwrap();

-    // Determine File Type
-    let ft = FileType::from_str(filetype.to_lowercase().as_str());
+    // Determine Input File Type
+    let input_filetype = FileType::from_str(filetype.to_lowercase().as_str());

     // Set up output file names
     let mut meta_data_file_names: [String; 3] = [
@@ -218,6 +236,8 @@ pub fn uniwig_main(
     meta_data_file_names[1] = format!("{}{}.{}", bwfileheader, "end", "meta");
     meta_data_file_names[2] = format!("{}{}.{}", bwfileheader, "core", "meta");

+    let mut npy_meta_data_map: HashMap<String, HashMap<String, i32>> = HashMap::new();
+
     let chrom_sizes = match read_chromosome_sizes(chromsizerefpath) {
         // original program gets chromosome size from a .sizes file, e.g. chr1 248956422
         // the original program simply pushes 0's until the end of the chromosome length and writes these to file.
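> To make the new clamping behavior concrete, here is a self-contained sketch of the two helpers added above, exercised with arbitrary example values: the wig path clamps to 1 (fixedStep wiggle is 1-based) and takes an extra `wig_shift`, while the bedGraph/npy path clamps to 0.

```rust
fn clamped_start_position(start: i32, smoothsize: i32, wig_shift: i32) -> i32 {
    std::cmp::max(1, start - smoothsize + wig_shift)
}

fn clamped_start_position_zero_pos(start: i32, smoothsize: i32) -> i32 {
    std::cmp::max(0, start - smoothsize)
}

fn main() {
    // near the chromosome edge, each floor kicks in
    assert_eq!(clamped_start_position(2, 5, 1), 1);
    assert_eq!(clamped_start_position_zero_pos(2, 5), 0);
    // away from the edge, the wig variant lands one base to the right
    assert_eq!(clamped_start_position(100, 5, 1), 96);
    assert_eq!(clamped_start_position_zero_pos(100, 5), 95);
}
```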
@@ -229,21 +249,23 @@ pub fn uniwig_main( } }; - match ft { + match input_filetype { //BED AND NARROWPEAK WORKFLOW Ok(FileType::BED) | Ok(FileType::NARROWPEAK) => { + // Pare down chromosomes if necessary + let mut final_chromosomes = + get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score); + + // Some housekeeping depending on output type let og_output_type = output_type; // need this later for conversion let mut output_type = output_type; - if output_type == "bedgraph" || output_type == "bw" || output_type == "bigwig" { output_type = "bedGraph" // we must create bedgraphs first before creating bigwig files } - let mut final_chromosomes = get_final_chromosomes(&ft, filepath, &chrom_sizes, score); - let bar = ProgressBar::new(final_chromosomes.len() as u64); - // Pool installs iterator + // Pool installs iterator via rayon crate pool.install(|| { final_chromosomes .par_iter_mut() @@ -269,31 +291,19 @@ pub fn uniwig_main( if smoothsize != 0 { match j { 0 => { - let mut count_result = match ft { - Ok(FileType::BED) => start_end_counts( - &chromosome.starts, - current_chrom_size, - smoothsize, - stepsize, - ), - _ => start_end_counts( - &chromosome.starts, - current_chrom_size, - smoothsize, - stepsize, - ), - }; + let mut count_result = start_end_counts( + &chromosome.starts, + current_chrom_size, + smoothsize, + stepsize, + ); match output_type { "file" => { - //print!("Writing to CLI"); - let handle = &std::io::stdout(); - let mut buf = BufWriter::new(handle); - for count in &count_result.0 { - writeln!(buf, "{}", count) - .expect("failed to write line"); - } - buf.flush().unwrap(); + panic!("Writing to file currently not supported"); + } + "csv" => { + panic!("Write to CSV. Not Implemented"); } "wig" => { //println!("Writing to wig file!"); @@ -308,8 +318,10 @@ pub fn uniwig_main( clamped_start_position( primary_start.0, smoothsize, + 1, //must shift wiggle starts and core by 1 since it is 1 based ), stepsize, + current_chrom_size, ); } "bedGraph" => { @@ -320,7 +332,7 @@ pub fn uniwig_main( let count_info: (Vec<u32>, Vec<u32>, Vec<u32>) = compress_counts( &mut count_result, - clamped_start_position( + clamped_start_position_zero_pos( primary_start.0, smoothsize, ), @@ -332,9 +344,6 @@ pub fn uniwig_main( stepsize, ); } - "csv" => { - panic!("Write to CSV. Not Implemented"); - } "npy" => { let file_name = format!( "{}{}_{}.{}", @@ -344,7 +353,7 @@ pub fn uniwig_main( &count_result.0, file_name.clone(), chrom_name.clone(), - clamped_start_position( + clamped_start_position_zero_pos( primary_start.0, smoothsize, ), @@ -362,7 +371,7 @@ pub fn uniwig_main( &count_result.0, file_name.clone(), chrom_name.clone(), - clamped_start_position( + clamped_start_position_zero_pos( primary_start.0, smoothsize, ), @@ -373,30 +382,18 @@ pub fn uniwig_main( } } 1 => { - let mut count_result = match ft { - Ok(FileType::BED) => start_end_counts( - &chromosome.ends, - current_chrom_size, - smoothsize, - stepsize, - ), - _ => start_end_counts( - &chromosome.ends, - current_chrom_size, - smoothsize, - stepsize, - ), - }; - + let mut count_result = start_end_counts( + &chromosome.ends, + current_chrom_size, + smoothsize, + stepsize, + ); match output_type { "file" => { - let handle = &std::io::stdout(); - let mut buf = BufWriter::new(handle); - for count in &count_result.0 { - writeln!(buf, "{}", count) - .expect("failed to write line"); - } - buf.flush().unwrap(); + panic!("Writing to file not currently supported.") + } + "csv" => { + panic!("Write to CSV. 
Not Implemented"); } "bedGraph" => { let file_name = format!( @@ -410,6 +407,7 @@ pub fn uniwig_main( clamped_start_position( primary_end.0, smoothsize, + 0, ), ); write_to_bed_graph_file( @@ -431,13 +429,13 @@ pub fn uniwig_main( clamped_start_position( primary_end.0, smoothsize, + 0, // ends already 1 based, do not shift further ), stepsize, + current_chrom_size, ); } - "csv" => { - panic!("Write to CSV. Not Implemented"); - } + "npy" => { let file_name = format!( "{}{}_{}.{}", @@ -448,8 +446,9 @@ pub fn uniwig_main( file_name.clone(), chrom_name.clone(), clamped_start_position( - primary_start.0, + primary_end.0, smoothsize, + 0, ), stepsize, meta_data_file_names[1].clone(), @@ -466,8 +465,9 @@ pub fn uniwig_main( file_name.clone(), chrom_name.clone(), clamped_start_position( - primary_start.0, + primary_end.0, smoothsize, + 0, ), stepsize, meta_data_file_names[1].clone(), @@ -476,30 +476,18 @@ pub fn uniwig_main( } } 2 => { - let mut core_results = match ft { - Ok(FileType::BED) => core_counts( - &chromosome.starts, - &chromosome.ends, - current_chrom_size, - stepsize, - ), - _ => core_counts( - &chromosome.starts, - &chromosome.ends, - current_chrom_size, - stepsize, - ), - }; - + let mut core_results = core_counts( + &chromosome.starts, + &chromosome.ends, + current_chrom_size, + stepsize, + ); match output_type { "file" => { - let handle = &std::io::stdout(); - let mut buf = BufWriter::new(handle); - for count in &core_results.0 { - writeln!(buf, "{}", count) - .expect("failed to write line"); - } - buf.flush().unwrap(); + panic!("Writing to file not supported.") + } + "csv" => { + panic!("Write to CSV. Not Implemented"); } "bedGraph" => { let file_name = format!( @@ -510,7 +498,10 @@ pub fn uniwig_main( let count_info: (Vec<u32>, Vec<u32>, Vec<u32>) = compress_counts( &mut core_results, - primary_start.0, + clamped_start_position_zero_pos( + primary_start.0, + 0, + ), ); write_to_bed_graph_file( &count_info, @@ -528,13 +519,11 @@ pub fn uniwig_main( &core_results.0, file_name.clone(), chrom_name.clone(), - primary_start.0, + clamped_start_position(primary_start.0, 0, 1), //starts are 1 based must be shifted by 1 stepsize, + current_chrom_size, ); } - "csv" => { - panic!("Write to CSV. 
Not Implemented"); - } "npy" => { let file_name = format!( "{}{}_{}.{}", @@ -544,7 +533,10 @@ pub fn uniwig_main( &core_results.0, file_name.clone(), chrom_name.clone(), - primary_start.0, + clamped_start_position_zero_pos( + primary_start.0, + 0, + ), stepsize, meta_data_file_names[2].clone(), ); @@ -559,7 +551,10 @@ pub fn uniwig_main( &core_results.0, file_name.clone(), chrom_name.clone(), - primary_start.0, + clamped_start_position_zero_pos( + primary_start.0, + 0, + ), stepsize, meta_data_file_names[2].clone(), ); @@ -593,6 +588,63 @@ pub fn uniwig_main( ); } } + "npy" => { + // populate hashmap for the npy meta data + for chromosome in final_chromosomes.iter() { + let chr_name = chromosome.chrom.clone(); + let current_chrom_size = + *chrom_sizes.get(&chromosome.chrom).unwrap() as i32; + npy_meta_data_map.insert( + chr_name, + HashMap::from([ + ("stepsize".to_string(), stepsize), + ("reported_chrom_size".to_string(), current_chrom_size), + ]), + ); + } + + for location in vec_count_type.iter() { + let temp_meta_file_name = + format!("{}{}.{}", bwfileheader, *location, "meta"); + + if let Ok(file) = File::open(&temp_meta_file_name) { + let reader = BufReader::new(file); + + for line in reader.lines() { + let line = line.unwrap(); + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 3 { + let chrom = parts[1].split('=').nth(1).expect( + "Processing npy metadata file: Missing chromosome in line", + ); + let start_str = parts[2].split('=') + .nth(1) + .expect("Processing npy metadata file: Missing start position in line"); + let starting_position: i32 = start_str.parse().expect( + "Processing npy metadata file: Invalid start position", + ); + + if let Some(current_chr_data) = npy_meta_data_map.get_mut(chrom) + { + current_chr_data.insert( + (*location.to_string()).parse().unwrap(), + starting_position, + ); + } + } + } + // Remove the file after it is used. + let path = std::path::Path::new(&temp_meta_file_name); + let _ = remove_file(path).unwrap(); + } + } + //write combined metadata as json + let json_string = serde_json::to_string_pretty(&npy_meta_data_map).unwrap(); + let combined_npy_meta_file_path = + format!("{}{}.{}", bwfileheader, "npy_meta", "json"); + let mut file = File::create(combined_npy_meta_file_path).unwrap(); + file.write_all(json_string.as_bytes()).unwrap(); + } _ => {} } bar.finish(); @@ -600,6 +652,13 @@ pub fn uniwig_main( match og_output_type { "bw" | "bigWig" => { println!("Writing bigWig files"); + if zoom != 1 { + println!( + "Only zoom level 1 is supported at this time, zoom level supplied {}", + zoom + ); + } + let zoom = 1; //overwrite zoom write_bw_files(bwfileheader, chromsizerefpath, num_threads, zoom); } @@ -625,6 +684,8 @@ pub fn uniwig_main( stepsize, output_type, debug, + bam_shift, + bam_scale, ); } @@ -642,7 +703,7 @@ pub fn uniwig_main( /// Currently, supports bam -> bigwig (start, end, core) and bam -> bed (shifted core values only). /// You must provide a .bai file alongside the bam file! 
Create one: `samtools index your_file.bam`
 fn process_bam(
-    vec_count_type: Vec<&str>,
+    mut vec_count_type: Vec<&str>,
     filepath: &str,
     bwfileheader: &str,
     chrom_sizes: HashMap<String, u32>,
@@ -654,6 +715,8 @@ fn process_bam(
     stepsize: i32,
     output_type: &str,
     debug: bool,
+    bam_shift: bool,
+    bam_scale: f32,
 ) -> Result<(), Box<dyn Error>> {
     println!("Begin bam processing workflow...");
     let fp_string = filepath.to_string();
@@ -704,6 +767,17 @@ fn process_bam(
         }
     }

+    //let out_selection_vec: Vec<&str>;
+
+    if !bam_shift {
+        //do nothing, just keep user output selection for starts, ends, core
+    } else {
+        if vec_count_type.len() > 1 {
+            println!("bam_shift defaults to true for bam processing, but more than one count_type was selected. Defaulting to the shift workflow, which will produce a single count file.");
+        }
+        vec_count_type = vec!["shift"];
+    }
+
     match output_type {
         // Must merge all individual CHRs bw files...
         "bw" => {
@@ -713,6 +787,7 @@ fn process_bam(
                 .par_iter()
                 .for_each(|chromosome_string: &String| {
                     let out_selection_vec = vec_count_type.clone();
+                    //let out_selection_vec = vec![OutSelection::STARTS];

                     for selection in out_selection_vec.iter() {
@@ -729,6 +804,8 @@ fn process_bam(
                                     &fp_string,
                                     &chrom_sizes_ref_path_string,
                                     "start",
+                                    bam_shift,
+                                    bam_scale,
                                 );
                             }
                             &"end" => {
@@ -743,6 +820,8 @@ fn process_bam(
                                     &fp_string,
                                     &chrom_sizes_ref_path_string,
                                     "end",
+                                    bam_shift,
+                                    bam_scale,
                                 );
                             }
                             &"core" => {
@@ -757,6 +836,24 @@ fn process_bam(
                                     &fp_string,
                                     &chrom_sizes_ref_path_string,
                                     "core",
+                                    bam_shift,
+                                    bam_scale,
+                                );
+                            }
+                            &"shift" => {
+                                process_bw_in_threads(
+                                    &chrom_sizes,
+                                    chromosome_string,
+                                    smoothsize,
+                                    stepsize,
+                                    num_threads,
+                                    zoom,
+                                    bwfileheader,
+                                    &fp_string,
+                                    &chrom_sizes_ref_path_string,
+                                    "shift",
+                                    bam_shift,
+                                    bam_scale,
                                 );
                             }
                             _ => {
@@ -867,21 +964,26 @@ fn process_bam(
                         match selection {
                             &"start" => {
                                 println!(
-                                    "Only CORE output is implemented for bam to BED file."
+                                    "Only shift output is implemented for bam to BED file. (bam_shift must be set to true)"
                                 );
                             }
                             &"end" => {
                                 println!(
-                                    "Only CORE output is implemented for bam to BED file."
+                                    "Only shift output is implemented for bam to BED file. (bam_shift must be set to true)"
                                 );
                             }
                             &"core" => {
+                                println!(
+                                    "Only shift output is implemented for bam to BED file. (bam_shift must be set to true)"
+                                );
+                            }
+                            &"shift" => {
                                 process_bed_in_threads(
                                     chromosome_string,
                                     smoothsize,
                                     bwfileheader,
                                     &fp_string,
-                                    "core",
+                                    "shift",
                                 );
                             }
                             _ => {
@@ -893,7 +995,7 @@ fn process_bam(
                         }
                     });

         // Combine bed files
-        let out_selection_vec = vec!["core"]; //TODO this should not be hard coded.
+ let out_selection_vec = vec_count_type.clone(); for location in out_selection_vec.iter() { // this is a work around since we need to make a String to Chrom // so that we can re-use write_combined_files @@ -1051,6 +1153,8 @@ fn process_bw_in_threads( fp_string: &String, chrom_sizes_ref_path_string: &String, sel: &str, + bam_shift: bool, + bam_scale: f32, ) { let (reader, writer) = os_pipe::pipe().unwrap(); let write_fd = Arc::new(Mutex::new(writer)); @@ -1086,6 +1190,8 @@ fn process_bw_in_threads( &chromosome_string_cloned, sel_clone.as_str(), write_fd, + bam_shift, + bam_scale, ) { Ok(_) => { //eprintln!("Processing successful for {}", chromosome_string_cloned); @@ -1149,10 +1255,14 @@ fn determine_counting_func( chromosome_string_cloned: &String, sel_clone: &str, write_fd: Arc<Mutex<PipeWriter>>, + bam_shift: bool, + bam_scale: f32, ) -> Result<(), BAMRecordError> { - let count_result: Result<(), BAMRecordError> = match sel_clone { - "start" | "end" => { - match variable_start_end_counts_bam_to_bw( + //let bam_shift: bool = true; // This is to ensure a shifted position workflow is used when doing bams + + let count_result: Result<(), BAMRecordError> = match bam_shift { + true => { + match variable_shifted_bam_to_bw( &mut records, current_chrom_size_cloned, smoothsize_cloned, @@ -1160,6 +1270,7 @@ fn determine_counting_func( &chromosome_string_cloned, sel_clone, write_fd, + bam_scale, ) { Ok(_) => Ok(()), Err(err) => { @@ -1168,33 +1279,54 @@ fn determine_counting_func( } } } + false => { + match sel_clone { + "start" | "end" => { + match variable_start_end_counts_bam_to_bw( + &mut records, + current_chrom_size_cloned, + smoothsize_cloned, + stepsize_cloned, + &chromosome_string_cloned, + sel_clone, + write_fd, + ) { + Ok(_) => Ok(()), + Err(err) => { + //eprintln!("Error processing records for {} {:?}", sel_clone,err); + Err(err) + } + } + } - "core" => { - match variable_core_counts_bam_to_bw( - &mut records, - current_chrom_size_cloned, - stepsize_cloned, - &chromosome_string_cloned, - write_fd, - ) { - Ok(_) => { - //eprintln!("Processing successful for {}", chromosome_string_cloned); - Ok(()) + "core" => { + match variable_core_counts_bam_to_bw( + &mut records, + current_chrom_size_cloned, + stepsize_cloned, + &chromosome_string_cloned, + write_fd, + ) { + Ok(_) => { + //eprintln!("Processing successful for {}", chromosome_string_cloned); + Ok(()) + } + Err(err) => { + //eprintln!("Error processing records for {}: {:?}", sel_clone,err); + Err(err) + } + } } - Err(err) => { - //eprintln!("Error processing records for {}: {:?}", sel_clone,err); - Err(err) + + &_ => { + eprintln!( + "Error processing records, improper selection: {}", + sel_clone + ); + Err(BAMRecordError::IncorrectSel) } } } - - &_ => { - eprintln!( - "Error processing records, improper selection: {}", - sel_clone - ); - Err(BAMRecordError::IncorrectSel) - } }; count_result diff --git a/gtars/src/uniwig/writing.rs b/gtars/src/uniwig/writing.rs index 446a3738..286d4662 100644 --- a/gtars/src/uniwig/writing.rs +++ b/gtars/src/uniwig/writing.rs @@ -6,7 +6,7 @@ use ndarray::Array; use ndarray_npy::write_npy; use std::fs::{create_dir_all, remove_file, File, OpenOptions}; use std::io::{BufWriter, Write}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::{fs, io}; /// Write output to npy files @@ -96,6 +96,7 @@ pub fn write_to_wig_file( chromname: String, start_position: i32, stepsize: i32, + chrom_size: i32, ) { let path = std::path::Path::new(&filename).parent().unwrap(); let _ = create_dir_all(path); @@ 
-117,7 +118,8 @@ pub fn write_to_wig_file( let mut buf = BufWriter::new(file); - for count in counts.iter() { + for count in counts.iter().take(chrom_size as usize) { + // must set upper bound for wiggles based on reported chromsize, this is for downstream tool interoperability writeln!(&mut buf, "{}", count).unwrap(); } buf.flush().unwrap(); @@ -163,7 +165,15 @@ pub fn write_bw_files(location: &str, chrom_sizes: &str, num_threads: i32, zoom_ //Collect all bedGraph files in the given location/directory let mut bed_graph_files = Vec::new(); - for entry in fs::read_dir(location).unwrap() { + let mut location_path = location; + + if !location_path.ends_with("/") { + let mut temp_path = Path::new(location_path); + let parent_location_path = temp_path.parent().unwrap(); + location_path = parent_location_path.to_str().unwrap(); + } + + for entry in fs::read_dir(location_path).unwrap() { let entry = entry.unwrap(); let path = entry.path(); diff --git a/gtars/tests/data/base.fa b/gtars/tests/data/base.fa new file mode 100644 index 00000000..dd08063d --- /dev/null +++ b/gtars/tests/data/base.fa @@ -0,0 +1,6 @@ +>chrX +TTGGGGAA +>chr1 +GGAA +>chr2 +GCGC diff --git a/gtars/tests/data/base.fa.gz b/gtars/tests/data/base.fa.gz new file mode 100644 index 00000000..343e91af Binary files /dev/null and b/gtars/tests/data/base.fa.gz differ diff --git a/gtars/tests/data/igd_file_list/bad_bed_file.notbed b/gtars/tests/data/igd_file_list/bad_bed_file.notbed deleted file mode 100644 index e31a333e..00000000 --- a/gtars/tests/data/igd_file_list/bad_bed_file.notbed +++ /dev/null @@ -1,15 +0,0 @@ -chr1 7 10 -chr1 8 12 -chr1 9 15 -chr1 10 17 -chr1 11 18 -chr1 12 19 -chr1 13 20 -chr1 14 22 -chr1 16 23 -chr1 18 24 -chr1 19 27 -chr1 20 28 -chr1 22 30 -chr1 23 31 -chr1 24 32 \ No newline at end of file diff --git a/gtars/tests/data/igd_file_list/bad_bed_file_2.notbed b/gtars/tests/data/igd_file_list/bad_bed_file_2.notbed deleted file mode 100644 index 1b91112d..00000000 --- a/gtars/tests/data/igd_file_list/bad_bed_file_2.notbed +++ /dev/null @@ -1,8 +0,0 @@ -chr11 10 50 -chr11 20 76 -chr12 769 2395 -chr13 771 3000 -chr14 800 2900 -chr21 1 30 -chr21 2 19 -chr21 16 31 diff --git a/gtars/tests/data/igd_file_list/igd_bed_file_1.bed b/gtars/tests/data/igd_file_list/igd_bed_file_1.bed deleted file mode 100644 index ab24a1b0..00000000 --- a/gtars/tests/data/igd_file_list/igd_bed_file_1.bed +++ /dev/null @@ -1,4 +0,0 @@ -chr1 632554 632780 SRX4150706.05_peak_5 157 . 2.14622 20.42377 15.73019 44 -chr1 633837 634214 SRX4150706.05_peak_6 757 . 3.67362 82.37296 75.78497 191 -chr10 931681 932010 SRX4150706.05_peak_247 205 . 11.82913 25.65609 20.56433 139 -chr10 1048894 1049428 SRX4150706.05_peak_248 252 . 11.83432 30.63056 25.20567 179 \ No newline at end of file diff --git a/gtars/tests/data/igd_file_list/igd_bed_file_2.notbed b/gtars/tests/data/igd_file_list/igd_bed_file_2.notbed deleted file mode 100644 index d1b2de09..00000000 --- a/gtars/tests/data/igd_file_list/igd_bed_file_2.notbed +++ /dev/null @@ -1,37 +0,0 @@ -chr1 32481 32787 SRX4150706.05_peak_1 92 . 7.69231 13.22648 9.25988 155 -chr1 629094 630022 SRX4150706.05_peak_2 820 . 3.81936 88.76474 82.09715 743 -chr1 630770 631348 SRX4150706.05_peak_3 333 . 2.69642 39.15731 33.36833 464 -chr1 631874 632292 SRX4150706.05_peak_4 513 . 3.14391 57.55429 51.34151 169 -chr10 3172518 3172964 SRX4150706.05_peak_249 114 . 8.40708 15.69710 11.46197 371 -chr10 3785332 3785624 SRX4150706.05_peak_250 140 . 9.57811 18.59647 14.07850 164 -chr10 4848619 4848897 SRX4150706.05_peak_251 148 . 
10.09615 19.45367 14.85063 121 -chr10 4867612 4867959 SRX4150706.05_peak_252 148 . 10.40312 19.46796 14.86100 138 -chr12 26274777 26275010 SRX4150706.05_peak_502 155 . 11.35647 20.23804 15.56519 190 -chr12 30754778 30755141 SRX4150706.05_peak_503 146 . 9.98811 19.27493 14.68905 175 -chr12 31066520 31066788 SRX4150706.05_peak_504 94 . 8.08625 13.48456 9.48825 107 -chr12 31728967 31729242 SRX4150706.05_peak_505 197 . 12.33933 24.77604 19.74551 126 -chr12 40105822 40106052 SRX4150706.05_peak_506 112 . 9.06516 15.49433 11.28455 71 -chr12 42144779 42145013 SRX4150706.05_peak_507 128 . 9.88372 17.27142 12.88671 94 -chr12 43758834 43759073 SRX4150706.05_peak_508 87 . 7.83217 12.71157 8.79783 147 -chr16 1678069 1678364 SRX4150706.05_peak_757 114 . 9.18221 15.69259 11.46152 121 -chr16 1782651 1782896 SRX4150706.05_peak_758 161 . 10.92328 20.82612 16.10091 109 -chr16 1943243 1943468 SRX4150706.05_peak_759 88 . 8.14941 12.77668 8.85488 116 -chr16 2136005 2136235 SRX4150706.05_peak_760 145 . 10.16518 19.07285 14.50998 104 -chr16 2214862 2215110 SRX4150706.05_peak_761 111 . 8.74036 15.35579 11.15965 171 -chr16 2223339 2223636 SRX4150706.05_peak_762 128 . 9.88372 17.27142 12.88671 145 -chr16 3003944 3004198 SRX4150706.05_peak_763 114 . 9.18221 15.69259 11.46152 106 -chr16 3400901 3401238 SRX4150706.05_peak_764 101 . 8.82852 14.21739 10.13631 147 -chr16 4307669 4307938 SRX4150706.05_peak_765 145 . 10.49724 19.15774 14.58114 107 -chr17 10697460 10697723 SRX4150706.05_peak_821 76 . 7.47029 11.37055 7.60573 50 -chr17 15490746 15490988 SRX4150706.05_peak_822 153 . 11.37124 19.94566 15.30242 133 -chr17 15651622 15651906 SRX4150706.05_peak_823 125 . 10.03344 16.89878 12.54836 108 -chr17 15699452 15699766 SRX4150706.05_peak_824 148 . 11.20841 19.40026 14.80545 161 -chr17 15999582 15999891 SRX4150706.05_peak_825 153 . 11.19751 19.95225 15.30478 125 -chr17 16535698 16535959 SRX4150706.05_peak_826 120 . 9.55224 16.32735 12.03429 147 -chr17 17972524 17972813 SRX4150706.05_peak_827 131 . 10.24000 17.54836 13.13781 133 -chr17 19062312 19062585 SRX4150706.05_peak_828 140 . 8.64086 18.53730 14.02305 137 -chr19 1275440 1275769 SRX4150706.05_peak_900 80 . 6.87433 11.89345 8.07370 138 -chr19 1812463 1812867 SRX4150706.05_peak_901 74 . 7.09413 11.16432 7.41911 181 -chr19 2042147 2042419 SRX4150706.05_peak_902 106 . 8.83652 14.74695 10.61464 170 -chr19 2151617 2151889 SRX4150706.05_peak_903 133 . 9.94475 17.78651 13.34663 162 -chr19 4471718 4472167 SRX4150706.05_peak_904 109 . 
8.83978 15.11550 10.94480 106 diff --git a/gtars/tests/data/igd_file_list_01/igd_bed_file_1.bed b/gtars/tests/data/igd_file_list_01/igd_bed_file_1.bed new file mode 100644 index 00000000..daae26c5 --- /dev/null +++ b/gtars/tests/data/igd_file_list_01/igd_bed_file_1.bed @@ -0,0 +1,8 @@ +chr1 1 100 +chr1 200 300 +chr1 32768 32868 +chr1 49152 49352 +chr2 1 100 +chr2 200 300 +chr3 32768 32868 +chr3 49152 49352 diff --git a/gtars/tests/data/igd_file_list_02/igd_bed_file_1.bed b/gtars/tests/data/igd_file_list_02/igd_bed_file_1.bed new file mode 100644 index 00000000..daae26c5 --- /dev/null +++ b/gtars/tests/data/igd_file_list_02/igd_bed_file_1.bed @@ -0,0 +1,8 @@ +chr1 1 100 +chr1 200 300 +chr1 32768 32868 +chr1 49152 49352 +chr2 1 100 +chr2 200 300 +chr3 32768 32868 +chr3 49152 49352 diff --git a/gtars/tests/data/igd_file_list_02/igd_bed_file_2.bed b/gtars/tests/data/igd_file_list_02/igd_bed_file_2.bed new file mode 100644 index 00000000..23f3e131 --- /dev/null +++ b/gtars/tests/data/igd_file_list_02/igd_bed_file_2.bed @@ -0,0 +1,4 @@ +chr2 652554 652780 SRX4150706.05_peak_5 157 . 2.14622 20.42377 15.73019 44 +chr2 653837 654214 SRX4150706.05_peak_6 757 . 3.67362 82.37296 75.78497 191 +chr11 951681 952010 SRX4150706.05_peak_247 205 . 11.82913 25.65609 20.56433 139 +chr11 1248894 1249428 SRX4150706.05_peak_248 252 . 11.83432 30.63056 25.20567 179 \ No newline at end of file diff --git a/gtars/tests/data/igd_query_files/query1.bed b/gtars/tests/data/igd_query_files/query1.bed new file mode 100644 index 00000000..daae26c5 --- /dev/null +++ b/gtars/tests/data/igd_query_files/query1.bed @@ -0,0 +1,8 @@ +chr1 1 100 +chr1 200 300 +chr1 32768 32868 +chr1 49152 49352 +chr2 1 100 +chr2 200 300 +chr3 32768 32868 +chr3 49152 49352 diff --git a/gtars/tests/data/igd_query_files/query2.bed b/gtars/tests/data/igd_query_files/query2.bed new file mode 100644 index 00000000..6c6ece21 --- /dev/null +++ b/gtars/tests/data/igd_query_files/query2.bed @@ -0,0 +1,2 @@ +chr3 49152 49352 +chr2 653837 654214 SRX4150706.05_peak_6 757 . 
3.67362 82.37296 75.78497 191 \ No newline at end of file diff --git a/gtars/tests/data/out/_core.wig b/gtars/tests/data/out/_core.wig index bce79299..7142f6c2 100644 --- a/gtars/tests/data/out/_core.wig +++ b/gtars/tests/data/out/_core.wig @@ -1,4 +1,4 @@ -fixedStep chrom=chr1 start=2 step=1 +fixedStep chrom=chr1 start=3 step=1 2 2 3 @@ -16,4 +16,4 @@ fixedStep chrom=chr1 start=2 step=1 0 0 0 -0 +0 \ No newline at end of file diff --git a/gtars/tests/data/out/_end.wig b/gtars/tests/data/out/_end.wig index e89bdc32..306e8c4e 100644 --- a/gtars/tests/data/out/_end.wig +++ b/gtars/tests/data/out/_end.wig @@ -13,4 +13,4 @@ fixedStep chrom=chr1 start=5 step=1 0 0 0 -0 +0 \ No newline at end of file diff --git a/gtars/tests/data/out/_start.wig b/gtars/tests/data/out/_start.wig index 361beb36..a8481c04 100644 --- a/gtars/tests/data/out/_start.wig +++ b/gtars/tests/data/out/_start.wig @@ -1,4 +1,4 @@ -fixedStep chrom=chr1 start=1 step=1 +fixedStep chrom=chr1 start=2 step=1 2 2 3 @@ -17,4 +17,4 @@ fixedStep chrom=chr1 start=1 step=1 0 0 0 -0 +0 \ No newline at end of file diff --git a/gtars/tests/test.rs b/gtars/tests/test.rs index 837a2647..aeeb4e3e 100644 --- a/gtars/tests/test.rs +++ b/gtars/tests/test.rs @@ -72,8 +72,12 @@ fn path_to_core_bedgraph_output() -> &'static str { mod tests { use super::*; - use gtars::igd::create::{create_igd_f, igd_add, igd_saveT, igd_save_db, igd_t, parse_bed}; - use gtars::igd::search::igd_search; + use gtars::igd::create::{ + create_igd_f, gdata_t, igd_add, igd_saveT, igd_save_db, igd_t, parse_bed, + }; + use gtars::igd::search::{ + getOverlaps, get_file_info_tsv, get_igd_info, get_tsv_path, igd_search, igd_t_from_disk, + }; use gtars::uniwig::{uniwig_main, Chromosome}; @@ -84,7 +88,12 @@ mod tests { use gtars::uniwig::writing::write_bw_files; + use anyhow::Context; + use byteorder::{LittleEndian, ReadBytesExt}; use std::collections::HashMap; + use std::collections::HashSet; + use std::fs::OpenOptions; + use std::io::{Seek, SeekFrom}; // IGD TESTS #[rstest] @@ -110,68 +119,230 @@ mod tests { } #[rstest] - fn test_igd_create() { + fn test_igd_create_short_long_regions() { + // Depending on start and end coordinates which are divided by nbp=16384 + // the number of tiles per ctg are adjusted, this tests to ensure they are created appropriately let tempdir = tempfile::tempdir().unwrap(); let path = PathBuf::from(&tempdir.path()); - let db_path_unwrapped = path.into_os_string().into_string().unwrap(); let db_output_path = db_path_unwrapped; let path_to_crate = env!("CARGO_MANIFEST_DIR"); - let testfilelists = format!("{}{}", path_to_crate, "/tests/data/igd_file_list/"); + let testfilelists = format!("{}{}", path_to_crate, "/tests/data/igd_file_list_01/"); let demo_name = String::from("demo"); - create_igd_f(&db_output_path, &testfilelists, &demo_name); + let igd = create_igd_f(&db_output_path, &testfilelists, &demo_name); + assert_eq!(igd.ctg[0].name, "chr1"); + assert_eq!(igd.ctg[1].name, "chr2"); + assert_eq!(igd.ctg[2].name, "chr3"); + assert_eq!(igd.nctg, 3); + + assert_eq!(igd.ctg[0].mTiles, 4); // chr1 has 4 Tiles because of the 32768, and 49152 starts + assert_eq!(igd.ctg[1].mTiles, 1); // chr only has 1 Tile due to the 200 start + + assert_eq!(igd.ctg[0].gTile[0].gList[0].start, 1); // look specific tile's start + assert_eq!( + igd.ctg[0].gTile[(igd.ctg[0].mTiles - 1) as usize].gList[0].start, + 49152 + ); // look specific tile's start + + assert_eq!(igd.ctg[0].gTile[0].nCnts, 2); // look at nCnts + assert_eq!(igd.ctg[0].gTile[1].nCnts, 0); // look at nCnts 
+ assert_eq!(igd.ctg[0].gTile[2].nCnts, 1); // look at nCnts + + // Overall stats + assert_eq!(igd.total_regions, 8); + assert_eq!(igd.total_average, 998.0); + assert_eq!(igd.average_length, 124.75); } - // #[rstest] - // fn test_igd_create_txt() { - // let tempdir = tempfile::tempdir().unwrap(); - // let path = PathBuf::from(&tempdir.path()); - // - // let db_path_unwrapped = path.into_os_string().into_string().unwrap(); - // let db_output_path = db_path_unwrapped; - // - // let path_to_crate = env!("CARGO_MANIFEST_DIR"); - // let testfilelists = format!("{}{}", path_to_crate, "/tests/data/igdlist.txt"); - // - // let demo_name = String::from("demo"); - // - // create_igd_f(&db_output_path, &testfilelists, &demo_name); - // } - #[rstest] - fn test_igd_search() { - // First must create temp igd - - // Temp dir to hold igd + fn test_igd_create_then_load_from_disk() { + // Depending on start and end coordinates which are divided by nbp=16384 + // the number of tiles per ctg are adjusted, this tests to ensure they are created appropriately let tempdir = tempfile::tempdir().unwrap(); let path = PathBuf::from(&tempdir.path()); - let db_path_unwrapped = path.into_os_string().into_string().unwrap(); - let db_output_path = db_path_unwrapped; + let mut db_path_unwrapped = path.into_os_string().into_string().unwrap(); + db_path_unwrapped.push_str("/"); + let db_output_path = db_path_unwrapped.clone(); - // bed files used to create IGD let path_to_crate = env!("CARGO_MANIFEST_DIR"); - let testfilelists = format!("{}{}", path_to_crate, "/tests/data/igd_file_list/"); + let testfilelists = format!("{}{}", path_to_crate, "/tests/data/igd_file_list_01/"); let demo_name = String::from("demo"); - // Create IGD from directory of bed files - create_igd_f(&db_output_path, &testfilelists, &demo_name); + let igd_saved = create_igd_f(&db_output_path, &testfilelists, &demo_name); + + println!("dboutput_path {}", db_output_path); + + db_path_unwrapped.push_str("/demo.igd"); + + let mut hash_table: HashMap<String, i32> = HashMap::new(); + + // Create IGD Struct from database + let mut igd_from_disk: igd_t_from_disk = + get_igd_info(&db_path_unwrapped, &mut hash_table).expect("Could not open IGD"); + let tsv_path = get_tsv_path(db_path_unwrapped.as_str()).unwrap(); + get_file_info_tsv(tsv_path, &mut igd_from_disk).unwrap(); //sets igd.finfo - // Get a query file path from test files - let query_file = format!( - "{}{}", - path_to_crate, "/tests/data/igd_file_list/igd_bed_file_1.bed" + assert_eq!(igd_saved.ctg.len(), igd_from_disk.nCtg as usize); + + assert_eq!(igd_from_disk.nFiles, 1); + + assert_eq!( + igd_from_disk.nCnt[0].len(), + igd_saved.ctg[0].mTiles as usize + ); + assert_eq!( + igd_from_disk.nCnt[1].len(), + igd_saved.ctg[1].mTiles as usize ); + assert_eq!( + igd_from_disk.nCnt[2].len(), + igd_saved.ctg[2].mTiles as usize + ); + + assert_eq!(igd_from_disk.nCnt[0][0], igd_saved.ctg[0].gTile[0].nCnts); + assert_eq!(igd_from_disk.nCnt[0][1], igd_saved.ctg[0].gTile[1].nCnts); + assert_eq!(igd_from_disk.nCnt[0][2], igd_saved.ctg[0].gTile[2].nCnts); + assert_eq!(igd_from_disk.nCnt[0][3], igd_saved.ctg[0].gTile[3].nCnts); + + // Check to see if the regions on disk are the same as the original igd (minus the unused zeros) + let dbpath = std::path::Path::new(&db_path_unwrapped); + let db_file = OpenOptions::new() + .create(true) + .append(true) + .read(true) + .open(dbpath) + .unwrap(); + let mut db_reader = BufReader::new(db_file); + + for k in 0..3 { + let nCnt_len = igd_from_disk.nCnt[k].len(); + + for l in 
0..nCnt_len {
+                let mut a: HashSet<i32> = Default::default();
+                let mut b: HashSet<i32> = Default::default();
+
+                let tmpi = igd_from_disk.nCnt[k][l]; // number of gdata_t to read
+
+                //println!("Here is k {}, l {}, and igd_from_disk.tIdx[k][l] {}",k,l, igd_from_disk.tIdx[k][l]);
+                db_reader
+                    .seek(SeekFrom::Start(igd_from_disk.tIdx[k][l] as u64)) // [k] contig, [l] tile position
+                    .unwrap();
+
+                let mut gData: Vec<gdata_t> = Vec::new();
+
+                //println!("Creating gData with tmpi {}", tmpi);
+                for _j in 0..tmpi {
+                    gData.push(gdata_t::default())
+                }
+
+                for i in 0..tmpi {
+                    // number of gdata_t to read
+                    //println!("Iterating with i {} of tmpi {} ",i,tmpi);
+                    let mut buf = [0u8; 16];
+
+                    let n = db_reader.read(&mut buf).unwrap();
+
+                    if n == 0 {
+                        //println!("Breaking loop while reading tempfile");
+                        break;
+                    } else if n != 16 {
+                        //panic!("Cannot read temp file.");
+                        break;
+                    }
+
+                    let mut rdr = &buf[..] as &[u8];
+                    let idx = rdr.read_i32::<LittleEndian>().unwrap();
+                    let start = rdr.read_i32::<LittleEndian>().unwrap();
+                    let end = rdr.read_i32::<LittleEndian>().unwrap();
+                    let value = rdr.read_i32::<LittleEndian>().unwrap();
+
+                    //println!("Looping through gdata_t in temp files");
+                    //println!("Chr_name: {} Filename: {} start: {} end: {}", igd_from_disk.cName[k], igd_from_disk.file_info[idx as usize].fileName, start, end);
+
+                    gData[i as usize] = gdata_t {
+                        idx,
+                        start,
+                        end,
+                        value,
+                    };
+                }
+
+                //println!("here is k {}, l {}",k,l);
+                for g in gData.iter() {
+                    //println!("Inserting {} from gData on Disk", g.start);
+                    a.insert(g.start);
+                }
+
+                for g in igd_saved.ctg[k].gTile[l].gList.iter() {
+                    //println!("Inserting {} from original gList ", g.start);
+                    b.insert(g.start);
+                }
+                //println!("A: {:?}", a);
+                //println!("B: {:?}", b);
+                // The difference should be at most a 0 from unused tiles, therefore the difference length should be at most 1.
+ let diff = b.difference(&a).collect::<Vec<&i32>>(); + //println!("Difference: {:?}", diff); + assert!(diff.len() <= 1) + } + } + } + + #[rstest] + #[case( + "/tests/data/igd_file_list_01/", + "/tests/data/igd_query_files/query1.bed", + 8, + 8 + )] + // #[case( + // "/tests/data/igd_file_list_02/", + // "/tests/data/igd_query_files/query2.bed", + // 4, + // 1 + // )] + fn test_igd_create_then_search( + #[case] input: &str, + #[case] query_file: &str, + #[case] expected_regions: u32, + #[case] expected_hits: u32, + ) { + let tempdir = tempfile::tempdir().unwrap(); + let path = PathBuf::from(&tempdir.path()); + let mut db_path_unwrapped = path.into_os_string().into_string().unwrap(); + db_path_unwrapped.push_str("/"); + let db_output_path = db_path_unwrapped.clone(); + + let path_to_crate = env!("CARGO_MANIFEST_DIR"); + let testfilelists = format!("{}{}", path_to_crate, input); + + let demo_name = String::from("demo"); + + let igd_saved = create_igd_f(&db_output_path, &testfilelists, &demo_name); + + println!("dboutput_path {}", db_output_path); - // the final db path will be constructed within igd_save_db like so - let final_db_save_path = format!("{}{}{}", db_output_path, demo_name, ".igd"); + db_path_unwrapped.push_str("/demo.igd"); - let res = igd_search(&final_db_save_path, &query_file).expect("Error during testing:"); + let queryfile = format!("{}{}", path_to_crate, query_file); + let res = igd_search(&db_path_unwrapped, &queryfile).expect("Error during testing:"); + let mut res_iter = res[1].split('\t'); + // Skip the first two columns + res_iter.next().unwrap(); + // Extract the third and fourth columns + let second_column = res_iter.next().unwrap().to_string(); + let third_column = res_iter.next().unwrap().to_string(); + + println!("Number of Regions: {}", second_column); + println!("Number of Hits: {}", third_column); + + assert_eq!(second_column, expected_regions.to_string()); + assert_eq!(third_column, expected_hits.to_string()); } #[rstest] @@ -394,6 +565,8 @@ mod tests { stepsize, zoom, false, + true, + 1.0, ) .expect("Uniwig main failed!"); @@ -438,6 +611,8 @@ mod tests { stepsize, zoom, false, + true, + 1.0, ) .expect("Uniwig main failed!"); @@ -483,6 +658,8 @@ mod tests { stepsize, zoom, false, + true, + 1.0, ) .expect("Uniwig main failed!"); @@ -506,7 +683,7 @@ mod tests { let bwfileheader_path = path.into_os_string().into_string().unwrap(); let bwfileheader = bwfileheader_path.as_str(); - let smoothsize: i32 = 5; + let smoothsize: i32 = 2; let output_type = "npy"; let filetype = "bed"; let num_threads = 6; @@ -528,6 +705,8 @@ mod tests { stepsize, zoom, false, + true, + 1.0, ) .expect("Uniwig main failed!"); Ok(()) @@ -592,6 +771,8 @@ mod tests { stepsize, zoom, false, + true, + 1.0, ); assert!(result.is_ok()); @@ -658,6 +839,8 @@ mod tests { stepsize, zoom, false, + true, + 1.0, ); assert!(result.is_ok()); @@ -770,6 +953,8 @@ mod tests { stepsize, zoom, false, + true, + 1.0, ); assert!(result.is_ok()); @@ -877,6 +1062,54 @@ mod tests { stepsize, zoom, false, + true, + 1.0, + ) + .expect("Uniwig main failed!"); + + Ok(()) + } + + #[rstest] + fn test_process_bed_to_bw( + _path_to_dummy_bed_file: &str, + ) -> Result<(), Box<(dyn std::error::Error + 'static)>> { + let path_to_crate = env!("CARGO_MANIFEST_DIR"); + let chromsizerefpath: String = format!("{}{}", path_to_crate, "/tests/hg38.chrom.sizes"); + let chromsizerefpath = chromsizerefpath.as_str(); + let combinedbedpath = _path_to_dummy_bed_file; + + let tempdir = tempfile::tempdir().unwrap(); + let path = 
PathBuf::from(&tempdir.path()); + + let mut bwfileheader_path = path.into_os_string().into_string().unwrap(); + bwfileheader_path.push_str("/final/"); + let bwfileheader = bwfileheader_path.as_str(); + + let smoothsize: i32 = 1; + let output_type = "bw"; + let filetype = "bed"; + let num_threads = 2; + let score = true; + let stepsize = 1; + let zoom = 1; + let vec_count_type = vec!["start", "end", "core"]; + + uniwig_main( + vec_count_type, + smoothsize, + combinedbedpath, + chromsizerefpath, + bwfileheader, + output_type, + filetype, + num_threads, + score, + stepsize, + zoom, + false, + true, + 1.0, ) .expect("Uniwig main failed!");
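> For reference, a minimal call-site sketch of the updated `uniwig_main` signature, matching the argument order used by the tests above; the two trailing arguments (`bam_shift`, `bam_scale`) are the ones this patch adds. The paths and settings below are placeholders for illustration, not files from the repository.

```rust
use gtars::uniwig::uniwig_main;

fn main() {
    uniwig_main(
        vec!["start", "end", "core"], // count types
        1,                            // smoothsize
        "regions.bed",                // input regions (placeholder)
        "hg38.chrom.sizes",           // chrom sizes file (placeholder)
        "out/",                       // output file header
        "bedGraph",                   // output type
        "bed",                        // input file type
        2,                            // num_threads
        false,                        // score
        1,                            // stepsize
        1,                            // zoom
        false,                        // debug
        true,                         // bam_shift: shifted-count workflow for BAM input
        1.0,                          // bam_scale: bedGraph counts are divided by this
    )
    .expect("Uniwig main failed!");
}
```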