Skip to content

Commit ce0967a

Browse files
Merge pull request #66 from databio/dev_64
Fixes for #64 and #65
2 parents 33d4851 + bb5bc89 commit ce0967a

File tree

10 files changed

+108
-46
lines changed

10 files changed

+108
-46
lines changed

.github/workflows/R-CMD-check.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
# - {os: windows-latest, r: 'release', rust-version: 'stable-msvc', rust-target: 'x86_64-pc-windows-gnu'}
2020
- {os: macOS-latest, r: 'release', rust-version: 'stable'}
2121
- {os: ubuntu-latest, r: 'release', rust-version: 'stable'}
22-
- {os: ubuntu-latest, r: 'devel', rust-version: 'stable'}
22+
#- {os: ubuntu-latest, r: 'devel', rust-version: 'stable'}
2323
env:
2424
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
2525
steps:

gtars/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ base64-url = "2.0.0"
3232
sha2 = "0.10.7"
3333
md-5 = "0.10.5"
3434
seq_io = "0.3.2"
35+
serde_json = "1.0.135"
3536

3637

3738
[dev-dependencies]

gtars/src/common/utils.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> {
3939
///
4040
/// - file_path: path to the file to read, or '-' for stdin
4141
///
42-
/// # Returns
43-
///
42+
/// # Returns
43+
///
4444
/// A `BufReader` object for a given file path or stdin.
4545
pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<dyn Read>>> {
4646
if file_path_str == "-" {
@@ -51,7 +51,6 @@ pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result<BufReader<Box<d
5151
}
5252
}
5353

54-
5554
///
5655
/// Create a region-to-id hash-map from a list of regions
5756
///

gtars/src/digests/mod.rs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,21 +13,21 @@
1313
//! # Usage
1414
//!
1515
//! The `sha512t24u` function can be used to compute the GA4GH sha512t24 checksum of a string.
16-
//!
16+
//!
1717
//! ```rust
1818
//! use gtars::digests::sha512t24u;
1919
//!
2020
//! let digest = sha512t24u("hello world");
2121
//! ```
22-
use std::io::prelude::{Read, Write};
23-
use std::io;
2422
use std::fs::File;
23+
use std::io;
24+
use std::io::prelude::{Read, Write};
2525
use std::path::Path;
2626

2727
use anyhow::Result;
2828
use md5::Md5;
29+
use seq_io::fasta::{Reader, Record, RefRecord};
2930
use sha2::{Digest, Sha512};
30-
use seq_io::fasta::{Reader, RefRecord, Record};
3131

3232
use crate::common::utils::get_dynamic_reader;
3333

@@ -75,7 +75,6 @@ pub fn md5(string: &str) -> String {
7575
format!("{:x}", result)
7676
}
7777

78-
7978
/// Processes a FASTA file to compute the digests of each sequence in the file.
8079
///
8180
/// This function reads a FASTA file, computes the SHA-512 and MD5 digests for each sequence,
@@ -103,7 +102,8 @@ pub fn digest_fasta(file_path: &str) -> Result<Vec<DigestResult>> {
103102
let file_reader = get_dynamic_reader(&path)?;
104103
let mut fasta_reader = Reader::new(file_reader);
105104
let mut results = Vec::new();
106-
while let Some(record) = fasta_reader.next() { // returns a RefRecord object
105+
while let Some(record) = fasta_reader.next() {
106+
// returns a RefRecord object
107107
let record = record.expect("Error found when retrieving next record.");
108108
let id = record.id().expect("No ID found for the FASTA record");
109109
let mut sha512_hasher = Sha512::new();
@@ -123,7 +123,7 @@ pub fn digest_fasta(file_path: &str) -> Result<Vec<DigestResult>> {
123123
id: id.to_string(),
124124
length: length,
125125
sha512t24u: sha512,
126-
md5: md5
126+
md5: md5,
127127
});
128128
}
129129
Ok(results)
@@ -169,10 +169,10 @@ mod tests {
169169
assert_eq!(results[0].sha512t24u, "iYtREV555dUFKg2_agSJW6suquUyPpMw");
170170
assert_eq!(results[0].md5, "5f63cfaa3ef61f88c9635fb9d18ec945");
171171
}
172-
172+
173173
#[test]
174174
fn bogus_fasta_file() {
175175
let result = digest_fasta("tests/data/bogus.fa");
176176
assert!(result.is_err(), "Expected an error for a bogus fasta file");
177177
}
178-
}
178+
}

gtars/src/uniwig/counting.rs

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,7 @@ pub fn start_end_counts(
3434
chrom_size: i32,
3535
smoothsize: i32,
3636
stepsize: i32,
37-
shift: i32,
3837
) -> (Vec<u32>, Vec<i32>) {
39-
//let vin_iter = starts_vector.iter();
40-
4138
let mut v_coordinate_positions: Vec<i32> = Vec::new(); // these are the final coordinates after any adjustments
4239
let mut v_coord_counts: Vec<u32> = Vec::new(); // u8 stores 0:255 This may be insufficient. u16 max is 65535
4340

@@ -55,7 +52,7 @@ pub fn start_end_counts(
5552

5653
adjusted_start_site = starts_vector[0]; // get first coordinate position
5754

58-
adjusted_start_site.0 = adjusted_start_site.0 - smoothsize + shift;
55+
adjusted_start_site.0 = adjusted_start_site.0 - smoothsize;
5956

6057
current_end_site = adjusted_start_site;
6158
current_end_site.0 = adjusted_start_site.0 + 1 + smoothsize * 2;
@@ -74,7 +71,7 @@ pub fn start_end_counts(
7471
coordinate_value = *coord;
7572

7673
adjusted_start_site = coordinate_value;
77-
adjusted_start_site.0 = coordinate_value.0 - smoothsize + shift;
74+
adjusted_start_site.0 = coordinate_value.0 - smoothsize;
7875

7976
let current_score = adjusted_start_site.1;
8077

@@ -164,7 +161,6 @@ pub fn core_counts(
164161
ends_vector: &[(i32, i32)],
165162
chrom_size: i32,
166163
stepsize: i32,
167-
shift: i32,
168164
) -> (Vec<u32>, Vec<i32>) {
169165
let mut v_coordinate_positions: Vec<i32> = Vec::new(); // these are the final coordinates after any adjustments
170166
let mut v_coord_counts: Vec<u32> = Vec::new(); // u8 stores 0:255 This may be insufficient. u16 max is 65535
@@ -184,7 +180,7 @@ pub fn core_counts(
184180
current_start_site = starts_vector[0]; // get first coordinate position
185181
current_end_site = ends_vector[0];
186182

187-
current_start_site.0 = current_start_site.0 + shift;
183+
current_start_site.0 = current_start_site.0;
188184

189185
if current_start_site.0 < 1 {
190186
current_start_site.0 = 1;
@@ -201,7 +197,7 @@ pub fn core_counts(
201197

202198
current_start_site = coordinate_value;
203199

204-
current_start_site.0 = current_start_site.0 + shift;
200+
current_start_site.0 = current_start_site.0;
205201

206202
let current_score = current_start_site.1;
207203
count += current_score;

gtars/src/uniwig/mod.rs

Lines changed: 84 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use indicatif::ProgressBar;
55

66
use rayon::prelude::*;
77
use std::error::Error;
8-
use std::fs::File;
8+
use std::fs::{remove_file, File};
99
use std::io::{BufRead, BufReader, BufWriter, Write};
1010

1111
use crate::uniwig::counting::{
@@ -191,8 +191,8 @@ pub fn run_uniwig(matches: &ArgMatches) {
191191
}
192192

193193
/// Ensures that the start position is at a minimum equal to `1`
194-
fn clamped_start_position(start: i32, smoothsize: i32) -> i32 {
195-
std::cmp::max(1, start - smoothsize)
194+
fn clamped_start_position(start: i32, smoothsize: i32, wig_shift: i32) -> i32 {
195+
std::cmp::max(1, start - smoothsize + wig_shift)
196196
}
197197
/// Ensure that the start position is at a minimum equal to `0`
198198
fn clamped_start_position_zero_pos(start: i32, smoothsize: i32) -> i32 {
@@ -222,8 +222,6 @@ pub fn uniwig_main(
222222
.build()
223223
.unwrap();
224224

225-
let mut wig_shift: i32 = 0; // This will be set to 1 when writing to wiggle files, else always set to 0
226-
227225
// Determine Input File Type
228226
let input_filetype = FileType::from_str(filetype.to_lowercase().as_str());
229227
// Set up output file names
@@ -238,6 +236,8 @@ pub fn uniwig_main(
238236
meta_data_file_names[1] = format!("{}{}.{}", bwfileheader, "end", "meta");
239237
meta_data_file_names[2] = format!("{}{}.{}", bwfileheader, "core", "meta");
240238

239+
let mut npy_meta_data_map: HashMap<String, HashMap<String, i32>> = HashMap::new();
240+
241241
let chrom_sizes = match read_chromosome_sizes(chromsizerefpath) {
242242
// original program gets chromosome size from a .sizes file, e.g. chr1 248956422
243243
// the original program simply pushes 0's until the end of the chromosome length and writes these to file.
@@ -252,19 +252,16 @@ pub fn uniwig_main(
252252
match input_filetype {
253253
//BED AND NARROWPEAK WORKFLOW
254254
Ok(FileType::BED) | Ok(FileType::NARROWPEAK) => {
255+
// Pare down chromosomes if necessary
256+
let mut final_chromosomes =
257+
get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score);
258+
255259
// Some housekeeping depending on output type
256260
let og_output_type = output_type; // need this later for conversion
257261
let mut output_type = output_type;
258262
if output_type == "bedgraph" || output_type == "bw" || output_type == "bigwig" {
259263
output_type = "bedGraph" // we must create bedgraphs first before creating bigwig files
260264
}
261-
if output_type == "wig" {
262-
wig_shift = 1;
263-
}
264-
265-
// Pare down chromosomes if necessary
266-
let mut final_chromosomes =
267-
get_final_chromosomes(&input_filetype, filepath, &chrom_sizes, score);
268265

269266
let bar = ProgressBar::new(final_chromosomes.len() as u64);
270267

@@ -299,7 +296,6 @@ pub fn uniwig_main(
299296
current_chrom_size,
300297
smoothsize,
301298
stepsize,
302-
wig_shift,
303299
);
304300

305301
match output_type {
@@ -322,6 +318,7 @@ pub fn uniwig_main(
322318
clamped_start_position(
323319
primary_start.0,
324320
smoothsize,
321+
1, //must shift wiggle starts and core by 1 since it is 1 based
325322
),
326323
stepsize,
327324
current_chrom_size,
@@ -390,7 +387,6 @@ pub fn uniwig_main(
390387
current_chrom_size,
391388
smoothsize,
392389
stepsize,
393-
wig_shift,
394390
);
395391
match output_type {
396392
"file" => {
@@ -411,6 +407,7 @@ pub fn uniwig_main(
411407
clamped_start_position(
412408
primary_end.0,
413409
smoothsize,
410+
0,
414411
),
415412
);
416413
write_to_bed_graph_file(
@@ -432,6 +429,7 @@ pub fn uniwig_main(
432429
clamped_start_position(
433430
primary_end.0,
434431
smoothsize,
432+
0, // ends already 1 based, do not shift further
435433
),
436434
stepsize,
437435
current_chrom_size,
@@ -450,6 +448,7 @@ pub fn uniwig_main(
450448
clamped_start_position(
451449
primary_end.0,
452450
smoothsize,
451+
0,
453452
),
454453
stepsize,
455454
meta_data_file_names[1].clone(),
@@ -468,6 +467,7 @@ pub fn uniwig_main(
468467
clamped_start_position(
469468
primary_end.0,
470469
smoothsize,
470+
0,
471471
),
472472
stepsize,
473473
meta_data_file_names[1].clone(),
@@ -481,7 +481,6 @@ pub fn uniwig_main(
481481
&chromosome.ends,
482482
current_chrom_size,
483483
stepsize,
484-
wig_shift,
485484
);
486485
match output_type {
487486
"file" => {
@@ -499,7 +498,10 @@ pub fn uniwig_main(
499498
let count_info: (Vec<u32>, Vec<u32>, Vec<u32>) =
500499
compress_counts(
501500
&mut core_results,
502-
primary_start.0,
501+
clamped_start_position_zero_pos(
502+
primary_start.0,
503+
0,
504+
),
503505
);
504506
write_to_bed_graph_file(
505507
&count_info,
@@ -517,7 +519,7 @@ pub fn uniwig_main(
517519
&core_results.0,
518520
file_name.clone(),
519521
chrom_name.clone(),
520-
clamped_start_position(primary_start.0, 0),
522+
clamped_start_position(primary_start.0, 0, 1), //starts are 1 based must be shifted by 1
521523
stepsize,
522524
current_chrom_size,
523525
);
@@ -531,7 +533,10 @@ pub fn uniwig_main(
531533
&core_results.0,
532534
file_name.clone(),
533535
chrom_name.clone(),
534-
primary_start.0,
536+
clamped_start_position_zero_pos(
537+
primary_start.0,
538+
0,
539+
),
535540
stepsize,
536541
meta_data_file_names[2].clone(),
537542
);
@@ -546,7 +551,10 @@ pub fn uniwig_main(
546551
&core_results.0,
547552
file_name.clone(),
548553
chrom_name.clone(),
549-
primary_start.0,
554+
clamped_start_position_zero_pos(
555+
primary_start.0,
556+
0,
557+
),
550558
stepsize,
551559
meta_data_file_names[2].clone(),
552560
);
@@ -580,6 +588,63 @@ pub fn uniwig_main(
580588
);
581589
}
582590
}
591+
"npy" => {
592+
// populate hashmap for the npy meta data
593+
for chromosome in final_chromosomes.iter() {
594+
let chr_name = chromosome.chrom.clone();
595+
let current_chrom_size =
596+
*chrom_sizes.get(&chromosome.chrom).unwrap() as i32;
597+
npy_meta_data_map.insert(
598+
chr_name,
599+
HashMap::from([
600+
("stepsize".to_string(), stepsize),
601+
("reported_chrom_size".to_string(), current_chrom_size),
602+
]),
603+
);
604+
}
605+
606+
for location in vec_count_type.iter() {
607+
let temp_meta_file_name =
608+
format!("{}{}.{}", bwfileheader, *location, "meta");
609+
610+
if let Ok(file) = File::open(&temp_meta_file_name) {
611+
let reader = BufReader::new(file);
612+
613+
for line in reader.lines() {
614+
let line = line.unwrap();
615+
let parts: Vec<&str> = line.split_whitespace().collect();
616+
if parts.len() >= 3 {
617+
let chrom = parts[1].split('=').nth(1).expect(
618+
"Processing npy metadata file: Missing chromosome in line",
619+
);
620+
let start_str = parts[2].split('=')
621+
.nth(1)
622+
.expect("Processing npy metadata file: Missing start position in line");
623+
let starting_position: i32 = start_str.parse().expect(
624+
"Processing npy metadata file: Invalid start position",
625+
);
626+
627+
if let Some(current_chr_data) = npy_meta_data_map.get_mut(chrom)
628+
{
629+
current_chr_data.insert(
630+
(*location.to_string()).parse().unwrap(),
631+
starting_position,
632+
);
633+
}
634+
}
635+
}
636+
// Remove the file after it is used.
637+
let path = std::path::Path::new(&temp_meta_file_name);
638+
let _ = remove_file(path).unwrap();
639+
}
640+
}
641+
//write combined metadata as json
642+
let json_string = serde_json::to_string_pretty(&npy_meta_data_map).unwrap();
643+
let combined_npy_meta_file_path =
644+
format!("{}{}.{}", bwfileheader, "npy_meta", "json");
645+
let mut file = File::create(combined_npy_meta_file_path).unwrap();
646+
file.write_all(json_string.as_bytes()).unwrap();
647+
}
583648
_ => {}
584649
}
585650
bar.finish();

0 commit comments

Comments
 (0)