Skip to content

Commit dc84ca8

Browse files
committed
fix up comments, etc
1 parent 66597a9 commit dc84ca8

File tree

5 files changed

+55
-700
lines changed

5 files changed

+55
-700
lines changed

bed_reader/_open_bed.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,6 +1099,7 @@ def _count(self, suffix):
10991099
if self.property_item("iid") is None:
11001100
# ... unless user doesn't want iid
11011101
file_bytes = bytes(
1102+
# cmk test this failing with bad url, options, and file-not-found
11021103
url_to_bytes(location.geturl(), self.cloud_options)
11031104
)
11041105
count = _rawincount(BytesIO(file_bytes))

src/bed_cloud.rs

Lines changed: 14 additions & 175 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
#[cfg(not(target_pointer_width = "64"))]
2+
compile_error!(
3+
"This code requires a 64-bit target architecture because the cloud-library assumes it."
4+
);
5+
16
use anyinput::anyinput;
27
use bytes::Bytes;
38
use core::fmt;
@@ -156,16 +161,14 @@ fn convert_negative_sid_index(
156161
Ok(in_sid_i_signed as u64)
157162
} else if (lower_sid_count..=-1).contains(&in_sid_i_signed) {
158163
#[allow(clippy::cast_sign_loss)]
159-
Ok((in_sid_i_signed - lower_sid_count) as u64) // cmk not sure about overflow
164+
Ok((in_sid_i_signed - lower_sid_count) as u64)
160165
} else {
161166
Err(Box::new(BedErrorPlus::BedError(BedError::SidIndexTooBig(
162167
in_sid_i_signed,
163168
))))
164169
}
165170
}
166171

167-
// cmk somehow we must only compile if size(usize) is 64 bits.
168-
169172
#[allow(clippy::too_many_arguments)]
170173
#[allow(clippy::similar_names)]
171174
async fn internal_read_no_alloc<TVal: BedVal, TObjectStore>(
@@ -272,11 +275,7 @@ fn decode_bytes_into_columns<TVal: BedVal>(
272275
) {
273276
for (bytes, out_sid_i) in bytes_slice.iter().zip(out_sid_i_vec.into_iter()) {
274277
let mut col = out_val.column_mut(out_sid_i);
275-
// // cmk In parallel, decompress the iid info and put it in its column
276-
// // cmk .par_bridge() // This seems faster that parallel zip
277-
// .try_for_each(|(bytes_vector_result, mut col)| match bytes_vector_result {
278-
// Err(e) => Err(e),
279-
// Ok(bytes_vector) => {
278+
// LATER: Consider doing this in parallel as in the non-cloud version.
280279
for out_iid_i in 0..iid_index.len() {
281280
let i_div_4 = i_div_4_array[out_iid_i];
282281
let i_mod_4_times_2: u8 = i_mod_4_times_2_array[out_iid_i];
@@ -455,7 +454,6 @@ where
455454
Ok(bed_cloud)
456455
}
457456

458-
/// cmk update docs
459457
/// Create [`BedCloud`](struct.BedCloud.html) from the builder.
460458
///
461459
/// > See [`BedCloud::builder`](struct.BedCloud.html#method.builder) for more details and examples.
@@ -1223,7 +1221,7 @@ where
12231221
/// assert!(dim == (3,4));
12241222
/// # Ok::<(), Box<BedErrorPlus>>(())}).unwrap();
12251223
/// # use {tokio::runtime::Runtime, bed_reader::BedErrorPlus};
1226-
// cmk call these at the same time?
1224+
// LATER: Could these be called at the same time, async?
12271225
pub async fn dim(&mut self) -> Result<(usize, usize), Box<BedErrorPlus>> {
12281226
Ok((self.iid_count().await?, self.sid_count().await?))
12291227
}
@@ -1767,52 +1765,6 @@ where
17671765
.await
17681766
}
17691767

1770-
/// cmk doc
1771-
// have read_and_fill_with_options call this
1772-
pub async fn read_and_fill_with_options_no_mut<TVal: BedVal>(
1773-
&self,
1774-
iid_count: usize,
1775-
sid_count: usize,
1776-
val: &mut nd::ArrayViewMut2<'_, TVal>, //mutable slices additionally allow to modify elements. But slices cannot grow - they are just a view into some vector.,
1777-
read_options: &ReadOptions<TVal>,
1778-
) -> Result<(), Box<BedErrorPlus>> {
1779-
// // must do these one-at-a-time because they mutate self to cache the results
1780-
// let iid_count = self.iid_count().await?;
1781-
// let sid_count = self.sid_count().await?;
1782-
1783-
let max_concurrent_requests =
1784-
compute_max_concurrent_requests(read_options.max_concurrent_requests)?;
1785-
1786-
let max_chunk_size = compute_max_chunk_size(read_options.max_chunk_size)?;
1787-
1788-
// If we already have a Vec<isize>, reference it. If we don't, create one and reference it.
1789-
let iid_hold = Hold::new(&read_options.iid_index, iid_count)?;
1790-
let iid_index = iid_hold.as_ref();
1791-
let sid_hold = Hold::new(&read_options.sid_index, sid_count)?;
1792-
let sid_index = sid_hold.as_ref();
1793-
1794-
let dim = val.dim();
1795-
if dim != (iid_index.len(), sid_index.len()) {
1796-
return Err(Box::new(
1797-
BedError::InvalidShape(iid_index.len(), sid_index.len(), dim.0, dim.1).into(),
1798-
));
1799-
}
1800-
1801-
read_no_alloc(
1802-
&self.object_path,
1803-
iid_count,
1804-
sid_count,
1805-
read_options.is_a1_counted,
1806-
iid_index,
1807-
sid_index,
1808-
read_options.missing_value,
1809-
max_concurrent_requests,
1810-
max_chunk_size,
1811-
&mut val.view_mut(),
1812-
)
1813-
.await
1814-
}
1815-
18161768
/// Read all genotype data into a preallocated array.
18171769
///
18181770
/// > Also see [`ReadOptions::builder`](struct.ReadOptions.html#method.builder).
@@ -1896,122 +1848,7 @@ where
18961848
Ok(val)
18971849
}
18981850

1899-
/// Write genotype data with default metadata.
1900-
///
1901-
/// > Also see [`WriteOptions::builder`](struct.WriteOptions.html#method.builder), which supports metadata and options.
1902-
///
1903-
/// # Errors
1904-
/// See [`BedError`](enum.BedError.html) and [`BedErrorPlus`](enum.BedErrorPlus.html)
1905-
/// for all possible errors.
1906-
///
1907-
/// # Example
1908-
/// In this example, write genotype data using default metadata.
1909-
/// ```ignore // cmk
1910-
/// use ndarray as nd;
1911-
/// use bed_reader::{BedCloud, WriteOptions};
1912-
///
1913-
/// let output_folder = temp_testdir::TempDir::default();
1914-
/// let output_file = output_folder.join("small.bed");
1915-
///
1916-
/// let val = nd::array![[1, 0, -127, 0], [2, 0, -127, 2], [0, 1, 2, 0]];
1917-
/// BedCloud::write(&val, &output_file)?;
1918-
///
1919-
/// // If we then read the new file and list the chromosome property,
1920-
/// // it is an array of zeros, the default chromosome value.
1921-
/// let mut bed_cloud2 = BedCloud::new(&output_file)?;
1922-
/// println!("{:?}", bed_cloud2.chromosome().await?); // Outputs ndarray ["0", "0", "0", "0"]
1923-
/// # use bed_reader::BedErrorPlus;
1924-
/// # Ok::<(), Box<BedErrorPlus>>(())
1925-
/// ```
1926-
// cmk need to do 'write'
1927-
// pub fn write<S: nd::Data<Elem = TVal>, TVal: BedVal>(
1928-
// val: &nd::ArrayBase<S, nd::Ix2>,
1929-
// path: &Path,
1930-
// ) -> Result<(), Box<BedErrorPlus>> {
1931-
// WriteOptions::builder(path).write(val)
1932-
// }
1933-
1934-
/// Given an 2D array of genotype data and a [`WriteOptions`](struct.WriteOptionsBuilder.html), write to a .bed file.
1935-
///
1936-
/// > Also see [`WriteOptionsBuilder::write`](struct.WriteOptionsBuilder.html#method.write), which creates
1937-
/// > a [`WriteOptions`](struct.WriteOptionsBuilder.html) and writes to file in one step.
1938-
///
1939-
/// # Example
1940-
/// ```ignore // cmk
1941-
/// use ndarray as nd;
1942-
/// use bed_reader::{BedCloud, WriteOptions};
1943-
///
1944-
/// let val = nd::array![
1945-
/// [1.0, 0.0, f64::NAN, 0.0],
1946-
/// [2.0, 0.0, f64::NAN, 2.0],
1947-
/// [0.0, 1.0, 2.0, 0.0]
1948-
/// ];
1949-
///
1950-
/// let output_folder = temp_testdir::TempDir::default();
1951-
/// let output_file = output_folder.join("small.bed");
1952-
/// let write_options = WriteOptions::builder(output_file)
1953-
/// .iid(["iid1", "iid2", "iid3"])
1954-
/// .sid(["sid1", "sid2", "sid3", "sid4"])
1955-
/// .build(3,4)?;
1956-
///
1957-
/// BedCloud::write_with_options(&val, &write_options)?;
1958-
/// # use bed_reader::BedErrorPlus;
1959-
/// # Ok::<(), Box<BedErrorPlus>>(())
1960-
/// ```
1961-
// cmk need to do 'write_with_options'
1962-
// pub fn write_with_options<S, TVal>(
1963-
// val: &nd::ArrayBase<S, nd::Ix2>,
1964-
// write_options: &WriteOptions<TVal>,
1965-
// ) -> Result<(), Box<BedErrorPlus>>
1966-
// where
1967-
// S: nd::Data<Elem = TVal>,
1968-
// TVal: BedVal,
1969-
// {
1970-
// let (iid_count, sid_count) = val.dim();
1971-
// if iid_count != write_options.iid_count() {
1972-
// return Err(BedError::InconsistentCount(
1973-
// "iid".to_string(),
1974-
// write_options.iid_count(),
1975-
// iid_count,
1976-
// )
1977-
// .into());
1978-
// }
1979-
// if sid_count != write_options.sid_count() {
1980-
// return Err(BedError::InconsistentCount(
1981-
// "sid".to_string(),
1982-
// write_options.sid_count(),
1983-
// sid_count,
1984-
// )
1985-
// .into());
1986-
// }
1987-
1988-
// let num_threads = compute_num_threads(write_options.num_threads)?;
1989-
// write_val(
1990-
// &write_options.path,
1991-
// val,
1992-
// write_options.is_a1_counted,
1993-
// write_options.missing_value,
1994-
// num_threads,
1995-
// )?;
1996-
1997-
// if !write_options.skip_fam() {
1998-
// if let Err(e) = write_options.metadata.write_fam(write_options.fam_object_path()) {
1999-
// // Clean up the file
2000-
// let _ = fs::remove_file(&write_options.fam_object_path);
2001-
// return Err(e);
2002-
// }
2003-
// }
2004-
2005-
// if !write_options.skip_bim() {
2006-
// if let Err(e) = write_options.metadata.write_bim(write_options.bim_object_path()) {
2007-
// // Clean up the file
2008-
// let _ = fs::remove_file(&write_options.bim_object_path);
2009-
// return Err(e);
2010-
// }
2011-
// }
2012-
2013-
// Ok(())
2014-
// }
1851+
// LATER: Support writing to a BedCloud
20151852

20161853
async fn unlazy_fam<T: FromStringArray<T>>(
20171854
&mut self,
@@ -2227,9 +2064,9 @@ pub struct ObjectPath<TObjectStore>
22272064
where
22282065
TObjectStore: ObjectStore,
22292066
{
2230-
/// cmk doc
2067+
/// An `Arc`-wrapped [`ObjectStore`], for example, an AWS S3 reader or a local file reader.
22312068
pub object_store: Arc<TObjectStore>,
2232-
/// cmk doc
2069+
/// A [`StorePath`] that points to a file on the [`ObjectStore`].
22332070
pub path: StorePath,
22342071
}
22352072

@@ -2287,7 +2124,9 @@ where
22872124
/// ```
22882125
pub async fn size(&self) -> Result<usize, Box<BedErrorPlus>> {
22892126
let get_result = self.get().await?;
2290-
let object_meta = &get_result.meta; // cmk good idea?
2127+
// LATER: See if https://github.com/apache/arrow-rs/issues/5272 is fixed in
2128+
// a way so that only one read is needed.
2129+
let object_meta = &get_result.meta;
22912130
Ok(object_meta.size)
22922131
}
22932132

src/lib.rs

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
#![warn(missing_docs)]
2-
// cmk decide which of these to use
32
#![warn(clippy::pedantic)]
43
#![allow(
5-
clippy::missing_panics_doc,
6-
clippy::missing_errors_doc,
4+
clippy::missing_panics_doc, // LATER: add panics docs
5+
clippy::missing_errors_doc, // LATER: add errors docs
76
clippy::similar_names,
87
clippy::cast_possible_truncation,
98
clippy::cast_possible_wrap,
@@ -3769,7 +3768,6 @@ impl From<()> for Index {
37693768
/// for a list of expressions for selecting individuals (sample)
37703769
/// and SNPs (variants).
37713770
#[derive(Debug, Clone, Builder)]
3772-
// cmk should this be Box<BedErrorPlus>?
37733771
#[builder(build_fn(error = "Box<BedErrorPlus>"))]
37743772
pub struct ReadOptions<TVal: BedVal> {
37753773
/// Value to use for missing values (defaults to -127 or NaN)
@@ -3983,12 +3981,10 @@ pub struct ReadOptions<TVal: BedVal> {
39833981
#[builder(default, setter(strip_option))]
39843982
num_threads: Option<usize>,
39853983

3984+
// LATER: Allow this to be set with an environment variable.
39863985
/// Maximum number of concurrent async requests (defaults to 10) --
39873986
/// Used by `BedCloud`.
39883987
///
3989-
/// cmk Can also be set with an environment variable.
3990-
/// See [Environment Variables](index.html#environment-variables).
3991-
///
39923988
/// In this example, we read using only one request at a time.
39933989
/// ```
39943990
/// use ndarray as nd;
@@ -4014,12 +4010,10 @@ pub struct ReadOptions<TVal: BedVal> {
40144010
#[builder(default, setter(strip_option))]
40154011
max_concurrent_requests: Option<usize>,
40164012

4013+
// LATER: Allow this to be set with an environment variable.
40174014
/// Maximum chunk size of async requests (defaults to 8_000_000 bytes) --
40184015
/// Used by `BedCloud`.
40194016
///
4020-
/// cmk Can also be set with an environment variable.
4021-
/// See [Environment Variables](index.html#environment-variables).
4022-
///
40234017
/// In this example, we read using only 1_000_000 bytes per request.
40244018
/// ```
40254019
/// use ndarray as nd;

0 commit comments

Comments
 (0)