From 9fc35fd00d1b34c58629479185941da2a8976f55 Mon Sep 17 00:00:00 2001 From: Carl Kadie Date: Mon, 1 Jan 2024 13:28:11 -0800 Subject: [PATCH] all tests pass --- bed_reader/__init__.py | 1 - bed_reader/_open_bed.py | 101 +++++++++++------------- bed_reader/tests/test_open_bed.py | 2 +- bed_reader/tests/test_open_bed_cloud.py | 36 +-------- 4 files changed, 50 insertions(+), 90 deletions(-) diff --git a/bed_reader/__init__.py b/bed_reader/__init__.py index b588b3f..53c488e 100644 --- a/bed_reader/__init__.py +++ b/bed_reader/__init__.py @@ -8,7 +8,6 @@ from bed_reader._open_bed import get_num_threads, open_bed from bed_reader._sample_data import sample_file, tmp_path from bed_reader._to_bed import to_bed -# cmk from bed_reader._open_bed_cloud import open_bed_cloud from .bed_reader import ( file_aat_piece_f32_orderf, diff --git a/bed_reader/_open_bed.py b/bed_reader/_open_bed.py index 5eb35b4..dd4aec8 100644 --- a/bed_reader/_open_bed.py +++ b/bed_reader/_open_bed.py @@ -88,6 +88,7 @@ class open_bed: Parameters ---------- + cmk update docs filepath: pathlib.Path or str File path to the .bed file. iid_count: None or int, optional @@ -210,7 +211,6 @@ class open_bed: def __init__( self, location: Union[str, Path, UrlParseResult], - # cmk must also accept old value iid_count: Optional[int] = None, sid_count: Optional[int] = None, properties: Mapping[str, List[Any]] = {}, @@ -218,11 +218,17 @@ def __init__( num_threads: Optional[int] = None, skip_format_check: bool = False, fam_location: Union[str, Path, UrlParseResult] = None, - # cmk must also accept old value bim_location: Union[str, Path, UrlParseResult] = None, - # cmk must also accept old value + + # accept old keywords + filepath: Union[str, Path] = None, + fam_filepath: Union[str, Path] = None, + bim_filepath: Union[str, Path] = None, ): - # cmk need to read the .fam and .bim files and check file from cloud if requested + location = self._combined(location, filepath, "location", "filepath") + fam_location = self._combined(fam_location, fam_filepath, "fam_location", "fam_filepath") + bim_location = self._combined(bim_location, bim_filepath, "bim_location", "bim_filepath") + self.location = self._path_or_url(location) self.count_A1 = count_A1 self._num_threads = num_threads @@ -250,6 +256,17 @@ def __init__( with open(self.location, "rb") as filepointer: self._check_file(filepointer) + # # its an error to set both location and filepath + # location = self._combined(location, filepath, "location", "filepath") + # fam_location = self._combined(fam_location, fam_filepath, "fam_location", "fam_filepath") + # bim_location = self._combined(bim_location, bim_filepath, "bim_location", "bim_filepath") + @staticmethod + def _combined(location, filepath, location_name, filepath_name): + if location is not None and filepath is not None: + raise ValueError(f"Cannot set both {location_name} and {filepath_name}") + # None, None is ok for now + return location if location is not None else filepath + @staticmethod def _replace_extension(location, extension): if open_bed._is_url(location): @@ -436,33 +453,10 @@ def read( val = np.zeros((len(iid_index), len(sid_index)), order=order, dtype=dtype) - # cmk similar code in sparse if self.iid_count > 0 and self.sid_count > 0: - if dtype == np.int8: - # cmk000 - file_reader = read_i8 - cloud_reader = read_cloud_i8 - elif dtype == np.float64: - file_reader = read_f64 - cloud_reader = read_cloud_f64 - elif dtype == np.float32: - file_reader = read_f32 - cloud_reader = read_cloud_f32 - else: - raise ValueError( - f"dtype '{val.dtype}' not known, only " - + "'int8', 'float32', and 'float64' are allowed." - ) - - if open_bed._is_url(self.location): - reader = cloud_reader - location_str = self.location.geturl() - else: - reader = file_reader - location_str = str(self.location.as_posix()) + reader, location_str = self._pick_reader(dtype) reader( - # cmk000 location_str, iid_count=self.iid_count, sid_count=self.sid_count, @@ -474,7 +468,6 @@ def read( ) else: - # cmk assert not a cloud read if not self.count_A1: byteZero = 0 byteThree = 2 @@ -532,6 +525,30 @@ def read( return val + def _pick_reader(self, dtype): + if dtype == np.int8: + file_reader = read_i8 + cloud_reader = read_cloud_i8 + elif dtype == np.float64: + file_reader = read_f64 + cloud_reader = read_cloud_f64 + elif dtype == np.float32: + file_reader = read_f32 + cloud_reader = read_cloud_f32 + else: + raise ValueError( + f"dtype '{dtype}' not known, only " + + "'int8', 'float32', and 'float64' are allowed." + ) + + if open_bed._is_url(self.location): + reader = cloud_reader + location_str = self.location.geturl() + else: + reader = file_reader + location_str = str(self.location.as_posix()) + return reader, location_str + def __str__(self) -> str: return f"{self.__class__.__name__}('{self.location}',...)" @@ -1239,7 +1256,7 @@ def _read_fam_or_bim(self, suffix): file_bytes = bytes(url_to_bytes(property_location.geturl())) if len(file_bytes) == 0: columns, row_count = [], 0 - else: # cmk similar code + else: # note similar code below columns, row_count = _read_csv( BytesIO(file_bytes), delimiter=delimiter, @@ -1482,28 +1499,7 @@ def read_sparse( indices = [np.empty(0, dtype=np.int32)] if self.iid_count > 0 and self.sid_count > 0: - if dtype == np.int8: - # cmk000 - file_reader = read_i8 - cloud_reader = read_cloud_i8 - elif dtype == np.float64: - file_reader = read_f64 - cloud_reader = read_cloud_f64 - elif dtype == np.float32: - file_reader = read_f32 - cloud_reader = read_cloud_f32 - else: - raise ValueError( - f"dtype '{dtype}' not known, only " - + "'int8', 'float32', and 'float64' are allowed." - ) - - if open_bed._is_url(self.location): - reader = cloud_reader - location_str = self.location.geturl() - else: - reader = file_reader - location_str = str(self.location.as_posix()) + reader, location_str = self._pick_reader(dtype) if format == "csc": val = np.zeros((len(iid_index), batch_size), order=order, dtype=dtype) @@ -1652,4 +1648,3 @@ def _convert_to_dtype(str_arr, dtype): logging.basicConfig(level=logging.INFO) pytest.main(["--doctest-modules", __file__]) -# cmk000 look for every self.filepath and fam_file and .bim_file diff --git a/bed_reader/tests/test_open_bed.py b/bed_reader/tests/test_open_bed.py index 4fca6dc..ff281ce 100644 --- a/bed_reader/tests/test_open_bed.py +++ b/bed_reader/tests/test_open_bed.py @@ -786,7 +786,7 @@ def test_fam_bim_filepath(shared_datadir, tmp_path): ) assert output_file.exists() and fam_file.exists() and bim_file.exists() - with open_bed(output_file, fam_location=fam_file, bim_location=bim_file) as deb: + with open_bed(output_file, fam_filepath=fam_file, bim_filepath=bim_file) as deb: val2 = deb.read() assert np.allclose(val, val2, equal_nan=True) val_sparse = deb.read_sparse() diff --git a/bed_reader/tests/test_open_bed_cloud.py b/bed_reader/tests/test_open_bed_cloud.py index 8beb0f9..8648675 100644 --- a/bed_reader/tests/test_open_bed_cloud.py +++ b/bed_reader/tests/test_open_bed_cloud.py @@ -7,6 +7,7 @@ import pytest from bed_reader import open_bed, to_bed, subset_f64_f64 +from bed_reader.tests.test_open_bed import setting_generator, reference_val def test_cloud_read1(shared_datadir): @@ -14,7 +15,6 @@ def test_cloud_read1(shared_datadir): file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed" file = "file:///" + str(file.as_posix()) - # cmk cmk next up, need to see if this is right and need to pass it to Rust with open_bed(file) as bed: assert bed.iid_count == 10 @@ -190,35 +190,6 @@ def test_cloud_bad_dtype_or_order(shared_datadir): open_bed(file_to_url(shared_datadir / "some_missing.bed")).read_sparse(dtype=np.int32) -# cmk similar code elsewhere -def setting_generator(seq_dict, seed=9392): - import itertools - - from numpy.random import RandomState - - longest = max((len(value_list) for value_list in seq_dict.values())) - - for test_index in range(longest): - setting = {} - for offset, (key, value_list) in enumerate(seq_dict.items()): - val = value_list[(test_index + offset) % len(value_list)] - if not (isinstance(val, str) and "leave_out" == val): - setting[key] = val - yield setting - - all_combo = list(itertools.product(*seq_dict.values())) - - random_state = RandomState(seed) - random_state.shuffle(all_combo) - for combo in all_combo: - setting = { - key: value_list - for key, value_list in itertools.zip_longest(seq_dict, combo) - if not (isinstance(value_list, str) and "leave_out" == value_list) - } - yield setting - - def test_cloud_properties(shared_datadir): file = file_to_url(shared_datadir / "plink_sim_10s_100v_10pmiss.bed") with open_bed(file) as bed: @@ -333,11 +304,6 @@ def test_cloud_c_reader_bed(shared_datadir): ) -def reference_val(shared_datadir): - val = np.load(shared_datadir / "some_missing.val.npy") - return val - - def test_cloud_bed_int8(tmp_path, shared_datadir): with open_bed(file_to_url(shared_datadir / "some_missing.bed")) as bed: for force_python_only in [False, True]: