Skip to content

Commit

Permalink
all tests pass
Browse files Browse the repository at this point in the history
  • Loading branch information
CarlKCarlK committed Jan 1, 2024
1 parent dcde21e commit 9fc35fd
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 90 deletions.
1 change: 0 additions & 1 deletion bed_reader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from bed_reader._open_bed import get_num_threads, open_bed
from bed_reader._sample_data import sample_file, tmp_path
from bed_reader._to_bed import to_bed
# cmk from bed_reader._open_bed_cloud import open_bed_cloud

from .bed_reader import (
file_aat_piece_f32_orderf,
Expand Down
101 changes: 48 additions & 53 deletions bed_reader/_open_bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ class open_bed:
Parameters
----------
cmk update docs
filepath: pathlib.Path or str
File path to the .bed file.
iid_count: None or int, optional
Expand Down Expand Up @@ -210,19 +211,24 @@ class open_bed:
def __init__(
self,
location: Union[str, Path, UrlParseResult],
# cmk must also accept old value
iid_count: Optional[int] = None,
sid_count: Optional[int] = None,
properties: Mapping[str, List[Any]] = {},
count_A1: bool = True,
num_threads: Optional[int] = None,
skip_format_check: bool = False,
fam_location: Union[str, Path, UrlParseResult] = None,
# cmk must also accept old value
bim_location: Union[str, Path, UrlParseResult] = None,
# cmk must also accept old value

# accept old keywords
filepath: Union[str, Path] = None,
fam_filepath: Union[str, Path] = None,
bim_filepath: Union[str, Path] = None,
):
# cmk need to read the .fam and .bim files and check file from cloud if requested
location = self._combined(location, filepath, "location", "filepath")
fam_location = self._combined(fam_location, fam_filepath, "fam_location", "fam_filepath")
bim_location = self._combined(bim_location, bim_filepath, "bim_location", "bim_filepath")

self.location = self._path_or_url(location)
self.count_A1 = count_A1
self._num_threads = num_threads
Expand Down Expand Up @@ -250,6 +256,17 @@ def __init__(
with open(self.location, "rb") as filepointer:
self._check_file(filepointer)

# # it's an error to set both location and filepath
# location = self._combined(location, filepath, "location", "filepath")
# fam_location = self._combined(fam_location, fam_filepath, "fam_location", "fam_filepath")
# bim_location = self._combined(bim_location, bim_filepath, "bim_location", "bim_filepath")
@staticmethod
def _combined(location, filepath, location_name, filepath_name):
    """Merge a new-style argument with its old-style keyword alias.

    Parameters
    ----------
    location
        Value given through the new keyword (or None).
    filepath
        Value given through the old keyword (or None).
    location_name: str
        Name of the new keyword, used only in the error message.
    filepath_name: str
        Name of the old keyword, used only in the error message.

    Returns
    -------
    The one value that is not None, or None when neither was given.

    Raises
    ------
    ValueError
        If both values are given (neither is None).
    """
    if location is None:
        # Covers both "only the old keyword given" and "neither given";
        # the latter simply yields None.
        return filepath
    if filepath is None:
        return location
    raise ValueError(f"Cannot set both {location_name} and {filepath_name}")

@staticmethod
def _replace_extension(location, extension):
if open_bed._is_url(location):
Expand Down Expand Up @@ -436,33 +453,10 @@ def read(

val = np.zeros((len(iid_index), len(sid_index)), order=order, dtype=dtype)

# cmk similar code in sparse
if self.iid_count > 0 and self.sid_count > 0:
if dtype == np.int8:
# cmk000
file_reader = read_i8
cloud_reader = read_cloud_i8
elif dtype == np.float64:
file_reader = read_f64
cloud_reader = read_cloud_f64
elif dtype == np.float32:
file_reader = read_f32
cloud_reader = read_cloud_f32
else:
raise ValueError(
f"dtype '{val.dtype}' not known, only "
+ "'int8', 'float32', and 'float64' are allowed."
)

if open_bed._is_url(self.location):
reader = cloud_reader
location_str = self.location.geturl()
else:
reader = file_reader
location_str = str(self.location.as_posix())
reader, location_str = self._pick_reader(dtype)

reader(
# cmk000
location_str,
iid_count=self.iid_count,
sid_count=self.sid_count,
Expand All @@ -474,7 +468,6 @@ def read(
)

else:
# cmk assert not a cloud read
if not self.count_A1:
byteZero = 0
byteThree = 2
Expand Down Expand Up @@ -532,6 +525,30 @@ def read(

return val

def _pick_reader(self, dtype):
    """Choose the reader function and location string for ``dtype``.

    Each supported dtype (``np.int8``, ``np.float64``, ``np.float32``) has a
    file reader and a cloud reader. The cloud reader is chosen when
    ``self.location`` is a URL; otherwise the file reader is chosen.

    Parameters
    ----------
    dtype
        The requested value dtype; compared with ``==`` against the
        supported dtypes.

    Returns
    -------
    tuple
        ``(reader, location_str)`` where ``location_str`` is
        ``self.location.geturl()`` for a URL and the POSIX form of the
        path otherwise.

    Raises
    ------
    ValueError
        If ``dtype`` is not one of the supported dtypes.
    """
    # Ordered (dtype, file reader, cloud reader) triples; matched with ``==``
    # in this order rather than by hashing.
    supported = (
        (np.int8, read_i8, read_cloud_i8),
        (np.float64, read_f64, read_cloud_f64),
        (np.float32, read_f32, read_cloud_f32),
    )
    for known_dtype, local_reader, remote_reader in supported:
        if dtype == known_dtype:
            break
    else:
        raise ValueError(
            f"dtype '{dtype}' not known, only "
            + "'int8', 'float32', and 'float64' are allowed."
        )

    if open_bed._is_url(self.location):
        return remote_reader, self.location.geturl()
    return local_reader, str(self.location.as_posix())

def __str__(self) -> str:
    """Return a short description: the class name followed by the location."""
    class_name = self.__class__.__name__
    where = self.location
    return f"{class_name}('{where}',...)"

Expand Down Expand Up @@ -1239,7 +1256,7 @@ def _read_fam_or_bim(self, suffix):
file_bytes = bytes(url_to_bytes(property_location.geturl()))
if len(file_bytes) == 0:
columns, row_count = [], 0
else: # cmk similar code
else: # note similar code below
columns, row_count = _read_csv(
BytesIO(file_bytes),
delimiter=delimiter,
Expand Down Expand Up @@ -1482,28 +1499,7 @@ def read_sparse(
indices = [np.empty(0, dtype=np.int32)]

if self.iid_count > 0 and self.sid_count > 0:
if dtype == np.int8:
# cmk000
file_reader = read_i8
cloud_reader = read_cloud_i8
elif dtype == np.float64:
file_reader = read_f64
cloud_reader = read_cloud_f64
elif dtype == np.float32:
file_reader = read_f32
cloud_reader = read_cloud_f32
else:
raise ValueError(
f"dtype '{dtype}' not known, only "
+ "'int8', 'float32', and 'float64' are allowed."
)

if open_bed._is_url(self.location):
reader = cloud_reader
location_str = self.location.geturl()
else:
reader = file_reader
location_str = str(self.location.as_posix())
reader, location_str = self._pick_reader(dtype)

if format == "csc":
val = np.zeros((len(iid_index), batch_size), order=order, dtype=dtype)
Expand Down Expand Up @@ -1652,4 +1648,3 @@ def _convert_to_dtype(str_arr, dtype):
logging.basicConfig(level=logging.INFO)

pytest.main(["--doctest-modules", __file__])
# cmk000 look for every self.filepath and fam_file and .bim_file
2 changes: 1 addition & 1 deletion bed_reader/tests/test_open_bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,7 @@ def test_fam_bim_filepath(shared_datadir, tmp_path):
)
assert output_file.exists() and fam_file.exists() and bim_file.exists()

with open_bed(output_file, fam_location=fam_file, bim_location=bim_file) as deb:
with open_bed(output_file, fam_filepath=fam_file, bim_filepath=bim_file) as deb:
val2 = deb.read()
assert np.allclose(val, val2, equal_nan=True)
val_sparse = deb.read_sparse()
Expand Down
36 changes: 1 addition & 35 deletions bed_reader/tests/test_open_bed_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
import pytest

from bed_reader import open_bed, to_bed, subset_f64_f64
from bed_reader.tests.test_open_bed import setting_generator, reference_val


def test_cloud_read1(shared_datadir):
import math

file = shared_datadir / "plink_sim_10s_100v_10pmiss.bed"
file = "file:///" + str(file.as_posix())
# cmk cmk next up, need to see if this is right and need to pass it to Rust

with open_bed(file) as bed:
assert bed.iid_count == 10
Expand Down Expand Up @@ -190,35 +190,6 @@ def test_cloud_bad_dtype_or_order(shared_datadir):
open_bed(file_to_url(shared_datadir / "some_missing.bed")).read_sparse(dtype=np.int32)


# cmk similar code elsewhere
def setting_generator(seq_dict, seed=9392):
    """Yield keyword-setting dicts built from the value lists in ``seq_dict``.

    Two phases are produced, in order:

    1. One setting per index of the longest value list. Each key takes the
       value at ``(index + key_position) % len(values)``, so every value of
       every key is used at least once.
    2. Every combination of values (the full Cartesian product), shuffled
       with ``numpy.random.RandomState(seed)``.

    In both phases a value equal to the string ``"leave_out"`` means the key
    is omitted from that setting.

    Parameters
    ----------
    seq_dict: dict
        Maps each setting name to the list of values to try.
    seed: int, optional
        Seed for the shuffle of the second phase.

    Yields
    ------
    dict
        A mapping from setting name to the chosen value.
    """
    import itertools

    from numpy.random import RandomState

    def _left_out(candidate):
        # Only the literal string marker counts; other types are never
        # compared against it.
        return isinstance(candidate, str) and candidate == "leave_out"

    round_count = max(len(choices) for choices in seq_dict.values())

    # Phase 1: rotate through each list so every value is seen at least once.
    for round_index in range(round_count):
        picked = {}
        for shift, (name, choices) in enumerate(seq_dict.items()):
            choice = choices[(round_index + shift) % len(choices)]
            if _left_out(choice):
                continue
            picked[name] = choice
        yield picked

    # Phase 2: all combinations, in a seeded pseudo-random order.
    shuffled = list(itertools.product(*seq_dict.values()))
    RandomState(seed).shuffle(shuffled)
    for combo in shuffled:
        yield {
            name: choice
            for name, choice in itertools.zip_longest(seq_dict, combo)
            if not _left_out(choice)
        }


def test_cloud_properties(shared_datadir):
file = file_to_url(shared_datadir / "plink_sim_10s_100v_10pmiss.bed")
with open_bed(file) as bed:
Expand Down Expand Up @@ -333,11 +304,6 @@ def test_cloud_c_reader_bed(shared_datadir):
)


def reference_val(shared_datadir):
    """Load the array saved as ``some_missing.val.npy`` under ``shared_datadir``."""
    return np.load(shared_datadir / "some_missing.val.npy")


def test_cloud_bed_int8(tmp_path, shared_datadir):
with open_bed(file_to_url(shared_datadir / "some_missing.bed")) as bed:
for force_python_only in [False, True]:
Expand Down

0 comments on commit 9fc35fd

Please sign in to comment.