Skip to content

Commit

Permalink
all Python tests (so far) pass
Browse files Browse the repository at this point in the history
  • Loading branch information
CarlKCarlK committed Jan 1, 2024
1 parent 90e9ab2 commit 5cc2655
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 113 deletions.
40 changes: 32 additions & 8 deletions bed_reader/_open_bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,16 @@ def __init__(
self.count_A1 = count_A1
self._num_threads = num_threads
self.skip_format_check = skip_format_check
self._fam_location = (self._path_or_url(fam_location) if fam_location is not None else self._replace_extension(self.location, "fam"))
self._bim_location = (self._path_or_url(bim_location) if bim_location is not None else self._replace_extension(self.location, "bim"))
self._fam_location = (
self._path_or_url(fam_location)
if fam_location is not None
else self._replace_extension(self.location, "fam")
)
self._bim_location = (
self._path_or_url(bim_location)
if bim_location is not None
else self._replace_extension(self.location, "bim")
)

self.properties_dict, self._counts = open_bed._fix_up_properties(
properties, iid_count, sid_count, use_fill_sequence=False
Expand All @@ -236,7 +244,7 @@ def __init__(
if self._is_url(self.location):
check_file_cloud(self.location.geturl())
else:
with open(self.filepath, "rb") as filepointer:
with open(self.location, "rb") as filepointer:
self._check_file(filepointer)

@staticmethod
Expand Down Expand Up @@ -270,7 +278,9 @@ def _path_or_url(input):
return input
if isinstance(input, UrlParseResult):
return input
assert isinstance(input, str), "Expected a string or Path object or UrlParseResult"
assert isinstance(
input, str
), "Expected a string or Path object or UrlParseResult"
parsed = urlparse(input)
if parsed.scheme and "://" in input:
return parsed
Expand Down Expand Up @@ -1019,9 +1029,23 @@ def _count(self, suffix):
if count is None:
location = self._property_location(suffix)
if open_bed._is_url(location):
# should not download twice
file_bytes = bytes(url_to_bytes(location.geturl()))
count = _rawincount(BytesIO(file_bytes))
# should not download twice from cloud
if suffix == "fam":
if self.property_item("iid") is None:
# ... unless user doesn't want iid
file_bytes = bytes(url_to_bytes(location.geturl()))
count = _rawincount(BytesIO(file_bytes))
else:
count = len(self.iid)
elif suffix == "bim":
if self.property_item("sid") is None:
# ... unless user doesn't want sid
file_bytes = bytes(url_to_bytes(location.geturl()))
count = _rawincount(BytesIO(file_bytes))
else:
count = len(self.sid)
else:
raise ValueError("real assert")
else:
count = _rawincount(open(location, "rb"))
self._counts[suffix] = count
Expand Down Expand Up @@ -1079,7 +1103,7 @@ def shape(self):
(3, 4)
"""
return (len(self.iid), len(self.sid))
return (self.iid_count, self.sid_count)

@staticmethod
def _split_index(index):
Expand Down
211 changes: 106 additions & 105 deletions bed_reader/tests/test_open_bed_cloud.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# import logging
import logging
# import os
# import platform
from pathlib import Path
Expand Down Expand Up @@ -181,119 +181,120 @@ def test_cloud_bad_bed(shared_datadir):
open_bed(file_to_url(shared_datadir / "badfile.bed"), skip_format_check=True)


# def test_cloud_bad_dtype_or_order(shared_datadir):
# with pytest.raises(ValueError):
# open_bed(shared_datadir / "some_missing.bed").read(dtype=np.int32)
# with pytest.raises(ValueError):
# open_bed(shared_datadir / "some_missing.bed").read(order="X")
# with pytest.raises(ValueError):
# open_bed(shared_datadir / "some_missing.bed").read_sparse(dtype=np.int32)
def test_cloud_bad_dtype_or_order(shared_datadir):
    """Check that invalid ``dtype``/``order`` arguments raise ``ValueError``.

    Each case re-opens ``some_missing.bed`` through ``file_to_url`` and calls
    one reader method with one bad argument; every call is expected to raise
    ``ValueError``.
    """
    # (reader method name, keyword arguments expected to be rejected)
    bad_calls = [
        ("read", {"dtype": np.int32}),
        ("read", {"order": "X"}),
        ("read_sparse", {"dtype": np.int32}),
    ]
    for method_name, kwargs in bad_calls:
        # Opening stays inside the raises-context, one fresh open per case.
        with pytest.raises(ValueError):
            bed = open_bed(file_to_url(shared_datadir / "some_missing.bed"))
            getattr(bed, method_name)(**kwargs)


# def setting_generator(seq_dict, seed=9392):
# import itertools
# cmk similar code elsewhere
def setting_generator(seq_dict, seed=9392):
    """Yield setting dicts built from the candidate values in ``seq_dict``.

    ``seq_dict`` maps each setting name to a sequence of candidate values.
    Settings are produced in two phases:

    1. A deterministic sweep of ``max(len(candidates))`` settings. In sweep
       step ``i`` the key at position ``p`` takes the candidate at index
       ``(i + p) % len(candidates)``.
    2. Every combination of candidates (the Cartesian product), shuffled
       reproducibly with ``RandomState(seed)``.

    A candidate equal to the string ``"leave_out"`` means "omit this key from
    the yielded setting".

    An empty ``seq_dict`` yields exactly one empty setting (from the product
    phase) rather than raising.

    Args:
        seq_dict: Mapping of setting name to a non-empty sequence of candidates.
        seed: Seed for the shuffle of the combination phase.

    Yields:
        dict: One setting per step, mapping setting name to chosen value.
    """
    import itertools

    from numpy.random import RandomState

    def _without_left_out(pairs):
        # Check the type before comparing: candidates may be non-strings
        # (e.g. NumPy arrays), for which ``==`` does not give a plain bool.
        return {
            key: value
            for key, value in pairs
            if not (isinstance(value, str) and value == "leave_out")
        }

    keys = list(seq_dict)
    value_lists = list(seq_dict.values())

    # ``default=0`` keeps an empty ``seq_dict`` from raising in ``max``.
    longest = max((len(value_list) for value_list in value_lists), default=0)

    # Phase 1: rotate through every candidate list, offset by key position.
    for test_index in range(longest):
        yield _without_left_out(
            (key, value_list[(test_index + offset) % len(value_list)])
            for offset, (key, value_list) in enumerate(zip(keys, value_lists))
        )

    # Phase 2: all combinations, in a seed-determined order.
    all_combo = list(itertools.product(*value_lists))
    random_state = RandomState(seed)
    random_state.shuffle(all_combo)
    for combo in all_combo:
        # Each product tuple has exactly one value per key, so ``zip`` suffices.
        yield _without_left_out(zip(keys, combo))

# def _not_set_to_none(settings, key):
# return key not in settings or settings[key] is not None

# for test_index, settings in enumerate(setting_generator(seq_dict)):
# if test_index >= test_count:
# break
# with open_bed(
# file,
# iid_count=settings.get("iid_count"),
# sid_count=settings.get("sid_count"),
# properties={
# k: v for k, v in settings.items() if k in {"iid", "sid", "chromosome"}
# },
# ) as bed:
# logging.info(f"Test {test_count}")
# if settings["iid_before_read"]:
# if _not_set_to_none(settings, "iid"):
# assert np.array_equal(bed.iid, iid_list)
# else:
# assert bed.iid is None
# if settings["sid_before_read"]:
# if _not_set_to_none(settings, "sid"):
# assert np.array_equal(bed.sid, sid_list)
# else:
# assert bed.sid is None
# if settings["chromosome_before_read"]:
# if _not_set_to_none(settings, "chromosome"):
# assert np.array_equal(bed.chromosome, chromosome_list)
# else:
# assert bed.chromosome is None
# val = bed.read()
# assert val.shape == (
# len(iid_list),
# len(sid_list),
# )
# val_sparse = bed.read_sparse()
# assert np.allclose(val, val_sparse.toarray(), equal_nan=True)
# if settings["iid_after_read"]:
# if _not_set_to_none(settings, "iid"):
# assert np.array_equal(bed.iid, iid_list)
# else:
# assert bed.iid is None
# if settings["sid_after_read"]:
# if _not_set_to_none(settings, "sid"):
# assert np.array_equal(bed.sid, sid_list)
# else:
# assert bed.sid is None
# if settings["chromosome_after_read"]:
# if _not_set_to_none(settings, "chromosome"):
# assert np.array_equal(bed.chromosome, chromosome_list)
# else:
# assert bed.chromosome is None
# # bed._assert_iid_sid_chromosome()
def test_cloud_properties(shared_datadir):
    """Exercise ``iid``/``sid``/``chromosome`` overrides on a ``file_to_url`` location.

    The file is first opened with defaults to capture reference lists. Then,
    for up to ``test_count`` settings produced by ``setting_generator``, it is
    re-opened with the generated counts and properties, and each property is
    checked before and/or after ``read()``/``read_sparse()`` as the setting
    dictates.
    """
    file = file_to_url(shared_datadir / "plink_sim_10s_100v_10pmiss.bed")
    with open_bed(file) as bed:
        iid_list = bed.iid.tolist()
        sid_list = bed.sid.tolist()
        chromosome_list = bed.chromosome.tolist()

    test_count = 75

    # Reference values, in the order the properties are checked.
    expected = {
        "iid": iid_list,
        "sid": sid_list,
        "chromosome": chromosome_list,
    }

    seq_dict = {
        "iid": ["leave_out", None, iid_list, np.array(iid_list)],
        "iid_count": ["leave_out", len(iid_list)],
        "iid_before_read": [False, True],
        "iid_after_read": [False, True],
        "sid": ["leave_out", None, sid_list, np.array(sid_list)],
        "sid_count": [None, len(sid_list)],
        "sid_before_read": [False, True],
        "sid_after_read": [False, True],
        "chromosome": ["leave_out", None, chromosome_list, np.array(chromosome_list)],
        "chromosome_before_read": [False, True],
        "chromosome_after_read": [False, True],
    }

    def _not_set_to_none(settings, key):
        # True when the key was left out or given a real (non-None) value.
        return key not in settings or settings[key] is not None

    def _check_properties(bed, settings, phase):
        # ``phase`` is "before_read" or "after_read"; a property is only
        # checked when its ``<name>_<phase>`` flag is set in ``settings``.
        for name, expected_list in expected.items():
            if not settings[f"{name}_{phase}"]:
                continue
            if _not_set_to_none(settings, name):
                assert np.array_equal(getattr(bed, name), expected_list)
            else:
                assert getattr(bed, name) is None

    for test_index, settings in enumerate(setting_generator(seq_dict)):
        if test_index >= test_count:
            break
        with open_bed(
            file,
            iid_count=settings.get("iid_count"),
            sid_count=settings.get("sid_count"),
            properties={
                k: v for k, v in settings.items() if k in {"iid", "sid", "chromosome"}
            },
        ) as bed:
            # Log the index of the setting under test, not the fixed limit.
            logging.info(f"Test {test_index}")
            _check_properties(bed, settings, "before_read")
            val = bed.read()
            assert val.shape == (
                len(iid_list),
                len(sid_list),
            )
            val_sparse = bed.read_sparse()
            assert np.allclose(val, val_sparse.toarray(), equal_nan=True)
            _check_properties(bed, settings, "after_read")
            # bed._assert_iid_sid_chromosome()


# def test_cloud_c_reader_bed(shared_datadir):
Expand Down

0 comments on commit 5cc2655

Please sign in to comment.