-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_data.py
74 lines (63 loc) · 1.92 KB
/
test_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Test on the data to ensure it is structured as expected.
import os
from pathlib import Path
from typing import Final, Iterator

import dask.dataframe as dd
import pandas as pd
import pytest
from dask.distributed import Client

from speclet.io import DataFile, data_path
# Data tests are opt-in: they are skipped unless the DATA_TESTS environment
# variable is set (any value, including the empty string, enables them).
SKIP_DATA_TESTS = "DATA_TESTS" not in os.environ

# Location of the full DepMap data file as resolved by `speclet.io.data_path`.
FULL_DEPMAP_DATASET_PATH: Final = data_path(DataFile.DEPMAP_DATA)
@pytest.fixture(scope="module")
def setup_dask() -> Iterator[Client]:
    """Provide a Dask distributed client shared by the tests in this module.

    The fixture is module-scoped, so a single client is created for the
    module and closed once the tests that use it are done.

    Yields:
        The ``Client`` created with 4 workers, 2 threads per worker and a
        "16GB" memory limit.
    """
    client = Client(n_workers=4, threads_per_worker=2, memory_limit="16GB")
    yield client
    # Teardown: pytest resumes the generator here after the tests finish.
    client.close()
@pytest.mark.skipif(SKIP_DATA_TESTS, reason="Skip data tests.")
def test_depmap_data_columns_exist(depmap_test_data: Path) -> None:
    """Check that the DepMap test data file contains every expected column.

    Args:
        depmap_test_data: Path to a CSV file; presumably supplied by a pytest
            fixture defined outside this module (e.g. in ``conftest.py``).
    """
    expected_cols = (
        "sgrna",
        "hugo_symbol",
        "depmap_id",
        "replicate_id",
        "lfc",
        "counts_final",
        "p_dna_batch",
        "copy_number",
        "screen",
        "rna_expr",
        "num_mutations",
        "is_mutated",
        "lineage",
        "primary_or_metastasis",
    )
    df = pd.read_csv(depmap_test_data)
    # Collect every absent column so a failure reports all of them at once
    # instead of stopping at the first one.
    missing_cols = [col for col in expected_cols if col not in df.columns]
    assert not missing_cols, f"Missing expected columns: {missing_cols}"
@pytest.mark.skipif(SKIP_DATA_TESTS, reason="Skip data tests.")
def test_depmap_data_no_missing(setup_dask: Client) -> None:
    """Check that key columns of the full DepMap dataset have no missing values.

    Args:
        setup_dask: Dask distributed client provided by the ``setup_dask``
            fixture. It is not referenced directly in the body.
    """
    dask_df: dd.DataFrame = dd.read_csv(
        FULL_DEPMAP_DATASET_PATH,
        # Explicit dtypes for these columns rather than relying on inference.
        dtype={
            "age": "float64",
            "p_dna_batch": "object",
            "primary_or_metastasis": "object",
            "counts_final": "float64",
        },
        low_memory=False,
    )
    cols_without_na = [
        "depmap_id",
        "sgrna",
        "hugo_symbol",
        "lfc",
        "screen",
        "num_mutations",
        "is_mutated",
        "lineage",
    ]
    # Build the reduction lazily on the dask frame and materialize it once.
    # Do not call `.head()` here: it returns an in-memory pandas DataFrame of
    # only the first rows, which has no `.compute()` and would not cover the
    # full dataset.
    na_checks: pd.Series = dask_df[cols_without_na].isna().any().compute()
    # `Series.items()` replaces `iteritems()`, which pandas 2.0 removed.
    for column, any_missing in na_checks.items():
        assert isinstance(column, str)
        assert not any_missing, f"Column '{column}' has missing values."