|
4 | 4 | import numpy as np
|
5 | 5 | import pandas as pd
|
6 | 6 | import pytest
|
| 7 | +from datasets import Dataset |
| 8 | + |
| 9 | +from datasets.features import ( |
| 10 | + Array2D, |
| 11 | + Array3D, |
| 12 | + Array4D, |
| 13 | + Array5D, |
| 14 | + Sequence, |
| 15 | + Value, |
| 16 | +) |
7 | 17 |
|
8 | 18 | from cyclops.data.utils import (
|
9 | 19 | create_indicator_variables,
|
|
12 | 22 | has_range_index,
|
13 | 23 | is_timestamp_series,
|
14 | 24 | to_range_index,
|
| 25 | + feature_is_numeric, |
| 26 | + feature_is_datetime, |
| 27 | + check_required_columns, |
| 28 | + get_columns_as_numpy_array, |
15 | 29 | )
|
16 | 30 |
|
17 | 31 |
|
def test_get_columns_as_numpy_array():
    """Check column extraction from Dataset and dict inputs, plus type rejection."""
    raw = {"column1": [1, 2, 3], "column2": [4, 5, 6]}
    single_expected = np.array([1, 2, 3])
    stacked_expected = np.array([[1, 4], [2, 5], [3, 6]])

    # The same expectations hold for a Hugging Face Dataset and for the plain
    # dictionary it was built from.
    for source in (Dataset.from_dict(raw), raw):
        # A single column name yields a 1-D array of that column's values.
        single = get_columns_as_numpy_array(source, "column1")
        np.testing.assert_array_equal(single, single_expected)

        # A list of column names yields one row per example.
        stacked = get_columns_as_numpy_array(source, ["column1", "column2"])
        np.testing.assert_array_equal(stacked, stacked_expected)

    # A plain list is not an accepted dataset type.
    with pytest.raises(TypeError):
        get_columns_as_numpy_array(["not", "a", "dataset"], "column1")
| 60 | + |
| 61 | + |
def test_check_required_columns():
    """Exercise check_required_columns with present, missing and None columns."""
    dataset_columns = ["name", "age", "email"]

    # Every requested column exists, so these calls must not raise.
    for required in (("name", ["age", "email"]), ("age",), (["name"],)):
        check_required_columns(dataset_columns, *required)

    # Each entry pairs the arguments to pass with the column names that are
    # expected to be reported in the resulting error message.
    rejected = [
        (("name", "gender"), ("gender",)),
        ((["name", "gender"],), ("gender",)),
        (("height", "weight"), ("height", "weight")),
    ]
    for required, missing in rejected:
        with pytest.raises(ValueError) as excinfo:
            check_required_columns(dataset_columns, *required)
        # Inspect the message only after the context manager has captured
        # the exception.
        message = str(excinfo.value)
        for column in missing:
            assert column in message

    # None placeholders, alone or mixed with real names, and an empty
    # requirement list are all accepted without error.
    for required in ((None, "name"), ("name", None), (None,), ()):
        check_required_columns(dataset_columns, *required)
| 91 | + |
| 92 | + |
def test_feature_is_numeric():
    """Test feature_is_numeric fn.

    Integer, float and bool features (as plain values, fixed-shape arrays
    or sequences) are expected to be reported as numeric, string features
    are not, and non-feature objects are expected to raise AttributeError.
    """
    numeric_features = [
        Value("int32"),
        Value("float64"),
        Array2D(shape=(10, 10), dtype="int32"),
        Array3D(shape=(10, 10, 10), dtype="float64"),
        Array4D(shape=(10, 10, 10, 10), dtype="int32"),
        Array5D(shape=(10, 10, 10, 10, 10), dtype="float64"),
        Sequence([Value("int32"), Value("float64")]),
        Value("bool"),
    ]
    for feature in numeric_features:
        # Assert on truthiness directly instead of comparing with ``== True``.
        assert feature_is_numeric(feature), (
            f"Failed for {type(feature)} with dtype {feature.dtype}"
        )

    non_numeric_features = [
        Value("string"),
        Array2D(shape=(10, 10), dtype="string"),
    ]
    for feature in non_numeric_features:
        assert not feature_is_numeric(feature), (
            f"Failed for {type(feature)} with dtype {feature.dtype}"
        )

    # Objects that are not dataset features at all.
    invalid_features = [None, 123, "invalid", [1, 2, 3], {"key": "value"}]
    for feature in invalid_features:
        with pytest.raises(AttributeError):
            feature_is_numeric(feature)
| 118 | + |
| 119 | + |
def test_feature_is_datetime():
    """Test feature_is_datetime fn.

    Timestamp and time features (as plain values, fixed-shape arrays or
    sequences) are expected to be reported as datetime, string and integer
    features are not, and non-feature objects are expected to raise
    AttributeError.
    """
    datetime_features = [
        Value("timestamp[s]"),
        Array2D(shape=(10, 10), dtype="time[s]"),
        Sequence([Value("timestamp[ns]")]),
    ]
    for feature in datetime_features:
        # Assert on truthiness directly instead of comparing with ``== True``.
        assert feature_is_datetime(feature), (
            f"Failed for {type(feature)} with dtype {feature.dtype}"
        )

    non_datetime_features = [
        Value("string"),
        Array2D(shape=(10, 10), dtype="string"),
        Value("int32"),
        Array2D(shape=(10, 10), dtype="int32"),
    ]
    for feature in non_datetime_features:
        assert not feature_is_datetime(feature), (
            f"Failed for {type(feature)} with dtype {feature.dtype}"
        )

    # Objects that are not dataset features at all.
    invalid_features = [None, 123, "invalid", [1, 2, 3], {"key": "value"}]
    for feature in invalid_features:
        with pytest.raises(AttributeError):
            feature_is_datetime(feature)
| 143 | + |
| 144 | + |
18 | 145 | def test_create_indicator_variables():
|
19 | 146 | """Test create_indicator_variables fn."""
|
20 | 147 | features = pd.DataFrame([[np.nan, 1], [3, np.nan]], columns=["A", "B"])
|
|
0 commit comments