class InvalidFitDataError(Exception):
    """
    Raised when a Neural Network is fitted on invalid data.

    Parameters
    ----------
    reason:
        Description of what is wrong with the data. It is appended, on a new line, to the fixed
        "The given Fit Data is invalid:" prefix of the exception message.
    """

    def __init__(self, reason: str) -> None:
        # Keep the raw reason: callers can inspect it, and the exception can be rebuilt from it.
        self.reason = reason
        super().__init__(f"The given Fit Data is invalid:\n{reason}")

    def __reduce__(self) -> tuple[type, tuple[str], dict[str, object]]:
        # The default reduction of BaseException re-invokes the constructor with `self.args`, i.e. with the
        # already formatted message. After pickling or copying, the prefix would then appear twice. Rebuild
        # from the raw reason instead and carry the instance dict along so extra state is not lost.
        return type(self), (self.reason,), self.__dict__
diff --git a/src/safeds/ml/nn/converters/_input_converter_table.py b/src/safeds/ml/nn/converters/_input_converter_table.py index 52d64ac01..7f26b39af 100644 --- a/src/safeds/ml/nn/converters/_input_converter_table.py +++ b/src/safeds/ml/nn/converters/_input_converter_table.py @@ -4,6 +4,7 @@ from safeds.data.labeled.containers import TabularDataset from safeds.data.tabular.containers import Column, Table +from safeds.exceptions import InvalidFitDataError from ._input_converter import InputConversion @@ -43,6 +44,24 @@ def _is_fit_data_valid(self, input_data: TabularDataset) -> bool: self._feature_names = input_data.features.column_names self._target_name = input_data.target.name self._first = False + + columns_with_missing_values = [] + columns_with_non_numerical_data = [] + + for col in input_data.features.add_columns([input_data.target]).to_columns(): + if col.missing_value_count() > 0: + columns_with_missing_values.append(col.name) + if not col.type.is_numeric: + columns_with_non_numerical_data.append(col.name) + + reason = "" + if len(columns_with_missing_values) > 0: + reason += f"The following Columns contain missing values: {columns_with_missing_values}\n" + if len(columns_with_non_numerical_data) > 0: + reason += f"The following Columns contain non-numerical data: {columns_with_non_numerical_data}" + if reason != "": + raise InvalidFitDataError(reason) + return (sorted(input_data.features.column_names)).__eq__(sorted(self._feature_names)) def _is_predict_data_valid(self, input_data: Table) -> bool: diff --git a/tests/safeds/ml/nn/test_model.py b/tests/safeds/ml/nn/test_model.py index 5b8022a2c..0902d630d 100644 --- a/tests/safeds/ml/nn/test_model.py +++ b/tests/safeds/ml/nn/test_model.py @@ -1,4 +1,5 @@ import pickle +import re import pytest from safeds.data.image.typing import ImageSize @@ -6,6 +7,7 @@ from safeds.data.tabular.containers import Table from safeds.exceptions import ( FeatureDataMismatchError, + InvalidFitDataError, 
@pytest.mark.parametrize(
    ("table", "reason"),
    [
        pytest.param(
            Table.from_dict({"a": [1, 2, 3], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"),
            re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\n"),
            id="missing value feature",
        ),
        pytest.param(
            Table.from_dict({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [0, 15, 5]}).to_tabular_dataset("c"),
            re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['a']"),
            id="non-numerical feature",
        ),
        pytest.param(
            Table.from_dict({"a": ["a", "b", "c"], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"),
            re.escape(
                "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\nThe following Columns contain non-numerical data: ['a']",
            ),
            id="missing value and non-numerical features",
        ),
        pytest.param(
            Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, None, 5]}).to_tabular_dataset("c"),
            re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\n"),
            id="missing value target",
        ),
        pytest.param(
            Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": ["a", "b", "a"]}).to_tabular_dataset("c"),
            re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['c']"),
            id="non-numerical target",
        ),
    ],
)
def test_should_catch_invalid_fit_data(self, device: Device, table: TabularDataset, reason: str) -> None:
    # Each case feeds the classifier a dataset with missing and/or non-numerical values, either in a
    # feature column or in the target column, and expects the matching error message.
    configure_test_with_device(device)
    classifier = NeuralNetworkClassifier(
        InputConversionTable(),
        [ForwardLayer(neuron_count=4), ForwardLayer(1)],
    )
    with pytest.raises(InvalidFitDataError, match=reason):
        classifier.fit(table)
@pytest.mark.parametrize(
    ("table", "reason"),
    [
        pytest.param(
            Table.from_dict({"a": [1, 2, 3], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"),
            re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\n"),
            id="missing value feature",
        ),
        pytest.param(
            Table.from_dict({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [0, 15, 5]}).to_tabular_dataset("c"),
            re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['a']"),
            id="non-numerical feature",
        ),
        pytest.param(
            Table.from_dict({"a": ["a", "b", "c"], "b": [1, 2, None], "c": [0, 15, 5]}).to_tabular_dataset("c"),
            re.escape(
                "The given Fit Data is invalid:\nThe following Columns contain missing values: ['b']\nThe following Columns contain non-numerical data: ['a']",
            ),
            id="missing value and non-numerical features",
        ),
        pytest.param(
            Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": [0, None, 5]}).to_tabular_dataset("c"),
            re.escape("The given Fit Data is invalid:\nThe following Columns contain missing values: ['c']\n"),
            id="missing value target",
        ),
        pytest.param(
            Table.from_dict({"a": [1, 2, 3], "b": [1, 2, 3], "c": ["a", "b", "a"]}).to_tabular_dataset("c"),
            re.escape("The given Fit Data is invalid:\nThe following Columns contain non-numerical data: ['c']"),
            id="non-numerical target",
        ),
    ],
)
def test_should_catch_invalid_fit_data(self, device: Device, table: TabularDataset, reason: str) -> None:
    # Each case feeds the regressor a dataset with missing and/or non-numerical values, either in a
    # feature column or in the target column, and expects the matching error message.
    configure_test_with_device(device)
    regressor = NeuralNetworkRegressor(
        InputConversionTable(),
        [ForwardLayer(neuron_count=4), ForwardLayer(1)],
    )
    with pytest.raises(InvalidFitDataError, match=reason):
        regressor.fit(table)