diff --git a/docs/converters/vision_converter.md b/docs/converters/vision_converter.md index 2fee20e6..e845e2b9 100644 --- a/docs/converters/vision_converter.md +++ b/docs/converters/vision_converter.md @@ -52,6 +52,48 @@ Vision introduced UUID based identifier system since version 9.7. It is implemen An exemplary usage can be found in the example notebook as well as in the test cases. +## Optional extra columns + +When working with Vision Excel exports, some metadata columns (like `GUID` or `StationID`) may not always be present, especially in partial exports. The `optional_extra` feature allows you to specify columns that are included in `extra_info` if present, but that do not cause the conversion to fail if missing. + +**Syntax:** +```yaml +grid: + Transformers: + transformer: + id: + auto_id: + key: Number + # ... other fields ... + extra: + - ID # Required - fails if missing + - Name # Required - fails if missing + - optional_extra: + - GUID # Optional - skipped if missing + - StationID # Optional - skipped if missing +``` + +**Behavior:** +- Required columns (listed directly under `extra`) raise a `KeyError` if missing +- Optional columns (nested under `optional_extra`) are silently skipped if not found +- If some optional columns are present and others are missing, only the present ones are included in `extra_info` +- This feature is particularly useful for handling different Vision export configurations or versions + +**Duplicate handling:** +When a column appears in both the regular `extra` list and within `optional_extra`, the regular `extra` entry takes precedence and duplicates are automatically eliminated from `optional_extra`: + +```yaml +extra: + - ID # Regular column - always processed + - Name # Regular column - always processed + - optional_extra: + - ID # Duplicate - automatically removed + - GUID # Unique optional - processed if present + - StationID # Unique optional - processed if present +``` + +In this example, `ID` will only be processed once (from the regular `extra` list), while `GUID` and `StationID` are processed as optional columns. This prevents duplicate data in the resulting `extra_info` and ensures consistent behavior regardless of column ordering. + ## Common/Known issues related to Vision So far, the following issues related to Vision exported spreadsheets are known to us. We provide solutions from a user perspective to the best of our knowledge. diff --git a/src/power_grid_model_io/converters/tabular_converter.py b/src/power_grid_model_io/converters/tabular_converter.py index b596d1ac..55b83bad 100644 --- a/src/power_grid_model_io/converters/tabular_converter.py +++ b/src/power_grid_model_io/converters/tabular_converter.py @@ -337,11 +337,14 @@ def _handle_extra_info( # pylint: disable = too-many-arguments,too-many-positio if extra_info is None: return + + # Normalize col_def to handle deduplication when optional_extra contains columns also in regular extra + normalized_col_def = self._normalize_extra_col_def(col_def) + + extra = self._parse_col_def( data=data, table=table, table_mask=table_mask, - col_def=col_def, + col_def=normalized_col_def, extra_info=None, ).to_dict(orient="records") for i, xtr in zip(uuids, extra): @@ -356,6 +359,55 @@ def _handle_extra_info( # pylint: disable = too-many-arguments,too-many-positio else: extra_info[i] = xtr + + def _normalize_extra_col_def(self, col_def: Any) -> Any: + """ + Normalize extra column definition to eliminate duplicates between regular columns and optional_extra.
+ Regular columns take precedence over optional_extra columns. + + Args: + col_def: Column definition for extra info that may contain optional_extra sections + + Returns: + Normalized column definition with duplicates removed from optional_extra + """ + if not isinstance(col_def, list): + return col_def + + # Collect all non-optional_extra column names + regular_columns = set() + normalized_list = [] + + for item in col_def: + if isinstance(item, dict) and len(item) == 1 and "optional_extra" in item: + # This is an optional_extra section - we'll process it later + normalized_list.append(item) + else: + # This is a regular column + if isinstance(item, str): + regular_columns.add(item) + normalized_list.append(item) + + # Now process optional_extra sections and remove duplicates + final_list = [] + for item in normalized_list: + if isinstance(item, dict) and len(item) == 1 and "optional_extra" in item: + optional_cols = item["optional_extra"] + if isinstance(optional_cols, list): + # Filter out columns that are already in regular columns + filtered_optional_cols = [ + col for col in optional_cols if not isinstance(col, str) or col not in regular_columns + ] + # Only include the optional_extra section if it has remaining columns + if filtered_optional_cols: + final_list.append({"optional_extra": filtered_optional_cols}) + else: + # Keep non-list optional_extra as-is (shouldn't happen but be safe) + final_list.append(item) + else: + final_list.append(item) + + return final_list + @staticmethod def _merge_pgm_data(data: Dict[str, List[np.ndarray]]) -> Dict[str, np.ndarray]: """During the conversion, multiple numpy arrays can be produced for the same type of component. These arrays @@ -396,6 +448,8 @@ def _parse_col_def( # pylint: disable = too-many-arguments,too-many-positional- col_def: Any, table_mask: Optional[np.ndarray], extra_info: Optional[ExtraInfo], + *, + allow_missing: bool = False, ) -> pd.DataFrame: """Interpret the column definition and extract/convert/create the data as a pandas DataFrame.
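As a sanity check on the deduplication rule implemented by `_normalize_extra_col_def` above, here is a self-contained sketch of the same behaviour (a simplified re-implementation for illustration only, not the converter's internal API; it ignores the non-string edge cases the real method guards against):

```python
from typing import Any


def normalize_extra(col_def: Any) -> Any:
    """Sketch of the rule: regular entries take precedence over optional_extra."""
    if not isinstance(col_def, list):
        return col_def
    # Plain strings at the top level are the regular (required) columns.
    regular = {item for item in col_def if isinstance(item, str)}
    result = []
    for item in col_def:
        if isinstance(item, dict) and set(item) == {"optional_extra"}:
            remaining = [col for col in item["optional_extra"] if col not in regular]
            if remaining:  # a section with nothing left is dropped entirely
                result.append({"optional_extra": remaining})
        else:
            result.append(item)
    return result


# "ID" is listed both as a regular column and under optional_extra;
# only the regular occurrence survives.
assert normalize_extra(["ID", "Name", {"optional_extra": ["ID", "GUID"]}]) == [
    "ID",
    "Name",
    {"optional_extra": ["GUID"]},
]
```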
@@ -404,6 +458,7 @@ def _parse_col_def( # pylint: disable = too-many-arguments,too-many-positional- table: str: col_def: Any: extra_info: Optional[ExtraInfo]: + allow_missing: bool: If True, missing columns return an empty DataFrame instead of raising a KeyError Returns: @@ -411,8 +466,19 @@ def _parse_col_def( # pylint: disable = too-many-arguments,too-many-positional- if isinstance(col_def, (int, float)): return self._parse_col_def_const(data=data, table=table, col_def=col_def, table_mask=table_mask) if isinstance(col_def, str): - return self._parse_col_def_column_name(data=data, table=table, col_def=col_def, table_mask=table_mask) + return self._parse_col_def_column_name( + data=data, table=table, col_def=col_def, table_mask=table_mask, allow_missing=allow_missing + ) if isinstance(col_def, dict): + # Check if this is an optional_extra wrapper + if len(col_def) == 1 and "optional_extra" in col_def: + # Extract the list of optional columns and parse as composite with allow_missing=True + optional_cols = col_def["optional_extra"] + if not isinstance(optional_cols, list): + raise TypeError(f"optional_extra value must be a list, got {type(optional_cols).__name__}") + return self._parse_col_def_composite( + data=data, table=table, col_def=optional_cols, table_mask=table_mask, allow_missing=True + ) return self._parse_col_def_filter( data=data, table=table, col_def=col_def, extra_info=extra_info, ) if isinstance(col_def, list): - return self._parse_col_def_composite(data=data, table=table, col_def=col_def, table_mask=table_mask) + return self._parse_col_def_composite( + data=data, table=table, col_def=col_def, table_mask=table_mask, allow_missing=allow_missing + ) raise TypeError(f"Invalid column definition: {col_def}") @staticmethod @@ -454,6 +522,7 @@ def _parse_col_def_column_name( table: str, col_def: str, table_mask: Optional[np.ndarray] = None, + allow_missing: bool = False, ) -> pd.DataFrame: """Extract a column from the data. If the column doesn't exist, check if the col_def is a special float value, like 'inf'. If that's the case, create a single column pandas DataFrame containing the const value.
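The `allow_missing` plumbing bottoms out in a simple pandas idiom, visible in the hunks below: a missing optional column becomes a DataFrame with zero columns but with the table's row index preserved, so the `pd.concat(..., axis=1)` in `_parse_col_def_composite` still lines up row counts. A minimal standalone sketch of that idiom (plain pandas; the `pick` helper is hypothetical, for illustration only):

```python
import pandas as pd


def pick(table: pd.DataFrame, column: str, allow_missing: bool) -> pd.DataFrame:
    """Return a one-column frame, or a zero-column frame with the same index."""
    if column in table.columns:
        return table[[column]]
    if allow_missing:
        # No columns, but the original index is kept so row counts still match.
        return pd.DataFrame(index=table.index)
    raise KeyError(f"Could not find column '{column}'")


table = pd.DataFrame({"id": [1, 2, 3], "guid": ["g1", "g2", "g3"]})
parts = [pick(table, col, allow_missing=True) for col in ("guid", "station")]
parts = [part for part in parts if len(part.columns) > 0]  # drop empty frames
result = pd.concat(parts, axis=1)
assert list(result.columns) == ["guid"] and len(result) == 3
```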
@@ -462,6 +531,7 @@ def _parse_col_def_column_name( data: TabularData: table: str: col_def: str: + allow_missing: bool: If True, return empty DataFrame when column is missing instead of raising KeyError Returns: @@ -484,10 +554,17 @@ def _parse_col_def_column_name( try: # Maybe it is not a column name, but a float value like 'inf', let's try to convert the string to a float const_value = float(col_def) - except ValueError: - # pylint: disable=raise-missing-from + except ValueError as e: + if allow_missing: + # Return empty DataFrame with correct number of rows when column is optional and missing + self._log.debug( + "Optional column not found", + table=table, + columns=" or ".join(f"'{col_name}'" for col_name in columns), + ) + return pd.DataFrame(index=table_data.index) columns_str = " and ".join(f"'{col_name}'" for col_name in columns) - raise KeyError(f"Could not find column {columns_str} on table '{table}'") + raise KeyError(f"Could not find column {columns_str} on table '{table}'") from e return self._parse_col_def_const(data=data, table=table, col_def=const_value, table_mask=table_mask) @@ -778,6 +855,7 @@ def _parse_col_def_composite( table: str, col_def: list, table_mask: Optional[np.ndarray], + allow_missing: bool = False, ) -> pd.DataFrame: """Select multiple columns (each is created from a column definition) and return them as a new DataFrame. @@ -785,6 +863,7 @@ def _parse_col_def_composite( data: TabularData: table: str: col_def: list: + allow_missing: bool: If True, skip missing columns instead of raising errors Returns: @@ -797,10 +876,19 @@ def _parse_col_def_composite( col_def=sub_def, table_mask=table_mask, extra_info=None, + allow_missing=allow_missing, ) for sub_def in col_def ] - return pd.concat(columns, axis=1) + # Filter out DataFrames with no columns (from missing optional columns) + non_empty_columns = [col for col in columns if len(col.columns) > 0] + if not non_empty_columns: + # If all columns are missing, return an empty DataFrame with the correct number of rows + table_data = data[table] + if table_mask is not None: + table_data = table_data[table_mask] + return pd.DataFrame(index=table_data.index) + return pd.concat(non_empty_columns, axis=1) def _get_id(self, table: str, key: Mapping[str, int], name: Optional[str]) -> int: """ diff --git a/tests/data/config/test_optional_extra_mapping.yaml b/tests/data/config/test_optional_extra_mapping.yaml new file mode 100644 index 00000000..1b757b4e --- /dev/null +++ b/tests/data/config/test_optional_extra_mapping.yaml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Contributors to the Power Grid Model project +# +# SPDX-License-Identifier: MPL-2.0 +--- +# Test mapping file for optional_extra feature +grid: + nodes: + node: + id: + auto_id: + key: node_id + u_rated: voltage + extra: + - ID + - Name + - optional_extra: + - GUID + - StationID + +units: + V: + kV: 1000.0 + +substitutions: {} diff --git a/tests/data/vision/vision_optional_extra_full.xlsx b/tests/data/vision/vision_optional_extra_full.xlsx new file mode 100644 index 00000000..dba72193 Binary files /dev/null and b/tests/data/vision/vision_optional_extra_full.xlsx differ diff --git a/tests/data/vision/vision_optional_extra_full.xlsx.license b/tests/data/vision/vision_optional_extra_full.xlsx.license new file mode 100644 index 00000000..76010591 --- /dev/null +++ b/tests/data/vision/vision_optional_extra_full.xlsx.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: Contributors to the Power Grid Model project + +SPDX-License-Identifier: MPL-2.0 diff --git 
a/tests/data/vision/vision_optional_extra_mapping.yaml b/tests/data/vision/vision_optional_extra_mapping.yaml new file mode 100644 index 00000000..4979d149 --- /dev/null +++ b/tests/data/vision/vision_optional_extra_mapping.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Contributors to the Power Grid Model project +# +# SPDX-License-Identifier: MPL-2.0 +--- +# Test mapping file for optional_extra feature with Vision Excel format +id_reference: + nodes_table: Nodes + number: Number + node_number: Node.Number + sub_number: Subnumber + +grid: + Nodes: + node: + id: + auto_id: + key: Number + u_rated: Unom + extra: + - ID + - Name + - optional_extra: + - GUID + - StationID + +units: + V: + kV: 1000.0 + +substitutions: {} diff --git a/tests/data/vision/vision_optional_extra_mapping.yaml.license b/tests/data/vision/vision_optional_extra_mapping.yaml.license new file mode 100644 index 00000000..76010591 --- /dev/null +++ b/tests/data/vision/vision_optional_extra_mapping.yaml.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: Contributors to the Power Grid Model project + +SPDX-License-Identifier: MPL-2.0 diff --git a/tests/data/vision/vision_optional_extra_minimal.xlsx b/tests/data/vision/vision_optional_extra_minimal.xlsx new file mode 100644 index 00000000..af66a5d7 Binary files /dev/null and b/tests/data/vision/vision_optional_extra_minimal.xlsx differ diff --git a/tests/data/vision/vision_optional_extra_minimal.xlsx.license b/tests/data/vision/vision_optional_extra_minimal.xlsx.license new file mode 100644 index 00000000..76010591 --- /dev/null +++ b/tests/data/vision/vision_optional_extra_minimal.xlsx.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: Contributors to the Power Grid Model project + +SPDX-License-Identifier: MPL-2.0 diff --git a/tests/data/vision/vision_optional_extra_partial.xlsx b/tests/data/vision/vision_optional_extra_partial.xlsx new file mode 100644 index 00000000..3346203d Binary files /dev/null and b/tests/data/vision/vision_optional_extra_partial.xlsx differ diff --git a/tests/data/vision/vision_optional_extra_partial.xlsx.license b/tests/data/vision/vision_optional_extra_partial.xlsx.license new file mode 100644 index 00000000..76010591 --- /dev/null +++ b/tests/data/vision/vision_optional_extra_partial.xlsx.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: Contributors to the Power Grid Model project + +SPDX-License-Identifier: MPL-2.0 diff --git a/tests/unit/converters/test_tabular_converter.py b/tests/unit/converters/test_tabular_converter.py index 157bc807..c475e61a 100644 --- a/tests/unit/converters/test_tabular_converter.py +++ b/tests/unit/converters/test_tabular_converter.py @@ -462,6 +462,7 @@ def test_parse_col_def(converter: TabularConverter, tabular_data_no_units_no_sub table="nodes", col_def="col_name", table_mask=None, + allow_missing=False, ) # type(col_def) == dict @@ -499,6 +500,7 @@ def test_parse_col_def(converter: TabularConverter, tabular_data_no_units_no_sub table="nodes", col_def=[], table_mask=None, + allow_missing=False, ) @@ -1356,3 +1358,664 @@ def bool_fn_filter(row: pd.Series, **kwargs): def test_parse_table_filters__ndarray_data(converter: TabularConverter): numpy_tabular_data = TabularData(numpy_table=np.ones((4, 3))) assert converter._parse_table_filters(data=numpy_tabular_data, table="numpy_table", filtering_functions=[]) is None + + +def test_optional_extra__all_columns_present(converter: TabularConverter): + """Test optional_extra when all optional columns are present in the data""" + # Arrange + data = TabularData( + 
test_table=pd.DataFrame( + {"id": [1, 2], "name": ["node1", "node2"], "guid": ["guid1", "guid2"], "station": ["st1", "st2"]} + ) + ) + col_def = {"optional_extra": ["guid", "station"]} + + # Act + result = converter._parse_col_def( + data=data, table="test_table", col_def=col_def, table_mask=None, extra_info=None, allow_missing=False + ) + + # Assert + assert list(result.columns) == ["guid", "station"] + assert list(result["guid"]) == ["guid1", "guid2"] + assert list(result["station"]) == ["st1", "st2"] + + +def test_optional_extra__some_columns_missing(converter: TabularConverter): + """Test optional_extra when some optional columns are missing from the data""" + # Arrange + data = TabularData(test_table=pd.DataFrame({"id": [1, 2], "name": ["node1", "node2"], "guid": ["guid1", "guid2"]})) + col_def = {"optional_extra": ["guid", "station"]} # 'station' is missing + + # Act + result = converter._parse_col_def( + data=data, table="test_table", col_def=col_def, table_mask=None, extra_info=None, allow_missing=False + ) + + # Assert - only 'guid' should be present + assert list(result.columns) == ["guid"] + assert list(result["guid"]) == ["guid1", "guid2"] + + +def test_optional_extra__all_columns_missing(converter: TabularConverter): + """Test optional_extra when all optional columns are missing from the data""" + # Arrange + data = TabularData(test_table=pd.DataFrame({"id": [1, 2], "name": ["node1", "node2"]})) + col_def = {"optional_extra": ["guid", "station"]} # Both are missing + + # Act + result = converter._parse_col_def( + data=data, table="test_table", col_def=col_def, table_mask=None, extra_info=None, allow_missing=False + ) + + # Assert - should return empty DataFrame with correct number of rows + assert len(result) == 2 + assert len(result.columns) == 0 + + +def test_optional_extra__mixed_with_required(converter: TabularConverter): + """Test mixing required and optional extra columns""" + # Arrange + data = TabularData(test_table=pd.DataFrame({"id": [1, 2], "name": ["node1", "node2"], "guid": ["guid1", "guid2"]})) + # Mix required columns with optional_extra + col_def = ["name", {"optional_extra": ["guid", "station"]}] + + # Act + result = converter._parse_col_def( + data=data, table="test_table", col_def=col_def, table_mask=None, extra_info=None, allow_missing=False + ) + + # Assert - should have 'name' and 'guid', but not 'station' + assert list(result.columns) == ["name", "guid"] + assert list(result["name"]) == ["node1", "node2"] + assert list(result["guid"]) == ["guid1", "guid2"] + + +def test_optional_extra__in_extra_info(converter: TabularConverter): + """Test that optional_extra works correctly with _handle_extra_info""" + # Arrange + data = TabularData( + test_table=pd.DataFrame( + {"id": [1, 2], "name": ["node1", "node2"], "guid": ["guid1", "guid2"]} # 'station' is missing + ) + ) + uuids = np.array([100, 200]) + extra_info: ExtraInfo = {} + col_def = {"optional_extra": ["guid", "station"]} + + # Act + converter._handle_extra_info( + data=data, table="test_table", col_def=col_def, uuids=uuids, table_mask=None, extra_info=extra_info + ) + + # Assert - only 'guid' should be in extra_info, not 'station' + assert 100 in extra_info + assert 200 in extra_info + assert "guid" in extra_info[100] + assert "guid" in extra_info[200] + assert extra_info[100]["guid"] == "guid1" + assert extra_info[200]["guid"] == "guid2" + assert "station" not in extra_info[100] + assert "station" not in extra_info[200] + + +def test_optional_extra__all_missing_no_extra_info(converter: 
TabularConverter): + """Test that when all optional columns are missing, no extra_info entries are created""" + # Arrange + data = TabularData(test_table=pd.DataFrame({"id": [1, 2], "name": ["node1", "node2"]})) # Both optional missing + uuids = np.array([100, 200]) + extra_info: ExtraInfo = {} + col_def = {"optional_extra": ["guid", "station"]} + + # Act + converter._handle_extra_info( + data=data, table="test_table", col_def=col_def, uuids=uuids, table_mask=None, extra_info=extra_info + ) + + # Assert - no entries should be added to extra_info + assert len(extra_info) == 0 + + +def test_optional_extra__invalid_type(): + """Test that optional_extra raises TypeError if value is not a list""" + # Arrange + converter = TabularConverter(mapping_file=MAPPING_FILE) + data = TabularData(test_table=pd.DataFrame({"id": [1, 2]})) + col_def = {"optional_extra": "not_a_list"} # Invalid: should be a list + + # Act & Assert + with pytest.raises(TypeError, match="optional_extra value must be a list"): + converter._parse_col_def( + data=data, table="test_table", col_def=col_def, table_mask=None, extra_info=None, allow_missing=False + ) + + +def test_optional_extra__integration(): + """Integration test for optional_extra feature using a complete mapping file""" + # Arrange + mapping_file = Path(__file__).parents[2] / "data" / "config" / "test_optional_extra_mapping.yaml" + converter = TabularConverter(mapping_file=mapping_file) + + # Create test data with some optional columns present and some missing + data = TabularData( + nodes=pd.DataFrame( + { + "node_id": [1, 2, 3], + "voltage": [10.5, 10.5, 0.4], + "ID": ["N1", "N2", "N3"], + "Name": ["Node 1", "Node 2", "Node 3"], + "GUID": ["guid-1", "guid-2", "guid-3"], + # Note: StationID column is missing (optional) + } + ) + ) + + extra_info: ExtraInfo = {} + + # Act + result = converter._parse_data(data=data, data_type=DatasetType.input, extra_info=extra_info) + + # Assert + assert ComponentType.node in result + assert len(result[ComponentType.node]) == 3 + + # Check that extra_info contains the required and present optional fields + for node_id in result[ComponentType.node]["id"]: + assert node_id in extra_info + assert "ID" in extra_info[node_id] + assert "Name" in extra_info[node_id] + assert "GUID" in extra_info[node_id] # Optional but present + assert "StationID" not in extra_info[node_id] # Optional and missing + + # Verify values + node_0_id = result[ComponentType.node]["id"][0] + assert extra_info[node_0_id]["ID"] == "N1" + assert extra_info[node_0_id]["Name"] == "Node 1" + assert extra_info[node_0_id]["GUID"] == "guid-1" + + +def test_optional_extra__with_table_mask(converter: TabularConverter): + """Test optional_extra works correctly with table filtering/masking""" + # Arrange + data = TabularData( + test_table=pd.DataFrame( + { + "id": [1, 2, 3, 4], + "value": [10, 20, 30, 40], + "guid": ["g1", "g2", "g3", "g4"], + "name": ["n1", "n2", "n3", "n4"], + } + ) + ) + # Create a mask that filters to only rows 0 and 2 + table_mask = np.array([True, False, True, False]) + col_def = {"optional_extra": ["guid", "station"]} # 'station' is missing + + # Act + result = converter._parse_col_def( + data=data, table="test_table", col_def=col_def, table_mask=table_mask, extra_info=None, allow_missing=False + ) + + # Assert - should only have 2 rows (from the mask) and 1 column (guid) + assert len(result) == 2 + assert list(result.columns) == ["guid"] + assert list(result["guid"]) == ["g1", "g3"] + + +def test_optional_extra__nested_in_list(converter: 
TabularConverter): + """Test optional_extra can be nested within a regular list of columns""" + # Arrange + data = TabularData( + test_table=pd.DataFrame( + {"id": [1, 2], "name": ["n1", "n2"], "value": [100, 200], "guid": ["g1", "g2"]} # station missing + ) + ) + col_def = ["name", "value", {"optional_extra": ["guid", "station"]}] + + # Act + result = converter._parse_col_def( + data=data, table="test_table", col_def=col_def, table_mask=None, extra_info=None, allow_missing=False + ) + + # Assert + assert list(result.columns) == ["name", "value", "guid"] + assert list(result["name"]) == ["n1", "n2"] + assert list(result["value"]) == [100, 200] + assert list(result["guid"]) == ["g1", "g2"] + + +def test_optional_extra__with_pipe_separated_columns(converter: TabularConverter): + """Test optional_extra with pipe-separated alternative column names""" + # Arrange + data = TabularData(test_table=pd.DataFrame({"id": [1, 2], "GUID": ["g1", "g2"], "name": ["n1", "n2"]})) + # Use pipe separator for alternative column names (GUID or Guid) + col_def = {"optional_extra": ["GUID|Guid", "StationID|Station"]} # Both StationID and Station missing + + # Act + result = converter._parse_col_def( + data=data, table="test_table", col_def=col_def, table_mask=None, extra_info=None, allow_missing=False + ) + + # Assert - GUID should be found, Station alternatives should be skipped + assert list(result.columns) == ["GUID"] + assert list(result["GUID"]) == ["g1", "g2"] + + +def test_optional_extra__empty_string_values(converter: TabularConverter): + """Test that optional_extra handles empty strings correctly""" + # Arrange + data = TabularData(test_table=pd.DataFrame({"id": [1, 2, 3], "guid": ["g1", "", "g3"], "name": ["n1", "n2", ""]})) + uuids = np.array([100, 200, 300]) + extra_info: ExtraInfo = {} + col_def = {"optional_extra": ["guid", "name"]} + + # Act + converter._handle_extra_info( + data=data, table="test_table", col_def=col_def, uuids=uuids, table_mask=None, extra_info=extra_info + ) + + # Assert - empty strings should still be included (not filtered as NaN) + assert 100 in extra_info + assert 200 in extra_info + assert 300 in extra_info + assert extra_info[100]["guid"] == "g1" + assert extra_info[100]["name"] == "n1" + assert extra_info[200]["guid"] == "" # Empty string preserved + assert extra_info[200]["name"] == "n2" + assert extra_info[300]["guid"] == "g3" + assert extra_info[300]["name"] == "" # Empty string preserved + + +def test_optional_extra__with_nan_values(converter: TabularConverter): + """Test that optional_extra filters out NaN values correctly""" + # Arrange + data = TabularData( + test_table=pd.DataFrame({"id": [1, 2, 3], "guid": ["g1", np.nan, "g3"], "value": [10.0, 20.0, np.nan]}) + ) + uuids = np.array([100, 200, 300]) + extra_info: ExtraInfo = {} + col_def = {"optional_extra": ["guid", "value"]} + + # Act + converter._handle_extra_info( + data=data, table="test_table", col_def=col_def, uuids=uuids, table_mask=None, extra_info=extra_info + ) + + # Assert - NaN values should be filtered out + assert 100 in extra_info + assert extra_info[100] == {"guid": "g1", "value": 10.0} + + assert 200 in extra_info + assert extra_info[200] == {"value": 20.0} # guid was NaN, filtered out + + assert 300 in extra_info + assert extra_info[300] == {"guid": "g3"} # value was NaN, filtered out + + +def test_optional_extra__multiple_optional_extra_sections(): + """Test behavior when multiple optional_extra sections are used (should work independently)""" + # Arrange + converter = 
TabularConverter(mapping_file=MAPPING_FILE) + data = TabularData( + test_table=pd.DataFrame( + {"id": [1, 2], "name": ["n1", "n2"], "guid": ["g1", "g2"]} # station and zone missing + ) + ) + # Two separate optional_extra sections + col_def = [{"optional_extra": ["guid"]}, {"optional_extra": ["station", "zone"]}] + + # Act + result = converter._parse_col_def( + data=data, table="test_table", col_def=col_def, table_mask=None, extra_info=None, allow_missing=False + ) + + # Assert - only guid should be present + assert list(result.columns) == ["guid"] + assert list(result["guid"]) == ["g1", "g2"] + + +def test_convert_col_def_to_attribute__pgm_data_without_dtype_names(): + """Test error handling when pgm_data has no dtype.names (unusual edge case)""" + # Arrange + converter = TabularConverter(mapping_file=MAPPING_FILE) + data = TabularData(test_table=pd.DataFrame({"id": [1, 2], "name": ["n1", "n2"]})) + + # Create a mock array without dtype.names by using a plain ndarray + pgm_data = np.array([1, 2]) # Regular array without structured dtype + assert pgm_data.dtype.names is None + + # Act & Assert + with pytest.raises(ValueError, match="pgm_data for 'nodes' has no attributes defined"): + converter._convert_col_def_to_attribute( + data=data, + pgm_data=pgm_data, + table="test_table", + component="node", + attr="id", + col_def="id", + table_mask=None, + extra_info=None, + ) + + +def test_parse_col_def_with_allow_missing(): + """Test _parse_col_def function with allow_missing parameter both True and False""" + # Arrange + converter = TabularConverter(mapping_file=MAPPING_FILE) + data = TabularData(test_table=pd.DataFrame({"existing_col": [1, 2, 3], "another_col": ["a", "b", "c"]})) + + # Test 1: String column with allow_missing=False (default) - existing column + result = converter._parse_col_def( + data=data, + table="test_table", + col_def="existing_col", + table_mask=None, + extra_info=None, + allow_missing=False, + ) + assert list(result.iloc[:, 0]) == [1, 2, 3] + + # Test 2: String column with allow_missing=False - missing column (should raise KeyError) + with pytest.raises(KeyError, match="Could not find column 'missing_col' on table 'test_table'"): + converter._parse_col_def( + data=data, + table="test_table", + col_def="missing_col", + table_mask=None, + extra_info=None, + allow_missing=False, + ) + + # Test 3: String column with allow_missing=True - missing column (should return empty DataFrame) + result = converter._parse_col_def( + data=data, + table="test_table", + col_def="missing_col", + table_mask=None, + extra_info=None, + allow_missing=True, + ) + assert len(result.columns) == 0 + assert len(result) == 3 # Should have same number of rows as original table + + # Test 4: String column with allow_missing=True - existing column (should work normally) + result = converter._parse_col_def( + data=data, + table="test_table", + col_def="existing_col", + table_mask=None, + extra_info=None, + allow_missing=True, + ) + assert list(result.iloc[:, 0]) == [1, 2, 3] + + # Test 5: List (composite) with allow_missing=False - all existing columns + result = converter._parse_col_def( + data=data, + table="test_table", + col_def=["existing_col", "another_col"], + table_mask=None, + extra_info=None, + allow_missing=False, + ) + assert len(result.columns) == 2 + assert list(result["existing_col"]) == [1, 2, 3] + assert list(result["another_col"]) == ["a", "b", "c"] + + # Test 6: List (composite) with allow_missing=False - some missing columns (should raise error) + with pytest.raises(KeyError, 
match="Could not find column 'missing_col' on table 'test_table'"): + converter._parse_col_def( + data=data, + table="test_table", + col_def=["existing_col", "missing_col"], + table_mask=None, + extra_info=None, + allow_missing=False, + ) + + # Test 7: List (composite) with allow_missing=True - some missing columns (should skip missing) + result = converter._parse_col_def( + data=data, + table="test_table", + col_def=["existing_col", "missing_col", "another_col"], + table_mask=None, + extra_info=None, + allow_missing=True, + ) + assert len(result.columns) == 2 # Only existing columns should be present + assert list(result["existing_col"]) == [1, 2, 3] + assert list(result["another_col"]) == ["a", "b", "c"] + + # Test 8: List (composite) with allow_missing=True - all missing columns (should return empty with correct rows) + result = converter._parse_col_def( + data=data, + table="test_table", + col_def=["missing_col1", "missing_col2"], + table_mask=None, + extra_info=None, + allow_missing=True, + ) + assert len(result.columns) == 0 + assert len(result) == 3 # Should have same number of rows as original table + + # Test 9: Dict (optional_extra) - should automatically set allow_missing=True internally + result = converter._parse_col_def( + data=data, + table="test_table", + col_def={"optional_extra": ["existing_col", "missing_col"]}, + table_mask=None, + extra_info=None, + allow_missing=False, # This should be ignored for optional_extra + ) + assert len(result.columns) == 1 # Only existing column should be present + assert list(result["existing_col"]) == [1, 2, 3] + + # Test 10: Constant values should work regardless of allow_missing + result_false = converter._parse_col_def( + data=data, + table="test_table", + col_def=42, + table_mask=None, + extra_info=None, + allow_missing=False, + ) + result_true = converter._parse_col_def( + data=data, + table="test_table", + col_def=42, + table_mask=None, + extra_info=None, + allow_missing=True, + ) + assert list(result_false.iloc[:, 0]) == [42, 42, 42] + assert list(result_true.iloc[:, 0]) == [42, 42, 42] + + +def test_parse_col_def_with_allow_missing_and_table_mask(): + """Test _parse_col_def function with allow_missing and table_mask combinations""" + # Arrange + converter = TabularConverter(mapping_file=MAPPING_FILE) + data = TabularData(test_table=pd.DataFrame({"existing_col": [1, 2, 3, 4], "another_col": ["a", "b", "c", "d"]})) + table_mask = np.array([True, False, True, False]) # Select rows 0 and 2 + + # Test 1: Missing column with table_mask and allow_missing=True + result = converter._parse_col_def( + data=data, + table="test_table", + col_def="missing_col", + table_mask=table_mask, + extra_info=None, + allow_missing=True, + ) + assert len(result.columns) == 0 + assert len(result) == 2 # Should match filtered table length + + # Test 2: Existing column with table_mask and allow_missing=True + result = converter._parse_col_def( + data=data, + table="test_table", + col_def="existing_col", + table_mask=table_mask, + extra_info=None, + allow_missing=True, + ) + assert list(result.iloc[:, 0]) == [1, 3] # Should get filtered values + + # Test 3: Composite with missing columns, table_mask, and allow_missing=True + result = converter._parse_col_def( + data=data, + table="test_table", + col_def=["existing_col", "missing_col"], + table_mask=table_mask, + extra_info=None, + allow_missing=True, + ) + assert len(result.columns) == 1 # Only existing column + assert list(result["existing_col"]) == [1, 3] # Filtered values + + +def 
test_normalize_extra_col_def(): + """Test _normalize_extra_col_def method for handling duplicate columns""" + # Arrange + converter = TabularConverter(mapping_file=MAPPING_FILE) + + # Test 1: Regular list without optional_extra (should be unchanged) + col_def = ["ID", "Name", "GUID"] + result = converter._normalize_extra_col_def(col_def) + assert result == ["ID", "Name", "GUID"] + + # Test 2: Non-list input (should be unchanged) + col_def = "ID" + result = converter._normalize_extra_col_def(col_def) + assert result == "ID" + + # Test 3: List with optional_extra but no duplicates + col_def = ["ID", {"optional_extra": ["GUID", "StationID"]}] + result = converter._normalize_extra_col_def(col_def) + assert result == ["ID", {"optional_extra": ["GUID", "StationID"]}] + + # Test 4: List with duplicates - regular column should dominate + col_def = ["ID", "Name", {"optional_extra": ["ID", "GUID", "StationID"]}] + result = converter._normalize_extra_col_def(col_def) + expected = ["ID", "Name", {"optional_extra": ["GUID", "StationID"]}] + assert result == expected + + # Test 5: Multiple optional_extra sections with overlaps + col_def = ["ID", {"optional_extra": ["ID", "GUID"]}, {"optional_extra": ["Name", "StationID"]}] + result = converter._normalize_extra_col_def(col_def) + expected = ["ID", {"optional_extra": ["GUID"]}, {"optional_extra": ["Name", "StationID"]}] + assert result == expected + + # Test 6: All optional columns are duplicates (should remove optional_extra section) + col_def = ["ID", "Name", {"optional_extra": ["ID", "Name"]}] + result = converter._normalize_extra_col_def(col_def) + expected = ["ID", "Name"] + assert result == expected + + # Test 7: Empty optional_extra list (should be removed) + col_def = ["ID", {"optional_extra": []}] + result = converter._normalize_extra_col_def(col_def) + expected = ["ID"] + assert result == expected + + +def test_handle_extra_info_with_duplicates(): + """Test that _handle_extra_info correctly handles duplicates between regular and optional columns""" + # Arrange + converter = TabularConverter(mapping_file=MAPPING_FILE) + data = TabularData( + test_table=pd.DataFrame( + { + "ID": ["N001", "N002", "N003"], + "Name": ["Node1", "Node2", "Node3"], + "GUID": ["g1", "g2", "g3"], + "StationID": ["ST1", "ST2", "ST3"], + } + ) + ) + + # Column definition with duplicates (ID appears in both regular and optional_extra) + col_def = ["ID", "Name", {"optional_extra": ["ID", "GUID", "StationID"]}] + + uuids = np.array([100, 200, 300]) + extra_info = {} + + # Act + converter._handle_extra_info( + data=data, + table="test_table", + col_def=col_def, + uuids=uuids, + table_mask=None, + extra_info=extra_info, + ) + + # Assert + # ID should appear only once (from regular column, not duplicated from optional_extra) + assert 100 in extra_info + assert extra_info[100]["ID"] == "N001" + assert extra_info[100]["Name"] == "Node1" + assert extra_info[100]["GUID"] == "g1" + assert extra_info[100]["StationID"] == "ST1" + + # Check that we don't have duplicate ID columns in the result + result_keys = list(extra_info[100].keys()) + assert result_keys.count("ID") == 1, f"ID should appear only once, but got: {result_keys}" + + # Similar checks for other rows + assert extra_info[200]["ID"] == "N002" + assert extra_info[300]["ID"] == "N003" + + +def test_optional_extra_with_duplicates_integration(): + """Integration test to verify duplicate elimination works in a full conversion scenario""" + # Arrange + converter = TabularConverter(mapping_file=MAPPING_FILE) + + # Create test data 
with columns that will appear in both regular and optional_extra + data = TabularData( + test_table=pd.DataFrame( + { + "id": [1, 2, 3], + "name": ["Node1", "Node2", "Node3"], + "u_nom": [10.0, 10.0, 0.4], + "guid": ["g1", "g2", "g3"], + "station": ["ST1", "ST2", "ST3"], + } + ) + ) + + # Column definition that has ID in both places - regular should dominate + col_def = [ + "id", # Regular column + "name", # Regular column + {"optional_extra": ["id", "guid", "station"]}, # ID is duplicate, guid and station are new + ] + + extra_info = {} + uuids = np.array([100, 200, 300]) + + # Act + converter._handle_extra_info( + data=data, + table="test_table", + col_def=col_def, + uuids=uuids, + table_mask=None, + extra_info=extra_info, + ) + + # Assert - verify no duplicate columns and all expected columns are present + for uuid, expected_id in zip([100, 200, 300], [1, 2, 3]): + assert uuid in extra_info + extra_data = extra_info[uuid] + + # Should have all columns but ID should not be duplicated + expected_keys = {"id", "name", "guid", "station"} + assert set(extra_data.keys()) == expected_keys + + # Verify values + assert extra_data["id"] == expected_id + assert extra_data["name"] == f"Node{expected_id}" + assert extra_data["guid"] == f"g{expected_id}" + assert extra_data["station"] == f"ST{expected_id}" diff --git a/tests/unit/converters/test_vision_excel_converter.py b/tests/unit/converters/test_vision_excel_converter.py index 99bcfaec..a783c401 100644 --- a/tests/unit/converters/test_vision_excel_converter.py +++ b/tests/unit/converters/test_vision_excel_converter.py @@ -147,3 +147,105 @@ def test_ambiguity_in_vision_excel(): excel_file_checker = ExcelAmbiguityChecker(file_path=ambiguious_test_file.as_posix()) res, _ = excel_file_checker.check_ambiguity() assert res + + +def test_optional_extra_all_columns_present(): + """Test Vision Excel conversion with all optional columns present""" + # Arrange + test_file = Path(__file__).parents[2] / "data" / "vision" / "vision_optional_extra_full.xlsx" + mapping_file = Path(__file__).parents[2] / "data" / "vision" / "vision_optional_extra_mapping.yaml" + + from power_grid_model import ComponentType + + converter = VisionExcelConverter(source_file=test_file, mapping_file=mapping_file) + + # Act + result, extra_info = converter.load_input_data() + + # Assert + assert ComponentType.node in result + assert len(result[ComponentType.node]) == 3 + + # Check that all extra fields are present (including optional ones) + for node in result[ComponentType.node]: + node_id = node["id"] + assert node_id in extra_info + assert "ID" in extra_info[node_id] + assert "Name" in extra_info[node_id] + assert "GUID" in extra_info[node_id] # Optional but present + assert "StationID" in extra_info[node_id] # Optional but present + + # Verify specific values + node_0_id = result[ComponentType.node][0]["id"] + assert extra_info[node_0_id]["ID"] == "N001" + assert extra_info[node_0_id]["Name"] == "Node1" + assert extra_info[node_0_id]["GUID"] == "guid-001" + assert extra_info[node_0_id]["StationID"] == "ST1" + + +def test_optional_extra_some_columns_missing(): + """Test Vision Excel conversion with some optional columns missing""" + # Arrange + test_file = Path(__file__).parents[2] / "data" / "vision" / "vision_optional_extra_partial.xlsx" + mapping_file = Path(__file__).parents[2] / "data" / "vision" / "vision_optional_extra_mapping.yaml" + + from power_grid_model import ComponentType + + converter = VisionExcelConverter(source_file=test_file, mapping_file=mapping_file) + + # Act + 
result, extra_info = converter.load_input_data() + + # Assert + assert ComponentType.node in result + assert len(result[ComponentType.node]) == 3 + + # Check that required and present optional fields are included + for node in result[ComponentType.node]: + node_id = node["id"] + assert node_id in extra_info + assert "ID" in extra_info[node_id] + assert "Name" in extra_info[node_id] + assert "GUID" in extra_info[node_id] # Optional and present + assert "StationID" not in extra_info[node_id] # Optional and missing - should not be present + + # Verify specific values + node_1_id = result[ComponentType.node][1]["id"] + assert extra_info[node_1_id]["ID"] == "N002" + assert extra_info[node_1_id]["Name"] == "Node2" + assert extra_info[node_1_id]["GUID"] == "guid-002" + + +def test_optional_extra_all_optional_missing(): + """Test Vision Excel conversion with all optional columns missing""" + # Arrange + test_file = Path(__file__).parents[2] / "data" / "vision" / "vision_optional_extra_minimal.xlsx" + mapping_file = Path(__file__).parents[2] / "data" / "vision" / "vision_optional_extra_mapping.yaml" + + from power_grid_model import ComponentType + + converter = VisionExcelConverter(source_file=test_file, mapping_file=mapping_file) + + # Act + result, extra_info = converter.load_input_data() + + # Assert + assert ComponentType.node in result + assert len(result[ComponentType.node]) == 3 + + # Check that only required fields are present + for node in result[ComponentType.node]: + node_id = node["id"] + assert node_id in extra_info + assert "ID" in extra_info[node_id] + assert "Name" in extra_info[node_id] + assert "GUID" not in extra_info[node_id] # Optional and missing + assert "StationID" not in extra_info[node_id] # Optional and missing + + # Verify specific values + node_2_id = result[ComponentType.node][2]["id"] + assert extra_info[node_2_id]["ID"] == "N003" + assert extra_info[node_2_id]["Name"] == "Node3" + # Check that optional fields are not present (only ID, Name, and id_reference) + assert "GUID" not in extra_info[node_2_id] + assert "StationID" not in extra_info[node_2_id]
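Taken together, the tests above correspond to the following user-facing flow. This is a hedged sketch: the file paths are placeholders, while `VisionExcelConverter(source_file=..., mapping_file=...)` and `load_input_data()` are used exactly as in the tests in this diff:

```python
from pathlib import Path

from power_grid_model import ComponentType
from power_grid_model_io.converters import VisionExcelConverter

# Placeholder paths: a Vision export plus a mapping file whose `extra`
# section contains an `optional_extra` list (see the YAML examples above).
source_file = Path("my_vision_export.xlsx")
mapping_file = Path("my_mapping_with_optional_extra.yaml")

converter = VisionExcelConverter(source_file=source_file, mapping_file=mapping_file)
grid_data, extra_info = converter.load_input_data()

# Required extra columns (e.g. "ID", "Name") are always present in extra_info;
# optional ones (e.g. "GUID", "StationID") appear only when the export has them.
for node in grid_data[ComponentType.node]:
    info = extra_info[node["id"]]
    print(info["ID"], info.get("GUID", "<no GUID column in this export>"))
```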