From d62607ffc21250f56645b77d344aee6ac7a3e83e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matthias=20B=C3=BCschelberger?=
Date: Tue, 10 Dec 2024 17:15:59 +0100
Subject: [PATCH] add measurementUnit submodel

---
 data2rdf/config.py        |  7 +++++
 data2rdf/models/graph.py  | 60 ++++++++++++++++++++++++++++++++++++--
 data2rdf/parsers/base.py  | 61 ++++++++++++++++++++++++++++++++-------
 data2rdf/parsers/utils.py | 23 +++++++++++++++
 data2rdf/qudt/utils.py    | 47 ++++++++++++++++++++++++++++++
 data2rdf/utils.py         | 16 ++++++++++
 data2rdf/warnings.py      |  4 +++
 tests/abox/test_models.py | 15 ++++++++++
 8 files changed, 221 insertions(+), 12 deletions(-)

diff --git a/data2rdf/config.py b/data2rdf/config.py
index 355ac8c5..a3b1c97d 100755
--- a/data2rdf/config.py
+++ b/data2rdf/config.py
@@ -20,6 +20,8 @@ class Config(BaseSettings):
         description="URI to QUDT quantity kind ontology for unit conversion",
     )
 
+    language: str = Field("en", description="Language for the unit labels")
+
     base_iri: Union[str, AnyUrl] = Field(
         "https://www.example.org", description="Base IRI for individuals."
     )
@@ -77,6 +79,11 @@ class Config(BaseSettings):
         description="In TBox mode, exclude the title of the ontology in the graph.",
     )
 
+    dsms_schema_default: bool = Field(
+        True,
+        description="""Default value for the `dsms_schema` parameter of the `to_dict` method.""",
+    )
+
     model_config = ConfigDict(extra="ignore")
 
     @model_validator(mode="after")
diff --git a/data2rdf/models/graph.py b/data2rdf/models/graph.py
index 29be3cd1..da15eaa6 100644
--- a/data2rdf/models/graph.py
+++ b/data2rdf/models/graph.py
@@ -3,8 +3,8 @@
 import warnings
 from typing import Any, Dict, List, Optional, Union
 
-from data2rdf.qudt.utils import _get_query_match
-from data2rdf.utils import make_prefix
+from data2rdf.qudt.utils import _get_qudt_label_and_symbol, _get_query_match
+from data2rdf.utils import make_prefix, split_namespace
 from data2rdf.warnings import ParserWarning
 
 from data2rdf.models.utils import (  # isort:skip
@@ -18,6 +18,7 @@
     BasicGraphModel,
     BasicSuffixModel,
     RelationType,
+    BaseConfigModel,
 )
 
 from pydantic import (  # isort:skip
@@ -109,6 +110,43 @@ def json_ld(self) -> "Dict[str, Any]":
         }
 
 
+class MeasurementUnit(BaseConfigModel):
+    """Measurement unit with IRI, label, symbol and namespace"""
+    iri: Union[str, AnyUrl] = Field(
+        ...,
+        description="Ontological IRI related to the measurement unit",
+    )
+    label: Optional[str] = Field(
+        None,
+        description="Label of the measurement unit",
+    )
+    symbol: Optional[str] = Field(
+        None,
+        description="Symbol of the measurement unit",
+    )
+    namespace: Optional[str] = Field(
+        None,
+        description="Namespace of the measurement unit",
+    )
+
+    @model_validator(mode="after")
+    @classmethod
+    def validate_measurement_unit(cls, self) -> "MeasurementUnit":
+        """Fill a missing label, symbol or namespace from the unit IRI"""
+        if not (self.label and self.symbol):
+            # Only query QUDT (and risk a warning) if a value is missing
+            unit = _get_qudt_label_and_symbol(
+                str(self.iri), self.config.qudt_units, self.config.language
+            )
+            if not self.label and "label" in unit:
+                self.label = unit["label"]
+            if not self.symbol and "symbol" in unit:
+                self.symbol = unit["symbol"]
+        if not self.namespace:
+            self.namespace = split_namespace(str(self.iri))
+        return self
+
+
 class QuantityGraph(BasicGraphModel, BasicSuffixModel):
     """Quantity with or without a discrete value and a unit
     E.g. a quantity with a single value and unit _or_
@@ -133,6 +171,14 @@ class QuantityGraph(BasicGraphModel, BasicSuffixModel):
         for mapping the data value to the individual.""",
     )
 
+    measurement_unit: Optional[MeasurementUnit] = Field(
+        None,
+        description="Detailed QUDT Measurement Unit specification",
+        alias=AliasChoices(
+            "measurement_unit", "measurementunit", "measurementUnit"
+        ),
+    )
+
     @field_validator("value", mode="after")
     @classmethod
     def validate_value(
@@ -174,6 +220,16 @@ def validate_unit(
             value = str(value)
         return value
 
+    @model_validator(mode="after")
+    @classmethod
+    def validate_quantity_graph(cls, self) -> "QuantityGraph":
+        """Derive `unit` or `measurement_unit` from the other if missing"""
+        if not self.measurement_unit and self.unit:
+            self.measurement_unit = MeasurementUnit(iri=self.unit)
+        if self.measurement_unit and not self.unit:
+            self.unit = str(self.measurement_unit.iri)
+        return self
+
     @property
     def json_ld(cls) -> Dict[str, Any]:
         """Return dict of json-ld for graph"""
diff --git a/data2rdf/parsers/base.py b/data2rdf/parsers/base.py
index 84081afa..e075692a 100644
--- a/data2rdf/parsers/base.py
+++ b/data2rdf/parsers/base.py
@@ -1,6 +1,7 @@
 """Data2RDF base model for parsers"""
 
 import json
+import warnings
 from abc import abstractmethod
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
@@ -9,7 +10,7 @@
 from data2rdf.config import Config
 from data2rdf.modes import PipelineMode
 
-from .utils import load_mapping_file
+from .utils import generate_id, load_mapping_file
 
 from pydantic import (  # isort:skip
     BaseModel,
@@ -189,15 +190,55 @@ def time_series(self) -> "pd.DataFrame":
         return self._time_series
 
     @property
-    def plain_metadata(self) -> "Dict[str, Any]":
-        """Metadata as flat json - without units and iris.
-        Useful e.g. for the custom properties of the DSMS."""
-        return {
-            str(metadatum.iri).split(self.config.separator)[
-                -1
-            ]: metadatum.value
-            for metadatum in self.general_metadata
-        }
+    def plain_metadata(self) -> "Union[List[Dict[str, Any]], Dict[str, Any]]":
+        """Deprecated: use `to_dict()` instead.
+
+        Returns the result of `to_dict()` called with the configured
+        `dsms_schema_default`.
+        """
+        message = """
+        `plain_metadata` is deprecated and will be removed in a future version.
+        Use the `to_dict()` instead."""
+        warnings.warn(message, DeprecationWarning, stacklevel=2)
+        return self.to_dict(dsms_schema=self.config.dsms_schema_default)
+
+    def to_dict(
+        self, dsms_schema: bool = False
+    ) -> "Union[List[Dict[str, Any]], Dict[str, Any]]":
+        """Return the general metadata as DSMS custom properties.
+
+        Args:
+            dsms_schema: If True, give every entry an `id` and wrap the
+                entries into a single section named "General".
+
+        Returns:
+            The list of entries, or the dict of sections if `dsms_schema`
+            is True.
+        """
+        metadata = []
+        for metadatum in self.general_metadata:
+            prop = {
+                "label": str(metadatum.iri).split(self.config.separator)[-1],
+                "value": metadatum.value,
+            }
+            # `measurement_unit` is optional, so it may exist but be None
+            unit = getattr(metadatum, "measurement_unit", None)
+            if unit is not None:
+                prop["measurementUnit"] = unit.model_dump(exclude={"config"})
+            metadata.append(prop)
+        if dsms_schema:
+            for metadatum in metadata:
+                metadatum["id"] = generate_id()
+            metadata = {
+                "sections": [
+                    {
+                        "id": generate_id(),
+                        "name": "General",
+                        "entries": metadata,
+                    }
+                ]
+            }
+        return metadata
 
 
 class BaseFileParser(BaseParser):
diff --git a/data2rdf/parsers/utils.py b/data2rdf/parsers/utils.py
index f787bbe7..066fc62a 100644
--- a/data2rdf/parsers/utils.py
+++ b/data2rdf/parsers/utils.py
@@ -1,6 +1,9 @@
 """Data2RDF parser utilities"""
 
 import json
+import random
+import string
+import time
 import warnings
 from typing import TYPE_CHECKING
 
@@ -216,3 +219,23 @@ def _value_exists(value: "Any") -> bool:
         bool: True if the value exists and is valid, otherwise False.
     """
     return pd.notnull(value) and value != ""
+
+
+def generate_id(prefix: str = "id") -> str:
+    # Generate a unique part using time and random characters
+    """
+    Generates a unique id using a combination of the current time and 6 random characters.
+
+    Args:
+        prefix (str): The prefix to use for the generated id. Defaults to "id".
+
+    Returns:
+        str: The generated id.
+    """
+    unique_part = f"{int(time.time() * 1000)}"  # Milliseconds since epoch
+    random_part = "".join(
+        random.choices(string.ascii_lowercase + string.digits, k=6)  # nosec
+    )
+    # Combine prefix, unique part, and random part
+    generated_id = f"{prefix}{unique_part}{random_part}"
+    return generated_id
diff --git a/data2rdf/qudt/utils.py b/data2rdf/qudt/utils.py
index c2ede851..811cb87d 100644
--- a/data2rdf/qudt/utils.py
+++ b/data2rdf/qudt/utils.py
@@ -7,6 +7,8 @@
 import requests
 from rdflib import Graph
 
+from data2rdf.warnings import QUDTMappingWarning
+
 
 def _qudt_sparql(symbol: str) -> str:
     return f"""PREFIX qudt: <http://qudt.org/schema/qudt/>
@@ -82,3 +84,48 @@ def _check_qudt_mapping(symbol: Optional[str]) -> Optional[str]:
     else:
         unit = {}
     return unit
+
+
+def _get_qudt_label_and_symbol(
+    iri: str, qudt_iri: str, language: str
+) -> "dict[str, str]":
+    """Look up the label and symbol of a unit in the QUDT ontology.
+
+    Args:
+        iri: IRI of the unit to look up.
+        qudt_iri: IRI of the QUDT ontology, passed to `_get_qudt_graph`.
+        language: Language tag which the label must have.
+
+    Returns:
+        A dict with the keys `label` and `symbol`. It is empty if nothing
+        was found; if several matches were found, the first one is used.
+        A `QUDTMappingWarning` is issued in both of these cases.
+    """
+    graph = _get_qudt_graph(qudt_iri)
+    gen_query = f"""PREFIX qudt: <http://qudt.org/schema/qudt/>
+    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+    SELECT DISTINCT ?label ?symbol
+    WHERE {{
+        <{iri}> rdfs:label ?label .
+        <{iri}> qudt:symbol ?symbol .
+        FILTER (LANG(?label) = "{language}")
+    }}"""
+    match = [
+        {"label": str(row["label"]), "symbol": str(row["symbol"])}
+        for row in graph.query(gen_query)
+    ]
+    if not match:
+        warnings.warn(
+            f"No QUDT label and symbol found for unit with iri `{iri}`.",
+            QUDTMappingWarning,
+        )
+        unit = {}
+    elif len(match) > 1:
+        warnings.warn(
+            f"Multiple QUDT symbols and labels found for unit with iri `{iri}`.",
+            QUDTMappingWarning,
+        )
+        unit = match[0]
+    else:
+        unit = match.pop()
+    return unit
diff --git a/data2rdf/utils.py b/data2rdf/utils.py
index e80aab5b..3d865706 100644
--- a/data2rdf/utils.py
+++ b/data2rdf/utils.py
@@ -12,3 +12,19 @@ def make_prefix(config: "Config") -> str:
     else:
         prefix = str(config.base_iri)
     return prefix
+
+
+def split_namespace(iri: str) -> str:
+    """
+    Return the namespace part of the given iri.
+
+    Args:
+        iri: The iri to split.
+
+    Returns:
+        The iri up to the first `#`, else up to the last `/` (exclusive).
+    """
+    iri = str(iri)  # also accept URL objects, e.g. pydantic's `AnyUrl`
+    if "#" in iri:
+        return iri.split("#")[0]
+    return "/".join(iri.split("/")[:-1])
diff --git a/data2rdf/warnings.py b/data2rdf/warnings.py
index 807e987b..4bda86d4 100644
--- a/data2rdf/warnings.py
+++ b/data2rdf/warnings.py
@@ -7,3 +7,7 @@ class MappingMissmatchWarning(UserWarning):
 
 class ParserWarning(UserWarning):
     """A warning raised for a specific context set for a parser"""
+
+
+class QUDTMappingWarning(UserWarning):
+    """A warning raised for a specific context set for a QUDT mapping"""
diff --git a/tests/abox/test_models.py b/tests/abox/test_models.py
index 6091cf62..78ca60cf 100644
--- a/tests/abox/test_models.py
+++ b/tests/abox/test_models.py
@@ -37,6 +37,11 @@ def test_quantity_model(config) -> None:
     assert model.graph.isomorphic(expected_graph)
     assert str(model.graph.identifier) == config["graph_identifier"]
 
+    assert model.measurement_unit.iri == "http://qudt.org/vocab/unit/MilliM"
+    assert model.measurement_unit.symbol == "mm"
+    assert model.measurement_unit.label == "Millimetre"
+    assert model.measurement_unit.namespace == "http://qudt.org/vocab/unit"
+
 
 @pytest.mark.parametrize("unit", [unit_string, unit_iri])
 def test_valued_quantity(unit):
@@ -61,6 +66,11 @@ def test_valued_quantity(unit):
 
     assert model.graph.isomorphic(expected_graph)
 
+    assert model.measurement_unit.iri == "http://qudt.org/vocab/unit/MilliM"
+    assert model.measurement_unit.symbol == "mm"
+    assert model.measurement_unit.label == "Millimetre"
+    assert model.measurement_unit.namespace == "http://qudt.org/vocab/unit"
+
 
 def test_bad_with_blank_space():
     from rdflib import Graph
@@ -83,3 +93,8 @@ def test_bad_with_blank_space():
     )
 
     assert model.graph.isomorphic(expected_graph)
+
+    assert model.measurement_unit.iri == "http://qudt.org/vocab/unit/MilliM"
+    assert model.measurement_unit.symbol == "mm"
+    assert model.measurement_unit.label == "Millimetre"
+    assert model.measurement_unit.namespace == "http://qudt.org/vocab/unit"