From d62607ffc21250f56645b77d344aee6ac7a3e83e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matthias=20B=C3=BCschelberger?=
 <matthias.bueschelberger@iwm.fraunhofer.de>
Date: Tue, 10 Dec 2024 17:15:59 +0100
Subject: [PATCH] add measurementUnit submodel

---
 data2rdf/config.py        |  7 +++++
 data2rdf/models/graph.py  | 55 +++++++++++++++++++++++++++++++++++++--
 data2rdf/parsers/base.py  | 45 +++++++++++++++++++++++++-------
 data2rdf/parsers/utils.py | 23 ++++++++++++++++
 data2rdf/qudt/utils.py    | 35 +++++++++++++++++++++++++
 data2rdf/utils.py         | 16 ++++++++++++
 data2rdf/warnings.py      |  4 +++
 tests/abox/test_models.py | 15 +++++++++++
 8 files changed, 188 insertions(+), 12 deletions(-)

diff --git a/data2rdf/config.py b/data2rdf/config.py
index 355ac8c5..a3b1c97d 100755
--- a/data2rdf/config.py
+++ b/data2rdf/config.py
@@ -20,6 +20,8 @@ class Config(BaseSettings):
         description="URI to QUDT quantity kind ontology for unit conversion",
     )
 
+    language: str = Field("en", description="Language for the unit labels")
+
     base_iri: Union[str, AnyUrl] = Field(
         "https://www.example.org", description="Base IRI for individuals."
     )
@@ -77,6 +79,11 @@ class Config(BaseSettings):
         description="In TBox mode, exclude the title of the ontology in the graph.",
     )
 
+    dsms_schema_default: bool = Field(
+        True,
+        description="""Default value for the `dsms_schema` parameter of the `to_dict` method.""",
+    )
+
     model_config = ConfigDict(extra="ignore")
 
     @model_validator(mode="after")
diff --git a/data2rdf/models/graph.py b/data2rdf/models/graph.py
index 29be3cd1..da15eaa6 100644
--- a/data2rdf/models/graph.py
+++ b/data2rdf/models/graph.py
@@ -3,8 +3,8 @@
 import warnings
 from typing import Any, Dict, List, Optional, Union
 
-from data2rdf.qudt.utils import _get_query_match
-from data2rdf.utils import make_prefix
+from data2rdf.qudt.utils import _get_qudt_label_and_symbol, _get_query_match
+from data2rdf.utils import make_prefix, split_namespace
 from data2rdf.warnings import ParserWarning
 
 from data2rdf.models.utils import (  # isort:skip
@@ -18,6 +18,7 @@
     BasicGraphModel,
     BasicSuffixModel,
     RelationType,
+    BaseConfigModel,
 )
 
 from pydantic import (  # isort:skip
@@ -109,6 +110,39 @@ def json_ld(self) -> "Dict[str, Any]":
         }
 
 
+class MeasurementUnit(BaseConfigModel):
+    """Unit of a quantity; unset label/symbol are looked up in QUDT"""
+
+    iri: Union[str, AnyUrl] = Field(
+        ...,
+        description="Ontological IRI related to the measurement unit",
+    )
+    label: Optional[str] = Field(
+        None,
+        description="Label of the measurement unit",
+    )
+    symbol: Optional[str] = Field(
+        None,
+        description="Symbol of the measurement unit",
+    )
+    namespace: Optional[str] = Field(
+        None,
+        description="Namespace of the measurement unit",
+    )
+
+    @model_validator(mode="after")
+    def validate_measurement_unit(self) -> "MeasurementUnit":
+        if not (self.label and self.symbol):  # only query QUDT if needed
+            unit = _get_qudt_label_and_symbol(
+                str(self.iri), self.config.qudt_units, self.config.language
+            )
+            self.label = self.label or unit.get("label", self.label)
+            self.symbol = self.symbol or unit.get("symbol", self.symbol)
+        # `iri` may be an `AnyUrl`, which does not support string operations
+        self.namespace = self.namespace or split_namespace(str(self.iri))
+        return self
+
+
 class QuantityGraph(BasicGraphModel, BasicSuffixModel):
     """Quantity with or without a discrete value and a unit
     E.g. a quantity with a single value and unit _or_
@@ -133,6 +167,14 @@ class QuantityGraph(BasicGraphModel, BasicSuffixModel):
         for mapping the data value to the individual.""",
     )
 
+    measurement_unit: Optional[MeasurementUnit] = Field(
+        None,
+        description="Detailed QUDT Measurement Unit specification",
+        alias=AliasChoices(
+            "measurement_unit", "measurementunit", "measurementUnit"
+        ),
+    )
+
     @field_validator("value", mode="after")
     @classmethod
     def validate_value(
@@ -174,6 +216,15 @@ def validate_unit(
             value = str(value)
         return value
 
+    @model_validator(mode="after")
+    def validate_quantity_graph(self) -> "QuantityGraph":
+        """Derive `measurement_unit` from `unit` or vice versa, if unset"""
+        if not self.measurement_unit and self.unit:
+            self.measurement_unit = MeasurementUnit(iri=self.unit)
+        elif self.measurement_unit and not self.unit:
+            # keep `unit` a plain string even if the iri is an `AnyUrl`
+            self.unit = str(self.measurement_unit.iri)
+        return self
+
     @property
     def json_ld(cls) -> Dict[str, Any]:
         """Return dict of json-ld for graph"""
diff --git a/data2rdf/parsers/base.py b/data2rdf/parsers/base.py
index 84081afa..e075692a 100644
--- a/data2rdf/parsers/base.py
+++ b/data2rdf/parsers/base.py
@@ -1,6 +1,7 @@
 """Data2RDF base model for parsers"""
 
 import json
+import warnings
 from abc import abstractmethod
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
@@ -9,7 +10,7 @@
 from data2rdf.config import Config
 from data2rdf.modes import PipelineMode
 
-from .utils import load_mapping_file
+from .utils import generate_id, load_mapping_file
 
 from pydantic import (  # isort:skip
     BaseModel,
@@ -189,15 +190,39 @@ def time_series(self) -> "pd.DataFrame":
         return self._time_series
 
     @property
-    def plain_metadata(self) -> "Dict[str, Any]":
-        """Metadata as flat json - without units and iris.
-        Useful e.g. for the custom properties of the DSMS."""
-        return {
-            str(metadatum.iri).split(self.config.separator)[
-                -1
-            ]: metadatum.value
-            for metadatum in self.general_metadata
-        }
+    def plain_metadata(self) -> "Union[List[Dict[str, Any]], Dict[str, Any]]":
+        """Deprecated: forwards to `to_dict()` with the configured default"""
+        message = "`plain_metadata` is deprecated and will be removed in a "
+        message += "future version. Use `to_dict()` instead."
+        warnings.warn(message, DeprecationWarning, stacklevel=2)
+        return self.to_dict(dsms_schema=self.config.dsms_schema_default)
+
+    def to_dict(
+        self, dsms_schema: bool = False
+    ) -> "Union[List[Dict[str, Any]], Dict[str, Any]]":
+        """Return the general metadata as a list of properties.
+
+        If `dsms_schema` is True, each property gets an `id` and the list
+        is wrapped into a DSMS `sections` dict with one `General` section,
+        so a dict is returned instead of a list.
+        """
+        metadata = []
+        for metadatum in self.general_metadata:
+            prop = {
+                "label": str(metadatum.iri).split(self.config.separator)[-1],
+                "value": metadatum.value,
+            }
+            # quantities without a unit carry `measurement_unit=None`
+            unit = getattr(metadatum, "measurement_unit", None)
+            if unit is not None:
+                prop["measurementUnit"] = unit.model_dump(exclude={"config"})
+            if dsms_schema:
+                prop["id"] = generate_id()
+            metadata.append(prop)
+        if not dsms_schema:
+            return metadata
+        section = {"id": generate_id(), "name": "General", "entries": metadata}
+        return {"sections": [section]}
 
 
 class BaseFileParser(BaseParser):
diff --git a/data2rdf/parsers/utils.py b/data2rdf/parsers/utils.py
index f787bbe7..066fc62a 100644
--- a/data2rdf/parsers/utils.py
+++ b/data2rdf/parsers/utils.py
@@ -1,6 +1,9 @@
 """Data2RDF parser utilities"""
 
 import json
+import random
+import string
+import time
 import warnings
 from typing import TYPE_CHECKING
 
@@ -216,3 +219,23 @@ def _value_exists(value: "Any") -> bool:
         bool: True if the value exists and is valid, otherwise False.
     """
     return pd.notnull(value) and value != ""
+
+
+def generate_id(prefix: str = "id") -> str:
+    """Build an id from a prefix, a timestamp and a random suffix.
+
+    The id is `<prefix><milliseconds since epoch><6 random characters>`,
+    where the random characters are lowercase ASCII letters or digits.
+
+    Args:
+        prefix: Leading part of the generated id. Defaults to "id".
+
+    Returns:
+        The generated id.
+    """
+    alphabet = string.ascii_lowercase + string.digits
+    # non-cryptographic randomness is accepted here (hence `nosec`);
+    # the suffix follows the millisecond timestamp in the id
+    suffix = "".join(random.choices(alphabet, k=6))  # nosec
+    timestamp_ms = int(time.time() * 1000)
+    return f"{prefix}{timestamp_ms}{suffix}"
diff --git a/data2rdf/qudt/utils.py b/data2rdf/qudt/utils.py
index c2ede851..811cb87d 100644
--- a/data2rdf/qudt/utils.py
+++ b/data2rdf/qudt/utils.py
@@ -7,6 +7,8 @@
 import requests
 from rdflib import Graph
 
+from data2rdf.warnings import QUDTMappingWarning
+
 
 def _qudt_sparql(symbol: str) -> str:
     return f"""PREFIX qudt: <http://qudt.org/schema/qudt/>
@@ -82,3 +84,36 @@ def _check_qudt_mapping(symbol: Optional[str]) -> Optional[str]:
     else:
         unit = {}
     return unit
+
+
+def _get_qudt_label_and_symbol(
+    iri: str, qudt_iri: str, language: str
+) -> "dict[str, str]":
+    """Return `label` (in `language`) and `symbol` of a QUDT unit as dict.
+
+    Warns and returns an empty dict if the QUDT graph has no match.
+    """
+    graph = _get_qudt_graph(qudt_iri)
+    gen_query = f"""PREFIX qudt: <http://qudt.org/schema/qudt/>
+    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+    SELECT DISTINCT ?label ?symbol
+    WHERE {{
+        <{iri}> rdfs:label ?label .
+        <{iri}> qudt:symbol ?symbol .
+        FILTER (LANG(?label) = "{language}")
+    }}"""
+    match = [
+        {"label": str(row["label"]), "symbol": str(row["symbol"])}
+        for row in graph.query(gen_query)
+    ]
+    if not match:
+        warnings.warn(
+            f"No QUDT label and symbol found for unit with iri `{iri}`.",
+            QUDTMappingWarning,
+        )
+        return {}
+    if len(match) > 1:  # ambiguous: warn, then fall back to the first hit
+        warnings.warn(
+            f"Multiple QUDT symbols and labels found for unit with iri `{iri}`.",
+            QUDTMappingWarning,
+        )
+    return match[0]
diff --git a/data2rdf/utils.py b/data2rdf/utils.py
index e80aab5b..3d865706 100644
--- a/data2rdf/utils.py
+++ b/data2rdf/utils.py
@@ -12,3 +12,19 @@ def make_prefix(config: "Config") -> str:
     else:
         prefix = str(config.base_iri)
     return prefix
+
+
+def split_namespace(iri: str) -> str:
+    """
+    Return the namespace of the given iri; the localname is dropped.
+
+    Args:
+        iri: The iri to split. Non-string iris are coerced with `str()`.
+
+    Returns:
+        The part before `#` or, without `#`, before the last `/`.
+    """
+    iri = str(iri)  # e.g. pydantic's `AnyUrl` does not support `in`/`split`
+    if "#" in iri:
+        return iri.split("#")[0]
+    return "/".join(iri.split("/")[:-1])
diff --git a/data2rdf/warnings.py b/data2rdf/warnings.py
index 807e987b..4bda86d4 100644
--- a/data2rdf/warnings.py
+++ b/data2rdf/warnings.py
@@ -7,3 +7,7 @@ class MappingMissmatchWarning(UserWarning):
 
 class ParserWarning(UserWarning):
     """A warning raised for a specific context set for a parser"""
+
+
+class QUDTMappingWarning(UserWarning):
+    """A warning raised when a unit cannot be mapped unambiguously to QUDT"""
diff --git a/tests/abox/test_models.py b/tests/abox/test_models.py
index 6091cf62..78ca60cf 100644
--- a/tests/abox/test_models.py
+++ b/tests/abox/test_models.py
@@ -37,6 +37,11 @@ def test_quantity_model(config) -> None:
     assert model.graph.isomorphic(expected_graph)
     assert str(model.graph.identifier) == config["graph_identifier"]
 
+    assert model.measurement_unit.iri == "http://qudt.org/vocab/unit/MilliM"
+    assert model.measurement_unit.symbol == "mm"
+    assert model.measurement_unit.label == "Millimetre"
+    assert model.measurement_unit.namespace == "http://qudt.org/vocab/unit"
+
 
 @pytest.mark.parametrize("unit", [unit_string, unit_iri])
 def test_valued_quantity(unit):
@@ -61,6 +66,11 @@ def test_valued_quantity(unit):
 
     assert model.graph.isomorphic(expected_graph)
 
+    assert model.measurement_unit.iri == "http://qudt.org/vocab/unit/MilliM"
+    assert model.measurement_unit.symbol == "mm"
+    assert model.measurement_unit.label == "Millimetre"
+    assert model.measurement_unit.namespace == "http://qudt.org/vocab/unit"
+
 
 def test_bad_with_blank_space():
     from rdflib import Graph
@@ -83,3 +93,8 @@ def test_bad_with_blank_space():
     )
 
     assert model.graph.isomorphic(expected_graph)
+
+    assert model.measurement_unit.iri == "http://qudt.org/vocab/unit/MilliM"
+    assert model.measurement_unit.symbol == "mm"
+    assert model.measurement_unit.label == "Millimetre"
+    assert model.measurement_unit.namespace == "http://qudt.org/vocab/unit"