
Commit

v2.4.1 (#78)
* add alias for dataframe fields for backwards compatibility and add pytests

* replace time series with dataframe in descriptions
MBueschelberger authored Jan 21, 2025
1 parent 3ec64eb commit ad576b3
Showing 32 changed files with 348 additions and 106 deletions.
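
The backwards-compatible aliases referenced in the commit message follow the pattern sketched below, using pydantic v2's `AliasChoices`. The model and test are illustrative only (not the repository's actual mapping classes or pytests), and the sketch places the choices on `validation_alias`, which is where pydantic v2 documents them; the diffs below pass them via `alias=`.

```python
# Sketch: accepting both the new and the legacy field name on input.
# `ExampleMapping` and the test are illustrative, not data2rdf code.
from typing import Optional

from pydantic import AliasChoices, BaseModel, ConfigDict, Field


class ExampleMapping(BaseModel):
    model_config = ConfigDict(populate_by_name=True)

    # New canonical name is `dataframe_sep`; the legacy `time_series_sep`
    # spelling is still accepted through the alias choices.
    dataframe_sep: Optional[str] = Field(
        None,
        description="Column separator of the dataframe header",
        validation_alias=AliasChoices("dataframe_sep", "time_series_sep"),
    )


def test_old_and_new_field_names_are_accepted():
    # Both spellings populate the same field, so existing mappings keep working.
    assert ExampleMapping(time_series_sep=";").dataframe_sep == ";"
    assert ExampleMapping(dataframe_sep=";").dataframe_sep == ";"
```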
2 changes: 1 addition & 1 deletion data2rdf/config.py
@@ -39,7 +39,7 @@ class Config(BaseSettings):

data_download_uri: Union[str, AnyUrl] = Field(
"https://www.example.org/download",
- description="General base iri for downloading the time series after uploading",
+ description="General base iri for downloading the dataframe after uploading",
)

graph_identifier: Optional[Union[str, AnyUrl]] = Field(
4 changes: 2 additions & 2 deletions data2rdf/models/graph.py
@@ -146,7 +146,7 @@ def validate_measurement_unit(cls, self) -> "MeasurementUnit":
class QuantityGraph(BasicGraphModel, BasicSuffixModel):
"""Quantity with or without a discrete value and a unit
E.g. a quantity with a single value and unit _or_
- a quantity describing a column of a time series or table with a unit."""
+ a quantity describing a column of a dataframe or table with a unit."""

unit: Optional[Union[str, AnyUrl]] = Field(
None, description="QUDT Symbol or any other IRI for the unit mapping"
@@ -273,7 +273,7 @@ class PropertyGraph(BasicGraphModel, BasicSuffixModel):
"""Mapping for an individual with arbitrary property. E.g. the
name of a tester or a testing facility. The value must not have a
discrete value but can also be a reference to a column in a table or
- time series."""
+ dataframe."""

value: Optional[
Union[str, int, float, bool, AnyUrl, "PropertyGraph", "QuantityGraph"]
3 changes: 2 additions & 1 deletion data2rdf/models/mapping.py
@@ -185,7 +185,8 @@ class ABoxExcelMapping(ABoxBaseMapping):

dataframe_start: Optional[str] = Field(
None,
- description="Cell location for the start of the time series quantity",
+ description="Cell location for the start of the dataframe quantity",
+ alias=AliasChoices("dataframe_start", "time_series_start"),
)
worksheet: Optional[str] = Field(
None,
6 changes: 3 additions & 3 deletions data2rdf/parsers/base.py
@@ -46,7 +46,7 @@ class BaseParser(BaseModel):

dropna: bool = Field(
False,
- description="Drop all rows where ONLY NaN and None occur in the time series.",
+ description="Drop all rows where ONLY NaN and None occur in the dataframe.",
)

config: Config = Field(
@@ -344,7 +344,7 @@ def general_metadata(self) -> "List[BasicConceptMapping]":

@property
def dataframe_metadata(self) -> "List[BasicConceptMapping]":
"""Return time series metadata"""
"""Return dataframe metadata"""
if self.mode == PipelineMode.ABOX:
return self.abox.dataframe_metadata
else:
@@ -354,7 +354,7 @@ def dataframe_metadata(self) -> "List[BasicConceptMapping]":

@property
def dataframe(self) -> "Dict[str, Any]":
"""Return time series"""
"""Return dataframe"""
if self.mode == PipelineMode.ABOX:
return self.abox.dataframe
else:
30 changes: 18 additions & 12 deletions data2rdf/parsers/csv.py
@@ -8,7 +8,7 @@

import numpy as np
import pandas as pd
- from pydantic import Field
+ from pydantic import AliasChoices, Field

from data2rdf.models.graph import PropertyGraph, QuantityGraph
from data2rdf.utils import make_prefix
@@ -120,10 +120,16 @@ class CSVABoxParser(ABoxBaseParser):
)
metadata_length: int = Field(..., description="Length of the metadata")
dataframe_sep: Optional[str] = Field(
- None, description="Column separator of the time series header"
+ None,
+ description="Column separator of the dataframe header",
+ alias=AliasChoices("dataframe_sep", "time_series_sep"),
)
dataframe_header_length: int = Field(
- 2, description="Length of header of the time series"
+ 2,
+ description="Length of header of the dataframe",
+ alias=AliasChoices(
+     "dataframe_header_length", "time_series_header_length"
+ ),
)
fillna: Optional[Any] = Field(
"", description="Value to fill NaN values in the parsed dataframe."
@@ -148,7 +154,7 @@ def json_ld(self) -> "Dict[str, Any]":
Returns a JSON-LD representation of the CSV data in ABox mode.
This method generates a JSON-LD object that describes the CSV data,
- including its metadata, time series data, and relationships between them.
+ including its metadata, dataframe data, and relationships between them.
The returned JSON-LD object is in the format of a csvw:TableGroup,
which contains one or more csvw:Table objects. Each csvw:Table object
@@ -212,7 +218,7 @@ def json_ld(self) -> "Dict[str, Any]":
tables += [
{
"@type": "csvw:Table",
"rdfs:label": "Time series data",
"rdfs:label": "Dataframe",
"csvw:tableSchema": column_schema,
}
]
@@ -302,7 +308,7 @@ def _run_parser(
mapping: "List[ABoxBaseMapping]",
) -> None:
"""
- This function is responsible for parsing metadata, time series metadata, and time series data from a CSV file.
+ This function is responsible for parsing metadata, dataframe metadata, and dataframe data from a CSV file.
It takes in three parameters:
- `self`: The CSVParser instance.
@@ -311,8 +317,8 @@ def _run_parser(
The function returns None, but it populates the following instance variables:
- `self._general_metadata`: A list of PropertyGraph or QuantityGraph instances representing the general metadata.
- - `self._dataframe_metadata`: A list of QuantityGraph instances representing the time series metadata.
- - `self._dataframe`: A pandas DataFrame containing the time series data.
+ - `self._dataframe_metadata`: A list of QuantityGraph instances representing the dataframe metadata.
+ - `self._dataframe`: A pandas DataFrame containing the dataframe data.
The function also raises ValueError if the `metadata_length` is greater than 0 but `metadata_sep` is not set.
It raises TypeError if the unit for a key is not a string.
@@ -398,7 +404,7 @@ def _run_parser(
MappingMissmatchWarning,
)

- # parse time series data and meta data
+ # parse dataframe data and meta data
self._dataframe_metadata = []
self._dataframe = {}

@@ -441,7 +447,7 @@ def _run_parser(
# append model
self.dataframe_metadata.append(model)

- # assign time series data
+ # assign dataframe data
self._dataframe[model.suffix] = dataframe[key][
self.dataframe_header_length - 1 :
].to_list()
@@ -451,7 +457,7 @@ def _run_parser(
f"No match found in mapping for key `{key}`",
MappingMissmatchWarning,
)
- # set time series as pd dataframe
+ # set dataframe as pd dataframe
self._dataframe = pd.DataFrame.from_dict(
self._dataframe, orient="index"
).transpose()
@@ -485,7 +491,7 @@ def _parse_dataframe(
]
else:
warnings.warn(
"`dataframe_sep` is not set. Any potential time series in the data file will be skipped.",
"`dataframe_sep` is not set. Any potential dataframe in the data file will be skipped.",
ParserWarning,
)
response = []
14 changes: 7 additions & 7 deletions data2rdf/parsers/excel.py
@@ -145,8 +145,8 @@ def json_ld(self) -> Dict[str, Any]:
"""
Returns the JSON-LD representation of the data in ABox mode.
- The JSON-LD is constructed based on the metadata and time series data.
- If the file description is not suppressed, it includes the metadata and time series data tables.
+ The JSON-LD is constructed based on the metadata and dataframe data.
+ If the file description is not suppressed, it includes the metadata and dataframe data tables.
Otherwise, it returns a list of JSON-LD representations of the individual models.
:return: A dictionary representing the JSON-LD data.
@@ -194,7 +194,7 @@ def json_ld(self) -> Dict[str, Any]:
tables += [
{
"@type": "csvw:Table",
"rdfs:label": "Time series data",
"rdfs:label": "Dataframe",
"csvw:tableSchema": column_schema,
}
]
@@ -284,7 +284,7 @@ def _run_parser(
mapping: "List[ABoxExcelMapping]",
) -> None:
"""
- Parses the metadata, time series metadata, and time series from an Excel file.
+ Parses the metadata, dataframe metadata, and dataframe from an Excel file.
Args:
self (ExcelABoxParser): The instance of the ExcelABoxParser class.
@@ -326,7 +326,7 @@ def _run_parser(
are set. Only one of them must be set."""
)

- # find data for time series
+ # find data for dataframe
if datum.dataframe_start:
column_name = datum.dataframe_start.rstrip("0123456789")
dataframe_end = f"{column_name}{worksheet.max_row}"
@@ -338,7 +338,7 @@ def _run_parser(
]
else:
message = f"""Concept with key `{datum.key}`
- does not have a time series from `{datum.dataframe_start}`
+ does not have a dataframe from `{datum.dataframe_start}`
until `{dataframe_end}` .
Concept will be omitted in graph.
"""
@@ -476,7 +476,7 @@ def _run_parser(
"""
warnings.warn(message, MappingMissmatchWarning)

- # set time series as pd dataframe
+ # set dataframe as pd dataframe
self._dataframe = pd.DataFrame.from_dict(
self._dataframe, orient="index"
).transpose()
16 changes: 8 additions & 8 deletions data2rdf/parsers/json.py
@@ -139,7 +139,7 @@ def json_ld(self) -> Dict[str, Any]:
This method generates the JSON-LD representation of the parser's data,
including the context, id, type, and members. The members are generated
- based on the general metadata and time series metadata.
+ based on the general metadata and dataframe metadata.
The method returns a dictionary containing the JSON-LD representation.
@@ -271,8 +271,8 @@ def _run_parser(
mapping: "List[ABoxBaseMapping]",
) -> None:
"""
- Class method for parsing metadata, time series metadata,
- and time series from a given data file and mapping.
+ Class method for parsing metadata, dataframe metadata,
+ and dataframe from a given data file and mapping.
Args:
self: An instance of JsonABoxParser.
@@ -359,8 +359,8 @@ def _run_parser(

# if we have a series and a unit and we are *not* expanding:
# * make a QuantityGraph with the unit
- # * add the graph to the time series metadata
- # * add the values of the series to the time series array
+ # * add the graph to the dataframe metadata
+ # * add the values of the series to the dataframe array
if (
isinstance(value, list)
and unit
@@ -388,8 +388,8 @@ def _run_parser(
self._general_metadata.append(model)
# if we have a series and *no* unit and we are *not* expanding:
# * make a PropertyGraph
- # * add the graph to the time series metadata
- # * add the values of the series to the time series array
+ # * add the graph to the dataframe metadata
+ # * add the values of the series to the dataframe array
elif (
isinstance(value, list)
and not unit
@@ -466,7 +466,7 @@ def _run_parser(
relation, subdataset, datum, suffix
)

- # set time series as pd dataframe
+ # set dataframe as pd dataframe
self._dataframe = pd.DataFrame.from_dict(
self._dataframe, orient="index"
).transpose()
4 changes: 2 additions & 2 deletions data2rdf/pipelines/main.py
@@ -243,7 +243,7 @@ def general_metadata(self) -> "List[BasicConceptMapping]":

@property
def dataframe_metadata(self) -> "List[BasicConceptMapping]":
"""Return list object with time series metadata"""
"""Return list object with dataframe metadata"""
if self.mode == PipelineMode.ABOX:
return self.parser.abox.dataframe_metadata
else:
@@ -253,7 +253,7 @@ def dataframe_metadata(self) -> "List[BasicConceptMapping]":

@property
def dataframe(self) -> "Dict[str, Any]":
"""Return time series"""
"""Return dataframe"""
if self.mode == PipelineMode.ABOX:
return self.parser.abox.dataframe
else:
2 changes: 1 addition & 1 deletion docs/config.md
@@ -11,7 +11,7 @@ The configuration of the package is crucial for the correct parsing and transfor
| prefix_name | str | Prefix used referencing the base_iri in the context of the graph | fileid | No |
| separator | str | Separator between base IRI and suffix | / | No |
| encoding | str | Encoding used while parsing | utf-8 | No |
- | data_download_uri | AnyUrl | General base iri for downloading the time series after uploading | https://www.example.org/download | No |
+ | data_download_uri | AnyUrl | General base iri for downloading the dataframe after uploading | https://www.example.org/download | No |
| graph_identifier | Optional[str, AnyUrl] | Identifier of the graph to be produced | None | No |
| namespace_placeholder | Union[str, AnyUrl] | Placeholder of the extra triples to be replaced with the base_iri during the pipeline run | http://abox-namespace-placeholder.org/ | No |
| remove_from_unit | List[str] | Characters which should be removed from the input value for the unit | ["[", "]", '"', " "] | No |
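
For context, the renamed setting documented in this table can be overridden like any other field of the `Config` class changed in `data2rdf/config.py` above. A minimal sketch, assuming the import path matches the file location and the remaining defaults are left as-is:

```python
# Sketch: pointing the download base IRI at a custom endpoint.
# Assumes `Config` is importable from data2rdf.config, as in the diff above.
from data2rdf.config import Config

config = Config(data_download_uri="https://my-server.org/download")
print(config.data_download_uri)
```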