diff --git a/data2rdf/config.py b/data2rdf/config.py
index 69f6a82c..f50cdef3 100755
--- a/data2rdf/config.py
+++ b/data2rdf/config.py
@@ -39,7 +39,7 @@ class Config(BaseSettings):

     data_download_uri: Union[str, AnyUrl] = Field(
         "https://www.example.org/download",
-        description="General base iri for downloading the time series after uploading",
+        description="General base iri for downloading the dataframe after uploading",
     )

     graph_identifier: Optional[Union[str, AnyUrl]] = Field(
diff --git a/data2rdf/models/graph.py b/data2rdf/models/graph.py
index 11f84e31..867fcbeb 100644
--- a/data2rdf/models/graph.py
+++ b/data2rdf/models/graph.py
@@ -146,7 +146,7 @@ def validate_measurement_unit(cls, self) -> "MeasurementUnit":
 class QuantityGraph(BasicGraphModel, BasicSuffixModel):
     """Quantity with or without a discrete value and a unit
     E.g. a quantity with a single value and unit _or_
-    a quantity describing a column of a time series or table with a unit."""
+    a quantity describing a column of a dataframe or table with a unit."""

     unit: Optional[Union[str, AnyUrl]] = Field(
         None, description="QUDT Symbol or any other IRI for the unit mapping"
@@ -273,7 +273,7 @@ class PropertyGraph(BasicGraphModel, BasicSuffixModel):
     """Mapping for an individual with arbitrary property.
     E.g. the name of a tester or a testing facility. The value
     does not need to be a discrete value but can also be a reference to a column in a table or
-    time series."""
+    dataframe."""

     value: Optional[
         Union[str, int, float, bool, AnyUrl, "PropertyGraph", "QuantityGraph"]
diff --git a/data2rdf/models/mapping.py b/data2rdf/models/mapping.py
index 878047c2..a2e32d96 100644
--- a/data2rdf/models/mapping.py
+++ b/data2rdf/models/mapping.py
@@ -185,7 +185,8 @@ class ABoxExcelMapping(ABoxBaseMapping):
     dataframe_start: Optional[str] = Field(
         None,
-        description="Cell location for the start of the time series quantity",
+        description="Cell location for the start of the dataframe quantity",
+        alias=AliasChoices("dataframe_start", "time_series_start"),
     )
     worksheet: Optional[str] = Field(
         None,
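The aliasing above is the backward-compatibility core of this rename. A minimal, self-contained sketch of the mechanism (model and field names are illustrative; the sketch uses `validation_alias`, which is pydantic v2's documented parameter for `AliasChoices`):

```
from typing import Optional

from pydantic import AliasChoices, BaseModel, Field


class ExampleMapping(BaseModel):
    # Both the new and the legacy key are accepted on input.
    dataframe_start: Optional[str] = Field(
        None,
        description="Cell location for the start of the dataframe quantity",
        validation_alias=AliasChoices("dataframe_start", "time_series_start"),
    )


# Old payloads keep validating after the rename:
legacy = ExampleMapping.model_validate({"time_series_start": "E15"})
renamed = ExampleMapping.model_validate({"dataframe_start": "E15"})
assert legacy.dataframe_start == renamed.dataframe_start == "E15"
```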
diff --git a/data2rdf/parsers/base.py b/data2rdf/parsers/base.py
index 4caebfc6..2818291c 100644
--- a/data2rdf/parsers/base.py
+++ b/data2rdf/parsers/base.py
@@ -46,7 +46,7 @@ class BaseParser(BaseModel):
     dropna: bool = Field(
         False,
-        description="Drop all rows where ONLY NaN and None occur in the time series.",
+        description="Drop all rows where ONLY NaN and None occur in the dataframe.",
     )

     config: Config = Field(
@@ -344,7 +344,7 @@ def general_metadata(self) -> "List[BasicConceptMapping]":

     @property
     def dataframe_metadata(self) -> "List[BasicConceptMapping]":
-        """Return time series metadata"""
+        """Return dataframe metadata"""
         if self.mode == PipelineMode.ABOX:
             return self.abox.dataframe_metadata
         else:
@@ -354,7 +354,7 @@ def dataframe_metadata(self) -> "List[BasicConceptMapping]":

     @property
     def dataframe(self) -> "Dict[str, Any]":
-        """Return time series"""
+        """Return dataframe"""
         if self.mode == PipelineMode.ABOX:
             return self.abox.dataframe
         else:
diff --git a/data2rdf/parsers/csv.py b/data2rdf/parsers/csv.py
index a4d041ad..168c9329 100644
--- a/data2rdf/parsers/csv.py
+++ b/data2rdf/parsers/csv.py
@@ -8,7 +8,7 @@

 import numpy as np
 import pandas as pd
-from pydantic import Field
+from pydantic import AliasChoices, Field

 from data2rdf.models.graph import PropertyGraph, QuantityGraph
 from data2rdf.utils import make_prefix
@@ -120,10 +120,16 @@ class CSVABoxParser(ABoxBaseParser):
     )
     metadata_length: int = Field(..., description="Length of the metadata")
     dataframe_sep: Optional[str] = Field(
-        None, description="Column separator of the time series header"
+        None,
+        description="Column separator of the dataframe header",
+        alias=AliasChoices("dataframe_sep", "time_series_sep"),
     )
     dataframe_header_length: int = Field(
-        2, description="Length of header of the time series"
+        2,
+        description="Length of header of the dataframe",
+        alias=AliasChoices(
+            "dataframe_header_length", "time_series_header_length"
+        ),
     )
     fillna: Optional[Any] = Field(
         "", description="Value to fill NaN values in the parsed dataframe."
@@ -148,7 +154,7 @@ def json_ld(self) -> "Dict[str, Any]":
         Returns a JSON-LD representation of the CSV data in ABox mode.

         This method generates a JSON-LD object that describes the CSV data,
-        including its metadata, time series data, and relationships between them.
+        including its metadata, dataframe data, and relationships between them.

         The returned JSON-LD object is in the format of a csvw:TableGroup,
         which contains one or more csvw:Table objects. Each csvw:Table object
@@ -212,7 +218,7 @@ def json_ld(self) -> "Dict[str, Any]":
             tables += [
                 {
                     "@type": "csvw:Table",
-                    "rdfs:label": "Time series data",
+                    "rdfs:label": "Dataframe data",
                     "csvw:tableSchema": column_schema,
                 }
             ]
@@ -302,7 +308,7 @@ def _run_parser(
         mapping: "List[ABoxBaseMapping]",
     ) -> None:
         """
-        This function is responsible for parsing metadata, time series metadata, and time series data from a CSV file.
+        This function is responsible for parsing metadata, dataframe metadata, and dataframe data from a CSV file.

         It takes in three parameters:
         - `self`: The CSVParser instance.
@@ -311,8 +317,8 @@ def _run_parser(

         The function returns None, but it populates the following instance variables:
         - `self._general_metadata`: A list of PropertyGraph or QuantityGraph instances representing the general metadata.
-        - `self._dataframe_metadata`: A list of QuantityGraph instances representing the time series metadata.
-        - `self._dataframe`: A pandas DataFrame containing the time series data.
+        - `self._dataframe_metadata`: A list of QuantityGraph instances representing the dataframe metadata.
+        - `self._dataframe`: A pandas DataFrame containing the dataframe data.

         The function also raises ValueError if the `metadata_length` is greater than 0 but `metadata_sep` is not set.
         It raises TypeError if the unit for a key is not a string.
@@ -398,7 +404,7 @@ def _run_parser(
                     MappingMissmatchWarning,
                 )

-        # parse time series data and meta data
+        # parse dataframe data and meta data
         self._dataframe_metadata = []
         self._dataframe = {}

@@ -441,7 +447,7 @@ def _run_parser(
                 # append model
                 self.dataframe_metadata.append(model)

-                # assign time series data
+                # assign dataframe data
                 self._dataframe[model.suffix] = dataframe[key][
                     self.dataframe_header_length - 1 :
                 ].to_list()
@@ -451,7 +457,7 @@ def _run_parser(
                     f"No match found in mapping for key `{key}`",
                     MappingMissmatchWarning,
                 )
-        # set time series as pd dataframe
+        # set dataframe as pd dataframe
         self._dataframe = pd.DataFrame.from_dict(
             self._dataframe, orient="index"
         ).transpose()
@@ -485,7 +491,7 @@ def _parse_dataframe(
             ]
         else:
             warnings.warn(
-                "`dataframe_sep` is not set. Any potential time series in the data file will be skipped.",
+                "`dataframe_sep` is not set. Any potential dataframe in the data file will be skipped.",
                 ParserWarning,
             )
             response = []
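To make the renamed CSV parser arguments concrete, here is a hedged, standalone pandas sketch of what `metadata_length` and `dataframe_header_length` control (the file content is made up; this is not the data2rdf implementation):

```
import io

import pandas as pd

raw = (
    '"Vorkraft"\t22\t"MPa"\n'   # metadata rows: key, value, unit
    '"Temperatur"\t22\t"°C"\n'
    '"Zeit"\t"Kraft"\n'         # dataframe header row 1: concept names
    '"s"\t"N"\n'                # dataframe header row 2: units
    "1\t10\n"
    "2\t20\n"
)

metadata_length = 2          # rows belonging to the key-value-unit block
dataframe_header_length = 2  # concept-name row plus unit row

metadata = pd.read_csv(
    io.StringIO(raw), sep="\t", nrows=metadata_length, header=None
)
dataframe = pd.read_csv(
    io.StringIO(raw),
    sep="\t",
    skiprows=metadata_length,
    header=list(range(dataframe_header_length)),
)
print(metadata)   # the key-value-unit block
print(dataframe)  # columns as (name, unit) pairs
```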
diff --git a/data2rdf/parsers/excel.py b/data2rdf/parsers/excel.py
index 65ecbd4d..61bbe614 100644
--- a/data2rdf/parsers/excel.py
+++ b/data2rdf/parsers/excel.py
@@ -145,8 +145,8 @@ def json_ld(self) -> Dict[str, Any]:
         """
         Returns the JSON-LD representation of the data in ABox mode.

-        The JSON-LD is constructed based on the metadata and time series data.
-        If the file description is not suppressed, it includes the metadata and time series data tables.
+        The JSON-LD is constructed based on the metadata and dataframe data.
+        If the file description is not suppressed, it includes the metadata and dataframe data tables.
         Otherwise, it returns a list of JSON-LD representations of the individual models.

         :return: A dictionary representing the JSON-LD data.
@@ -194,7 +194,7 @@ def json_ld(self) -> Dict[str, Any]:
             tables += [
                 {
                     "@type": "csvw:Table",
-                    "rdfs:label": "Time series data",
+                    "rdfs:label": "Dataframe data",
                     "csvw:tableSchema": column_schema,
                 }
             ]
@@ -284,7 +284,7 @@ def _run_parser(
         mapping: "List[ABoxExcelMapping]",
     ) -> None:
         """
-        Parses the metadata, time series metadata, and time series from an Excel file.
+        Parses the metadata, dataframe metadata, and dataframe from an Excel file.

         Args:
             self (ExcelABoxParser): The instance of the ExcelABoxParser class.
@@ -326,7 +326,7 @@ def _run_parser(
                     are set. Only one of them must be set."""
                 )

-            # find data for time series
+            # find data for dataframe
            if datum.dataframe_start:
                 column_name = datum.dataframe_start.rstrip("0123456789")
                 dataframe_end = f"{column_name}{worksheet.max_row}"
@@ -338,7 +338,7 @@ def _run_parser(
                     ]
                 else:
                     message = f"""Concept with key `{datum.key}`
-                        does not have a time series from `{datum.dataframe_start}`
+                        does not have a dataframe from `{datum.dataframe_start}`
                         until `{dataframe_end}` .
                         Concept will be omitted in graph.
                         """
@@ -476,7 +476,7 @@ def _run_parser(
                     """
                 warnings.warn(message, MappingMissmatchWarning)

-        # set time series as pd dataframe
+        # set dataframe as pd dataframe
         self._dataframe = pd.DataFrame.from_dict(
             self._dataframe, orient="index"
         ).transpose()
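The cell-range arithmetic in `_run_parser` above is compact; a hedged, standalone openpyxl sketch of the same idea (workbook, sheet, and start cell are made up for illustration):

```
from openpyxl import Workbook

wb = Workbook()
ws = wb.active
for i in range(5):
    ws[f"E{15 + i}"] = i + 1  # dummy column data starting at E15

dataframe_start = "E15"
column_name = dataframe_start.rstrip("0123456789")  # -> "E"
dataframe_end = f"{column_name}{ws.max_row}"        # -> "E19" here

column = [row[0].value for row in ws[f"{dataframe_start}:{dataframe_end}"]]
print(column)  # [1, 2, 3, 4, 5]
```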
diff --git a/data2rdf/parsers/json.py b/data2rdf/parsers/json.py
index 8e929d45..a34ac9d0 100644
--- a/data2rdf/parsers/json.py
+++ b/data2rdf/parsers/json.py
@@ -139,7 +139,7 @@ def json_ld(self) -> Dict[str, Any]:

         This method generates the JSON-LD representation of the parser's data,
         including the context, id, type, and members. The members are generated
-        based on the general metadata and time series metadata.
+        based on the general metadata and dataframe metadata.

         The method returns a dictionary containing the JSON-LD representation.

@@ -271,8 +271,8 @@ def _run_parser(
         mapping: "List[ABoxBaseMapping]",
     ) -> None:
         """
-        Class method for parsing metadata, time series metadata,
-        and time series from a given data file and mapping.
+        Class method for parsing metadata, dataframe metadata,
+        and dataframe from a given data file and mapping.

         Args:
             self: An instance of JsonABoxParser.
@@ -359,8 +359,8 @@ def _run_parser(

             # if we have a series and a unit and we are *not* expanding:
             # * make a QuantityGraph with the unit
-            # * add the graph to the time series metadata
-            # * add the values of the series to the time series array
+            # * add the graph to the dataframe metadata
+            # * add the values of the series to the dataframe array
             if (
                 isinstance(value, list)
                 and unit
@@ -388,8 +388,8 @@ def _run_parser(
                 self._general_metadata.append(model)
             # if we have a series and *no* unit and we are *not* expanding:
             # * make a PropertyGraph
-            # * add the graph to the time series metadata
-            # * add the values of the series to the time series array
+            # * add the graph to the dataframe metadata
+            # * add the values of the series to the dataframe array
             elif (
                 isinstance(value, list)
                 and not unit
@@ -466,7 +466,7 @@ def _run_parser(
                     relation, subdataset, datum, suffix
                 )

-        # set time series as pd dataframe
+        # set dataframe as pd dataframe
         self._dataframe = pd.DataFrame.from_dict(
             self._dataframe, orient="index"
         ).transpose()
diff --git a/data2rdf/pipelines/main.py b/data2rdf/pipelines/main.py
index 54df3489..3bda5191 100644
--- a/data2rdf/pipelines/main.py
+++ b/data2rdf/pipelines/main.py
@@ -243,7 +243,7 @@ def general_metadata(self) -> "List[BasicConceptMapping]":

     @property
     def dataframe_metadata(self) -> "List[BasicConceptMapping]":
-        """Return list object with time series metadata"""
+        """Return list object with dataframe metadata"""
         if self.mode == PipelineMode.ABOX:
             return self.parser.abox.dataframe_metadata
         else:
@@ -253,7 +253,7 @@ def dataframe_metadata(self) -> "List[BasicConceptMapping]":

     @property
     def dataframe(self) -> "Dict[str, Any]":
-        """Return time series"""
+        """Return dataframe"""
         if self.mode == PipelineMode.ABOX:
             return self.parser.abox.dataframe
         else:
diff --git a/docs/config.md b/docs/config.md
index f32c42a5..94fef524 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -11,7 +11,7 @@ The configuration of the package is crucial for the correct parsing and transfor
 | prefix_name | str | Prefix used referencing the base_iri in the context of the graph | fileid | No |
 | separator | str | Separator between base IRI and suffix | / | No |
 | encoding | str | Encoding used while parsing | utf-8 | No |
-| data_download_uri | AnyUrl | General base iri for downloading the time series after uploading | https://www.example.org/download | No |
+| data_download_uri | AnyUrl | General base iri for downloading the dataframe after uploading | https://www.example.org/download | No |
 | graph_identifier | Optional[str, AnyUrl] | Identifier of the graph to be produced | None | No |
 | namespace_placeholder | Union[str, AnyUrl] | Placeholder of the extra triples to be replaced with the base_iri during the pipeline run | http://abox-namespace-placeholder.org/ | No |
 | remove_from_unit | List[str] | Characters which should be removed from the input value for the unit | ["[", "]", '"', " "] | No |
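Since `data_download_uri` appears both in the code and in the docs table above, a short usage sketch may help; the file names are placeholders, and the keyword arguments follow the pipeline calls shown in the tests further below:

```
from data2rdf import Data2RDF, Parser

pipeline = Data2RDF(
    raw_data="data.csv",     # placeholder: path or raw file content
    mapping="mapping.json",  # placeholder mapping file
    parser=Parser.csv,
    parser_args={"metadata_length": 0, "dataframe_sep": ","},
    # column access URLs in the output graph are then built on this base,
    # e.g. https://www.example.org/download/dataset-123/column-2
    config={"data_download_uri": "https://www.example.org/download/dataset-123"},
)
```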
diff --git a/docs/examples/abox/1_csv.md b/docs/examples/abox/1_csv.md
index b758e93c..344714c2 100644
--- a/docs/examples/abox/1_csv.md
+++ b/docs/examples/abox/1_csv.md
@@ -1,4 +1,4 @@
-# CSV file with metadata and time series
+# CSV file with metadata and dataframe

 ```{note}
 Please follow [this link here](https://github.com/MI-FraunhoferIWM/data2rdf/blob/main/examples/1_csv.ipynb) in order to access the related jupyter notebook.
@@ -8,10 +8,10 @@ Please follow [this link here](https://github.com/MI-FraunhoferIWM/data2rdf/blob

 In this example, we want to transform a csv file which incorporates stress/strain measurements and some metadata about the experiment into an RDF representation.

-For this purpose, we are describing the **general metadata** of the experiment as well as the **metadata of the time series**.
+For this purpose, we are describing the **general metadata** of the experiment as well as the **metadata of the dataframe**.

 ```{note}
-We do not target to transform the time series itself into RDF, since it usually includes several thousands of datums per column. Hence we would give a reference in the form of an URI which is is pointing to the location of the file (e.g. a route in a web API or a local system path).
+We do not aim to transform the dataframe itself into RDF, since it usually includes several thousand datums per column. Hence we would give a reference in the form of a URI which is pointing to the location of the file (e.g. a route in a web API or a local system path).
 ```

 ## The inputs
@@ -29,19 +29,19 @@ The csv file produced by the tensile test machine looks like this:

 ![details](../../assets/img/docu/CSV-Parser.png)

-The original file can be accessed [here](https://github.com/MI-FraunhoferIWM/data2rdf/raw/bbde50919c50f3428eec179f94f29315f31165fe/tests/abox/csv_pipeline_test/input/data/DX56_D_FZ2_WR00_43.TXT). Due to clarify reasons, we truncated the time series in this document here.
+The original file can be accessed [here](https://github.com/MI-FraunhoferIWM/data2rdf/raw/bbde50919c50f3428eec179f94f29315f31165fe/tests/abox/csv_pipeline_test/input/data/DX56_D_FZ2_WR00_43.TXT). For clarity, we have truncated the dataframe in this document.

 ```{note}
-We are strictly assuming that metadata is on top of the time series and has a the key-value-unit pattern. Therefore the metadata up to now needs to have a width of 2 to 3 columns. In the future, we may support extending the default width of the metadata, in case if we need to have a width of 4 or more columns, e.g. if there are be more concepts than just value and unit.
+We are strictly assuming that the metadata is on top of the dataframe and has the key-value-unit pattern. Therefore the metadata up to now needs to have a width of 2 to 3 columns. In the future, we may support extending the default width of the metadata, in case we need to have a width of 4 or more columns, e.g. if there are more concepts than just value and unit.

 We generally assume that the **direction of the metadata is horizontally oriented**, which means that the first key in each row is the index (or primary key) of the metadata. **All of the values in this metadata shall be represented in an RDF graph**.

-Accordingly the **direction of the time series is vertically oriented**, which means that the first key in the header of each column will be the index (or primary key) of the time series. **In contrast to the metadata, we only want to describe the metadata of the time series in RDF and do not want to include each datum of each in the time series into the RDF**.
+Accordingly, the **direction of the dataframe is vertically oriented**, which means that the first key in the header of each column will be the index (or primary key) of the dataframe. **In contrast to the metadata, we only want to describe the metadata of the dataframe in RDF and do not want to include each individual datum of the dataframe into the RDF**.
 ```

 As you may see, the metadata of the file _has a length of 22 rows_. The metadata itself is tab-separated, has the name of a concept in the first column (e.g. `"Vorkraft"`=Preload), the value related to this metadatum in the second column (e.g. `"22"`) and optionally a unit in the third column (e.g. `"MPa"`).

-Subsequently, there is the time series with a header which has a length of rows: one with a the concept name (e.g. `"Prüfzeit"`=Test time) columns and one with the respective unit again (e.g. `"s"`).
+Subsequently, there is the dataframe with a header which has a length of two rows: one with the concept name (e.g. `"Prüfzeit"`=Test time) and one with the respective unit (e.g. `"s"`).

 ### The parser arguments

@@ -49,8 +49,8 @@ Since we are assuming to have a csv file, we can assume the following parser arg

 * `"metadata_sep"`: The separator of the metadata
   In this example, we assume that the metadata is tab-separated. Hence the argument is `"\t"`.
-* `"dataframe_sep"`: The separator of the time series
-  In this example, we assume that the time series is tab-separated. Hence the argument is `"\t"`.
+* `"dataframe_sep"`: The separator of the dataframe
+  In this example, we assume that the dataframe is tab-separated. Hence the argument is `"\t"`.
 * `"metadata_length"`: The length of the metadata
   In this example, we assume that the metadata has 22 rows. Hence the argument is `22`.
@@ -65,15 +65,15 @@ Since we are assuming to have a csv file, we can assume the following parser arg
   "Temperatur"	22	"°C"
   "Bemerkung"	""
 ```
-* `dataframe_header_length`: The length of the header of the time series.
-  In this example, we assume that the time series has 2 rows, which is the name of the concept in the first row and the corresponding unit in the second row:
+* `dataframe_header_length`: The length of the header of the dataframe.
+  In this example, we assume that the dataframe header has 2 rows, which is the name of the concept in the first row and the corresponding unit in the second row:
 ```
 "Standardweg"	"Breitenänderung"	"Dehnung"
 "s"	"N"	"mm"	"mm"	"mm"	"mm"
 Hence the argument is `2`
 ```
 * `"fillna"`: The value to fill NaN values in the parsed dataframe.
-  In this example, we assume that the NaN values in the dataframe are filled with `""`. Hence the argument is `""`. This is in particular of importance when the time series is parsed from the csv file. Since we are using pandas to parse the csv file, we need to make sure that gaps in the time series are filled with `""`, instead of the default `np.nan` values in the dataframe. If not applied here, this might lead to problems in the data2rdf pipeline.
+  In this example, we assume that the NaN values in the dataframe are filled with `""`. Hence the argument is `""`. This is of particular importance when the dataframe is parsed from the csv file. Since we are using pandas to parse the csv file, we need to make sure that gaps in the dataframe are filled with `""` instead of the default `np.nan` values (see the sketch below). If not applied here, this might lead to problems in the data2rdf pipeline.
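The following hedged pandas sketch (made-up column name and values) shows why the `fillna` default of `""` matters:

```
import numpy as np
import pandas as pd

df = pd.DataFrame({"Dehnung": [1.0, np.nan, 3.0]})

# Without filling, the gap stays a float NaN and propagates into the output:
print(df["Dehnung"].to_list())             # [1.0, nan, 3.0]

# With fillna(""), the gap becomes an empty string, matching `"fillna": ""`:
print(df.fillna("")["Dehnung"].to_list())  # [1.0, '', 3.0]
```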
 The corresponding parser args hence will look like this:

@@ -225,7 +225,7 @@ You may see that we simply need to deliver a list of dictionaries with the keys

 Let us take the following concept for the example: the key of the concept is `"Vorkraft"` and the IRI is `"https://w3id.org/steel/ProcessOntology/Preload"`, which has been defined in the introduced ontology above. Of course, you **may also choose any ontological IRI from any ontology** which matches your concept from the data file.

-Please also note that we are defining the mappings of the metadata and the time series in the same dictionary. The **units of the metadata** is assumed to be read from the **third column of the metadata-section** whereas the **unit of the time series columns** is assumed to be read from the **second row of the time series header**.
+Please also note that we are defining the mappings of the metadata and the dataframe in the same dictionary. The **units of the metadata** are assumed to be read from the **third column of the metadata section**, whereas the **units of the dataframe columns** are assumed to be read from the **second row of the dataframe header**.

 During the pipeline run, the units extracted from the metadata and timeseries will be mapped to ontological concepts of the [QUDT ontology](https://www.qudt.org/pages/QUDToverviewPage.html), describing a large set of SI units and off-system units.

@@ -527,8 +527,8 @@ The pipeline will deliver you the following outputs:

 * `graph`: the generated RDF graph
 * `plain_metadata`: the plain values of the metadata of the experiment
-* `dataframe`: the plain time series of the experiment
-* `dataframe_metadata`: the metadata of the time series
+* `dataframe`: the plain dataframe of the experiment
+* `dataframe_metadata`: the metadata of the dataframe
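A quick way to inspect these four outputs, assuming a `pipeline` object built as in this example (`serialize()` returning Turtle is rdflib's default):

```
print(pipeline.plain_metadata)      # plain metadata values
print(pipeline.dataframe)           # pandas DataFrame with the parsed columns
print(pipeline.dataframe_metadata)  # list of QuantityGraph/PropertyGraph models
print(pipeline.graph.serialize())   # the generated rdflib graph as Turtle
```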
 ### The RDF graph

@@ -711,7 +711,7 @@ fileid:tableGroup a csvw:TableGroup ;
             csvw:rownum 16 ;
             csvw:titles "Prüfgeschwindigkeit"^^xsd:string ] ],
         [ a csvw:Table ;
-            rdfs:label "Time series data" ;
+            rdfs:label "Dataframe data" ;
             csvw:tableSchema [ a csvw:Schema ;
                     csvw:column [ a csvw:Column ;
                             qudt:quantity fileid:Elongation ;
@@ -878,7 +878,7 @@ fileid:TensileTestSpecimen a prov:Agent,

 You can see that the graph now consists of several subgraphs:

 * the data graph
-  * graph describing the metadata of the experiment and the metadata of the time series.
+  * graph describing the metadata of the experiment and the metadata of the dataframe.
 * the graph describing the structure of the csv file
 * the additional triples
@@ -887,7 +887,7 @@ Please see the following sections for more details.

 #### Data graph

-The part of the graph describing the metadata of the experiment and the metadata of the time series may look like this:
+The part of the graph describing the metadata of the experiment and the metadata of the dataframe may look like this:

 ```
 [...]
@@ -939,7 +939,7 @@ fileid:tableGroup a csvw:TableGroup ;

         [ a csvw:Table ;
-            rdfs:label "Time series data" ;
+            rdfs:label "Dataframe data" ;
             csvw:tableSchema [ a csvw:Schema ;
                     csvw:column [ a csvw:Column ;
                             qudt:quantity fileid:AbsoluteCrossheadTravel ;
@@ -964,9 +964,9 @@ fileid:tableGroup a csvw:TableGroup ;

-You may note here that the two kinds of metadata are described through two tables groups in the output graph: one with an `rdfs:label "Metadata"` and one with an `rdfs:label "Time series data"`. Both table groups are either describing the rows (e.g. for `fileid:Material` and `fileid:OriginalGaugeLength`) or the columns (e.g. `fileid:AbsoluteCrossheadTravel` and `fileid:Extension`) of the respective metadata and are pointing to the definition of the individuals datum (see snippet above).
+You may note here that the two kinds of metadata are described through two table groups in the output graph: one with an `rdfs:label "Metadata"` and one with an `rdfs:label "Dataframe data"`. Both table groups are either describing the rows (e.g. for `fileid:Material` and `fileid:OriginalGaugeLength`) or the columns (e.g. `fileid:AbsoluteCrossheadTravel` and `fileid:Extension`) of the respective metadata and are pointing to the definition of the individual datums (see snippet above).

-Please note here that the concepts for the individuals `fileid:AbloluteCrossheadTravel` and `fileid:Extension` describing the time series data only make a reference to an access url described by `https://www.example.org/download/column-2` and `https://www.example.org/download/column-3`, which may be the routes in a web server or an access url in a database. The base of this access url is `https://www.example.org/download` and can be adjusted in the config of the pipeline by setting `config = {"data_download_uri": "https://www.example.org/download/dataset-123"}`.
+Please note here that the concepts for the individuals `fileid:AbsoluteCrossheadTravel` and `fileid:Extension` describing the dataframe data only make a reference to an access url described by `https://www.example.org/download/column-2` and `https://www.example.org/download/column-3`, which may be the routes in a web server or an access url in a database. The base of this access url is `https://www.example.org/download` and can be adjusted in the config of the pipeline by setting `config = {"data_download_uri": "https://www.example.org/download/dataset-123"}`.

 By setting `config = {"suppress_file_description": True}` this file description of the table groups will be neglected in the output graph.
@@ -1448,9 +1448,9 @@ You will notice that this `plain_metadata` is a shorthand for a code snippet lik

 print({obj.suffix: obj.value for obj in pipeline.general_metadata})
 ```

-### The time series metadata
+### The dataframe metadata

-In case of the need of further processing the time series metadata (header of the time series) resulting from the pipeline after parsing, the `dataframe_metadata` property can be accessed as follows:
+In case you need to further process the dataframe metadata (the header of the dataframe) resulting from the pipeline after parsing, the `dataframe_metadata` property can be accessed as follows:

 ```
 print(pipeline.dataframe_metadata)
@@ -1509,12 +1509,12 @@ The result should look like this:

     value_relation=qudt:value)]
 ```

-The result is a list of `QuantityGraph` which (or `PropertyGraph` in case of non-quantative columns) which result from the pipeline. Each object contains information about the time series metadata of a single quantity, such as the key, the unit and the value.
+The result is a list of `QuantityGraph` (or `PropertyGraph` in case of non-quantitative columns) objects which result from the pipeline. Each object contains information about the dataframe metadata of a single quantity, such as the key, the unit and the value.
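Analogous to the `plain_metadata` shorthand shown earlier, this list can be condensed into a plain column-to-unit dictionary; a small sketch (`PropertyGraph` entries carry no unit, hence the `getattr` guard):

```
print(
    {
        obj.suffix: getattr(obj, "unit", None)
        for obj in pipeline.dataframe_metadata
    }
)
```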
-### The time series data
+### The dataframe data

-In case of the need of further processing the time series data (tabular data) resulting from the pipeline after parsing, the `dataframe` property can be accessed as follows:
+In case you need to further process the dataframe data (tabular data) resulting from the pipeline after parsing, the `dataframe` property can be accessed as follows:

 ```
 print(pipeline.dataframe)
diff --git a/docs/examples/abox/2_excel.md b/docs/examples/abox/2_excel.md
index c2724815..6aeaf7aa 100644
--- a/docs/examples/abox/2_excel.md
+++ b/docs/examples/abox/2_excel.md
@@ -1,4 +1,4 @@
-# Excel file with metadata and time series
+# Excel file with metadata and dataframe

 ```{note}
 Please follow [this link here](https://github.com/MI-FraunhoferIWM/data2rdf/blob/main/examples/2_excel.ipynb) in order to access the related jupyter notebook.
@@ -6,7 +6,7 @@ Please follow [this link here](https://github.com/MI-FraunhoferIWM/data2rdf/blob

 ```{note}
-This example is building up on the previous one about the [CSV file with metadata and time series](1_csv.md).
+This example is building up on the previous one about the [CSV file with metadata and dataframe](1_csv.md).
 Please start from this chapter in order to fully understand the content of this example.
 ```

@@ -14,7 +14,7 @@ Please start from this chapter in order to fully understand the content of this

 In this example, we want to transform an excel file which incorporates stress/strain measurements and some metadata about the experiment into an RDF representation.

-For this purpose, we are describing the **general metadata** of the experiment as well as the **metadata of the time series**.
+For this purpose, we are describing the **general metadata** of the experiment as well as the **metadata of the dataframe**.

 ## The inputs
@@ -36,14 +36,14 @@ The excel file produced by the tensile test machine looks like this:

 ![details](../../assets/img/docu/excel_parser/excel2.JPG)

-Again, we are facing metadata of the experiment like e.g. `Projekt`, `Prüfer`, `Werkstoff`, etc. and time series with the quantities of `Zeit`, `F`, `B`, which need to be mapped to ontological concepts.
+Again, we are facing metadata of the experiment like e.g. `Projekt`, `Prüfer`, `Werkstoff`, etc. and a dataframe with the quantities `Zeit`, `F`, `B`, which need to be mapped to ontological concepts.

 The original file can be accessed [here](https://github.com/MI-FraunhoferIWM/data2rdf/raw/bbde50919c50f3428eec179f94f29315f31165fe/tests/abox/xls_pipeline_test/input/data/AFZ1-Fz-S1Q.xlsm).

 ### The mapping

-In contrast to the previous CSV example, we have to provide more information about the location of the data in the excel file. Previously, we simply had to provide the `key` of the concept in the data file. But since we are using `openpyxl`, we need to provide the `worksheet`, `value_location` (in case of metadata), `dataframe_start` (in case of time series) and `unit_location` (in case of quantitative data) for each concept in the excel file.
+In contrast to the previous CSV example, we have to provide more information about the location of the data in the excel file. Previously, we simply had to provide the `key` of the concept in the data file. But since we are using `openpyxl`, we need to provide the `worksheet`, `value_location` (in case of metadata), `dataframe_start` (in case of a dataframe column) and `unit_location` (in case of quantitative data) for each concept in the excel file.
 A valid mapping for the example file shown above may look like this:

@@ -193,7 +193,7 @@ Please note that a mapping for a metadatum looks like this:

 ...
 ```

-Whereas the mapping of a time series looks like this:
+Whereas the mapping of a dataframe looks like this:

 ```
 {
@@ -558,7 +558,7 @@ fileid:tableGroup a csvw:TableGroup ;
             csvw:describes fileid:ProjectNumber ;
             csvw:titles "Projekt"^^xsd:string ] ],
         [ a csvw:Table ;
-            rdfs:label "Time series data" ;
+            rdfs:label "Dataframe data" ;
             csvw:tableSchema [ a csvw:Schema ;
                     csvw:column [ a csvw:Column ;
                             qudt:quantity fileid:WidthChange ;
diff --git a/docs/examples/abox/3_json.md b/docs/examples/abox/3_json.md
index 4cf7100f..58050d22 100644
--- a/docs/examples/abox/3_json.md
+++ b/docs/examples/abox/3_json.md
@@ -1,4 +1,4 @@
-# JSON file or Python-dict with metadata and time series
+# JSON file or Python-dict with metadata and dataframe

 ```{note}
 Please follow [this link here](https://github.com/MI-FraunhoferIWM/data2rdf/blob/main/examples/3_json.ipynb) in order to access the related jupyter notebook.
@@ -6,7 +6,7 @@ Please follow [this link here](https://github.com/MI-FraunhoferIWM/data2rdf/blob

 ```{note}
-This example is building up on the very first one about the [CSV file with metadata and time series](1_csv.md).
+This example is building up on the very first one about the [CSV file with metadata and dataframe](1_csv.md).
 Please start from this chapter in order to fully understand the content of this example.
 ```

@@ -14,7 +14,7 @@ Please start from this chapter in order to fully understand the content of this

 Typically, data can also be provided in the serialization of a json file, which ultimately can be parsed into a dict object in Python.

-In this section, we would like to showcase how to write a mapping for a json file with metadata and time series. The content of the json can be nested into an arbitrary depth.
+In this section, we would like to showcase how to write a mapping for a json file with metadata and a dataframe. The content of the json can be nested to an arbitrary depth.

 ## The inputs
@@ -56,7 +56,7 @@ raw_data = {
 }
 ```

-As you may notice, concepts like `Breitenänderung` and `Dehnung` both are time series with slighly different key-patters: `Standardkraft` is a dictionary/object with an additional subelement called `array` for the values and `unit` for the unit, whereas the `Dehnung` key directly has a list of values and no reference for the unit.
+As you may notice, concepts like `Standardkraft` and `Dehnung` are both dataframe columns with slightly different key patterns: `Standardkraft` is a dictionary/object with an additional subelement called `array` for the values and `unit` for the unit, whereas the `Dehnung` key directly has a list of values and no reference for the unit.

 ### The mapping
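For reference, the two series shapes described above side by side (illustrative values only):

```
raw_data = {
    # object with an `array` subelement for the values and a `unit` subelement
    "Standardkraft": {"array": [1.0, 2.0, 3.0], "unit": "kN"},
    # bare list of values, with no unit reference in the data itself
    "Dehnung": [0.1, 0.2, 0.3],
}
```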
diff --git a/docs/examples/abox/4_csv_wo_metadata.md b/docs/examples/abox/4_csv_wo_metadata.md
index ff6dc1c5..880dac82 100644
--- a/docs/examples/abox/4_csv_wo_metadata.md
+++ b/docs/examples/abox/4_csv_wo_metadata.md
@@ -6,7 +6,7 @@ Please follow [this link here](https://github.com/MI-FraunhoferIWM/data2rdf/blob

 ## General understanding

-In this example, we are looking into dummy sensor data which is provided by a csv file. However, we do not have any metadata in this case, but directly start with the time series. Additionaly, each column in this time series is of the same ontological class, but was recorded by a different sensor.
+In this example, we are looking into dummy sensor data which is provided by a csv file. However, we do not have any metadata in this case, but directly start with the dataframe. Additionally, each column in this dataframe is of the same ontological class, but was recorded by a different sensor.

 ## The inputs
@@ -30,15 +30,15 @@ time,column_01,column_02,column_03
 7,7,7,7
 ```

-You may note that the first column is the time and the rest of the columns are of the same class. As already mentioned above, there is no metadata, but only time series in this case.
+You may note that the first column is the time and the rest of the columns are of the same class. As already mentioned above, there is no metadata, but only a dataframe in this case.

 ### The parser arguments

 Since we are considering the csv parser again, we need to take the following parser arguments into account:

-* `dataframe_sep`: the separator for the time series. In this case, it is a `,`.
+* `dataframe_sep`: the separator for the dataframe. In this case, it is a `,`.
 * `metadata_length`: the length of the metadata in the csv file. In this case, it is 0, since we do not have any metadata.
-* `dataframe_header_length`: the length of the header of the time series in the csv file. In this case, it is 1, since the time series start at the second row.
+* `dataframe_header_length`: the length of the header of the dataframe in the csv file. In this case, it is 1, since the dataframe starts at the second row.

 The resulting Python dictionary for the parser arguments would look like this:

@@ -168,7 +168,7 @@ fileid:TestTime a <https://w3id.org/steel/ProcessOntology/TestTime> .

 fileid:tableGroup a csvw:TableGroup ;
     csvw:table [ a csvw:Table ;
-            rdfs:label "Time series data" ;
+            rdfs:label "Dataframe data" ;
             csvw:tableSchema [ a csvw:Schema ;
                     csvw:column [ a csvw:Column ;
                             qudt:quantity fileid:Sensor2 ;
diff --git a/docs/examples/abox/5_csv_w_na.md b/docs/examples/abox/5_csv_w_na.md
index a84e406e..6de63fb7 100644
--- a/docs/examples/abox/5_csv_w_na.md
+++ b/docs/examples/abox/5_csv_w_na.md
@@ -6,7 +6,7 @@ Please follow [this link here](https://github.com/MI-FraunhoferIWM/data2rdf/blob

 ## General understanding

-In comparision to the [previous example of the csv file without metadata](1.4_csv_wo_metadata.md), we are using the similar data again, but now we have some missing values in the time series. Again, there will be no metadata in this case.
+In comparison to the [previous example of the csv file without metadata](4_csv_wo_metadata.md), we are using similar data again, but now we have some missing values in the dataframe. Again, there will be no metadata in this case.

 ## The inputs
@@ -14,7 +14,7 @@ For this example, we will consider the following inputs:

 * the csv file produced to be parsed
 * the mapping for describing the data in RDF
-* the parser arguments telling the pipeline that we do not have any metadata in the file and that we have missing values in the time series.
+* the parser arguments telling the pipeline that we do not have any metadata in the file and that we have missing values in the dataframe.

 ### The raw data
@@ -56,7 +56,7 @@ Temperature[°C];Coefficient of thermal exapansion[1/K];Specific heat[J/kgK];You
 1200;2.80E-05;1.34E+03;3.80E+09;0.494;2;
 ```

-As you may have noticed, this csv file here strictly speaking does not feature any time series, but a data frame of different samples with multiple properties like Young's modulus, specific heat capacity, etc.
+As you may have noticed, this csv file here strictly speaking does not feature any time-based series, but a dataframe of different samples with multiple properties like Young's modulus, specific heat capacity, etc.

 Since the data frame here is vertically oriented, we are considering not to transform the data values into RDF again.

 Additionally, there are some missing values, which are marked with `;;` in the csv file. These locations need to be properly handled in the pipeline, since we do not want to drop these rows while parsing.
@@ -65,9 +65,9 @@ According to the condition of the csv parser, we need to take the following parser arguments into account:

-* `dataframe_sep`: the separator for the time series. In this case, it is a `;`.
+* `dataframe_sep`: the separator for the dataframe. In this case, it is a `;`.
 * `metadata_length`: the length of the metadata in the csv file. In this case, it is 0, since we do not have any metadata.
-* `dataframe_header_length`: the length of the header of the time series in the csv file. In this case, it is 1, since the time series start at the second row.
+* `dataframe_header_length`: the length of the header of the dataframe in the csv file. In this case, it is 1, since the dataframe starts at the second row.
 * `dropna`: whether to drop the rows with missing values. In this case, it is `False`.

 The corresponding Python dict for the parser arguments would look like this:

@@ -269,7 +269,7 @@ fileid:ThermalExpansionCoefficient a <https://w3id.org/steel/ProcessOntology/ThermalExpansionCoefficient>
[...]
diff --git a/tests/abox/csv_pipeline_test/test_pipeline.py b/tests/abox/csv_pipeline_test/test_pipeline.py
[...]
         metadata
     )
     assert sort_entries(pipeline.to_dict()) == as_non_dsms_schema(metadata)
+
+
+def test_csv_pipeline_alias() -> None:
+    from rdflib import Graph
+
+    from data2rdf import (  # isort:skip
+        Data2RDF,
+        Parser,
+        PropertyGraph,
+        QuantityGraph,
+    )
+
+    with open(raw_data, encoding="utf-8") as file:
+        input_obj = file.read()
+
+    pipeline = Data2RDF(
+        raw_data=input_obj,
+        mapping=os.path.join(mapping_folder, "tensile_test_mapping.json"),
+        parser=Parser.csv,
+        parser_args={
+            "metadata_sep": "\t",
+            "time_series_sep": "\t",
+            "metadata_length": 20,
+            "time_series_header_length": 2,
+        },
+        additional_triples=template,
+    )
+
+    assert len(pipeline.general_metadata) == 20
+    for row in pipeline.general_metadata:
+        assert isinstance(row, QuantityGraph) or isinstance(row, PropertyGraph)
+
+    assert len(pipeline.dataframe_metadata) == 6
+    for row in pipeline.dataframe_metadata:
+        assert isinstance(row, QuantityGraph)
+
+    assert len(pipeline.dataframe.columns) == 6
+    assert sorted(list(pipeline.dataframe.columns)) == sorted(columns)
+    for name, column in pipeline.dataframe.items():
+        assert len(column) == 5734
+
+    expected_graph = Graph()
+    expected_graph.parse(expected)
+
+    assert pipeline.graph.isomorphic(expected_graph)
+
+    assert remove_ids(pipeline.to_dict(schema=dsms_schema)) == sort_entries(
+        metadata
+    )
+    assert sort_entries(pipeline.to_dict()) == as_non_dsms_schema(metadata)
diff --git a/tests/abox/csv_without_header/output/output_csv_parser.ttl b/tests/abox/csv_without_header/output/output_csv_parser.ttl
index 8684ab20..6246c0b6 100644
--- a/tests/abox/csv_without_header/output/output_csv_parser.ttl
+++ b/tests/abox/csv_without_header/output/output_csv_parser.ttl
@@ -8,7 +8,7 @@

 fileid:tableGroup a csvw:TableGroup ;
     csvw:table [ a csvw:Table ;
-            rdfs:label "Time series data" ;
+            rdfs:label "Dataframe data" ;
             csvw:tableSchema [ a csvw:Schema ;
                     csvw:column [ a csvw:Column ;
                             qudt:quantity fileid:Sensor1 ;
diff --git a/tests/abox/csv_without_header/output/output_csv_pipeline.ttl b/tests/abox/csv_without_header/output/output_csv_pipeline.ttl
index ede541fd..d24157ba 100644
--- a/tests/abox/csv_without_header/output/output_csv_pipeline.ttl
+++ b/tests/abox/csv_without_header/output/output_csv_pipeline.ttl
@@ -23,7 +23,7 @@ fileid:TestTime a <https://w3id.org/steel/ProcessOntology/TestTime> .

 fileid:tableGroup a csvw:TableGroup ;
     csvw:table [ a csvw:Table ;
-            rdfs:label "Time series data" ;
+            rdfs:label "Dataframe data" ;
             csvw:tableSchema [ a csvw:Schema ;
                     csvw:column [ a csvw:Column ;
                             qudt:quantity fileid:TestTime ;
diff --git a/tests/abox/xls_pipeline_test/input/mapping/tensile_test_mapping_alias.json b/tests/abox/xls_pipeline_test/input/mapping/tensile_test_mapping_alias.json
new file mode 100644
index 00000000..58e2fbf2
--- /dev/null
+++ b/tests/abox/xls_pipeline_test/input/mapping/tensile_test_mapping_alias.json
@@ -0,0 +1,126 @@
+[
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/Remark",
+        "key": "Bemerkungen",
+        "value_location": "UU31",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/WidthChange",
+        "key": "Breiten\u00e4nderung",
+        "time_series_start": "E15",
+        "unit_location": "E14",
+        "worksheet": "Messdaten"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/TimeStamp",
+        "key": "Datum",
+        "value_location": "AD6",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/PercentageElongation",
+        "key": "Dehnung",
+        "time_series_start": "Q15",
+        "unit": "\u00f7",
+        "worksheet": "Messdaten"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/OriginalGaugeLength",
+        "key": "Messl\u00e4nge Standardweg",
+        "unit_location": "P16",
+        "value_location": "M16",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/SpecimenWidth",
+        "key": "Probenbreite b",
+        "unit_location": "P15",
+        "value_location": "M15",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/SpecimenThickness",
+        "key": "Probendicke a",
+        "unit_location": "P14",
+        "value_location": "M14",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/SpecimenType",
+        "key": "Probenform",
+        "value_location": "AE7",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/SampleIdentifier-2",
+        "key": "Probenkennung 2",
+        "value_location": "U7",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/ProjectNumber",
+        "key": "Projekt",
+        "value_location": "F6",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/Tester",
+        "key": "Pr\u00fcfer",
+        "value_location": "U6",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/TestingRate",
+        "key": "Pr\u00fcfgeschwindigkeit",
+        "value_location": "J9",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/MachineData",
+        "key": "Pr\u00fcfmaschine",
+        "value_location": "I8",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/Temperature",
+        "key": "Pr\u00fcftemperatur",
+        "value_location": "U8",
+        "worksheet": "Protokoll"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/StandardForce",
+        "key": "Standardkraft",
+        "time_series_start": "C15",
+        "unit_location": "C14",
+        "worksheet": "Messdaten"
+    },
+    {
+        "iri": "https://w3id.org/steel/ProcessOntology/Extension",
+        "key": "Standardweg",
+        "time_series_start": "D15",
+        "unit_location": "D14",
+        "worksheet": "Messdaten"
+    },
"https://w3id.org/steel/ProcessOntology/AbsoluteCrossheadTravel", + "key": "Traversenweg absolut", + "time_series_start": "B15", + "unit_location": "B14", + "worksheet": "Messdaten" + }, + { + "annotation": "https://w3id.org/steel/ProcessOntology", + "iri": "https://w3id.org/steel/ProcessOntology/Material", + "key": "Werkstoff", + "value_location": "H7", + "worksheet": "Protokoll" + }, + { + "iri": "https://w3id.org/steel/ProcessOntology/TestTime", + "key": "Zeit", + "time_series_start": "A15", + "unit_location": "A14", + "worksheet": "Messdaten" + } +] diff --git a/tests/abox/xls_pipeline_test/output/output_excel_parser.ttl b/tests/abox/xls_pipeline_test/output/output_excel_parser.ttl index 87bb5ca3..8b36a019 100644 --- a/tests/abox/xls_pipeline_test/output/output_excel_parser.ttl +++ b/tests/abox/xls_pipeline_test/output/output_excel_parser.ttl @@ -8,7 +8,7 @@ fileid:tableGroup a csvw:TableGroup ; csvw:table [ a csvw:Table ; - rdfs:label "Time series data" ; + rdfs:label "Dataframe data" ; csvw:tableSchema [ a csvw:Schema ; csvw:column [ a csvw:Column ; qudt:quantity fileid:Extension ; diff --git a/tests/abox/xls_pipeline_test/output/output_pipeline.ttl b/tests/abox/xls_pipeline_test/output/output_pipeline.ttl index ac449523..d867b75b 100644 --- a/tests/abox/xls_pipeline_test/output/output_pipeline.ttl +++ b/tests/abox/xls_pipeline_test/output/output_pipeline.ttl @@ -90,7 +90,7 @@ fileid:dataset a dcat:Dataset, fileid:tableGroup a csvw:TableGroup ; csvw:table [ a csvw:Table ; - rdfs:label "Time series data" ; + rdfs:label "Dataframe data" ; csvw:tableSchema [ a csvw:Schema ; csvw:column [ a csvw:Column ; qudt:quantity fileid:WidthChange ; diff --git a/tests/abox/xls_pipeline_test/output/output_pipeline_suffix.ttl b/tests/abox/xls_pipeline_test/output/output_pipeline_suffix.ttl index a06d0295..ea1bc13e 100644 --- a/tests/abox/xls_pipeline_test/output/output_pipeline_suffix.ttl +++ b/tests/abox/xls_pipeline_test/output/output_pipeline_suffix.ttl @@ -113,7 +113,7 @@ fileid:tableGroup a csvw:TableGroup ; qudt:quantity fileid:TestingRate ; csvw:titles "Prüfgeschwindigkeit"^^xsd:string ] ], [ a csvw:Table ; - rdfs:label "Time series data" ; + rdfs:label "Dataframe data" ; csvw:tableSchema [ a csvw:Schema ; csvw:column [ a csvw:Column ; qudt:quantity fileid:StandardForce ; diff --git a/tests/abox/xls_pipeline_test/test_pipeline.py b/tests/abox/xls_pipeline_test/test_pipeline.py index 4fa94df8..e8df7dfd 100644 --- a/tests/abox/xls_pipeline_test/test_pipeline.py +++ b/tests/abox/xls_pipeline_test/test_pipeline.py @@ -500,3 +500,62 @@ def test_excel_pipeline_suffix() -> None: ) assert sorted(list(pipeline.dataframe.columns)) == sorted(columns_suffix) + + +def test_excel_pipeline_test_alias() -> None: + from rdflib import Graph + + from data2rdf.warnings import MappingMissmatchWarning + + from data2rdf import ( # isort:skip + Data2RDF, + Parser, + PropertyGraph, + QuantityGraph, + ) + + with open(raw_data, "rb") as file: + input_obj = file.read() + + with pytest.warns( + MappingMissmatchWarning, match="Concept with key" + ) as warnings: + pipeline = Data2RDF( + raw_data=input_obj, + mapping=os.path.join( + mapping_folder, "tensile_test_mapping_alias.json" + ), + parser=Parser.excel, + additional_triples=template, + parser_args={"dropna": True, "unit_from_macro": True}, + ) + + missmatches = [ + warning + for warning in warnings + if warning.category == MappingMissmatchWarning + ] + assert len(missmatches) == 1 + + assert len(pipeline.general_metadata) == 12 + for row in pipeline.general_metadata: + 
+        assert isinstance(row, QuantityGraph) or isinstance(row, PropertyGraph)
+
+    assert len(pipeline.dataframe_metadata) == 6
+    for row in pipeline.dataframe_metadata:
+        assert isinstance(row, QuantityGraph)
+
+    assert len(pipeline.dataframe.columns) == 6
+    assert sorted(list(pipeline.dataframe.columns)) == sorted(columns)
+    for name, column in pipeline.dataframe.items():
+        assert len(column) == 460
+
+    expected_graph = Graph()
+    expected_graph.parse(expected)
+
+    assert pipeline.graph.isomorphic(expected_graph)
+
+    assert remove_ids(pipeline.to_dict(schema=dsms_schema)) == sort_entries(
+        metadata
+    )
+    assert sort_entries(pipeline.to_dict()) == as_non_dsms_schema(metadata)