
Commit

v2.4.1 (#78)
* add alias for dataframe fields for backwards compatibility and add pytests

* replace time series with dataframe in descriptions
MBueschelberger authored Jan 21, 2025
1 parent 3ec64eb commit ad576b3
Showing 32 changed files with 348 additions and 106 deletions.
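
The backwards-compatible aliases referenced in the commit message follow the pattern sketched below, using pydantic v2's `AliasChoices`. The model and test are illustrative only (not the repository's actual mapping classes or pytests), and the sketch places the choices on `validation_alias`, which is where pydantic v2 documents them; the diffs below pass them via `alias=`.

```python
# Sketch: accepting both the new and the legacy field name on input.
# `ExampleMapping` and the test are illustrative, not data2rdf code.
from typing import Optional

from pydantic import AliasChoices, BaseModel, ConfigDict, Field


class ExampleMapping(BaseModel):
    model_config = ConfigDict(populate_by_name=True)

    # New canonical name is `dataframe_sep`; the legacy `time_series_sep`
    # spelling is still accepted through the alias choices.
    dataframe_sep: Optional[str] = Field(
        None,
        description="Column separator of the dataframe header",
        validation_alias=AliasChoices("dataframe_sep", "time_series_sep"),
    )


def test_old_and_new_field_names_are_accepted():
    # Both spellings populate the same field, so existing mappings keep working.
    assert ExampleMapping(time_series_sep=";").dataframe_sep == ";"
    assert ExampleMapping(dataframe_sep=";").dataframe_sep == ";"
```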
2 changes: 1 addition & 1 deletion data2rdf/config.py
@@ -39,7 +39,7 @@ class Config(BaseSettings):

data_download_uri: Union[str, AnyUrl] = Field(
"https://www.example.org/download",
- description="General base iri for downloading the time series after uploading",
+ description="General base iri for downloading the dataframe after uploading",
)

graph_identifier: Optional[Union[str, AnyUrl]] = Field(
4 changes: 2 additions & 2 deletions data2rdf/models/graph.py
@@ -146,7 +146,7 @@ def validate_measurement_unit(cls, self) -> "MeasurementUnit":
class QuantityGraph(BasicGraphModel, BasicSuffixModel):
"""Quantity with or without a discrete value and a unit
E.g. a quantity with a single value and unit _or_
- a quantity describing a column of a time series or table with a unit."""
+ a quantity describing a column of a dataframe or table with a unit."""

unit: Optional[Union[str, AnyUrl]] = Field(
None, description="QUDT Symbol or any other IRI for the unit mapping"
@@ -273,7 +273,7 @@ class PropertyGraph(BasicGraphModel, BasicSuffixModel):
"""Mapping for an individual with arbitrary property. E.g. the
name of a tester or a testing facility. The value must not have a
discrete value but can also be a reference to a column in a table or
- time series."""
+ dataframe."""

value: Optional[
Union[str, int, float, bool, AnyUrl, "PropertyGraph", "QuantityGraph"]
3 changes: 2 additions & 1 deletion data2rdf/models/mapping.py
@@ -185,7 +185,8 @@ class ABoxExcelMapping(ABoxBaseMapping):

dataframe_start: Optional[str] = Field(
None,
- description="Cell location for the start of the time series quantity",
+ description="Cell location for the start of the dataframe quantity",
+ alias=AliasChoices("dataframe_start", "time_series_start"),
)
worksheet: Optional[str] = Field(
None,
6 changes: 3 additions & 3 deletions data2rdf/parsers/base.py
@@ -46,7 +46,7 @@ class BaseParser(BaseModel):

dropna: bool = Field(
False,
- description="Drop all rows where ONLY NaN and None occur in the time series.",
+ description="Drop all rows where ONLY NaN and None occur in the dataframe.",
)

config: Config = Field(
@@ -344,7 +344,7 @@ def general_metadata(self) -> "List[BasicConceptMapping]":

@property
def dataframe_metadata(self) -> "List[BasicConceptMapping]":
"""Return time series metadata"""
"""Return dataframe metadata"""
if self.mode == PipelineMode.ABOX:
return self.abox.dataframe_metadata
else:
@@ -354,7 +354,7 @@ def dataframe_metadata(self) -> "List[BasicConceptMapping]":

@property
def dataframe(self) -> "Dict[str, Any]":
"""Return time series"""
"""Return dataframe"""
if self.mode == PipelineMode.ABOX:
return self.abox.dataframe
else:
30 changes: 18 additions & 12 deletions data2rdf/parsers/csv.py
@@ -8,7 +8,7 @@

import numpy as np
import pandas as pd
- from pydantic import Field
+ from pydantic import AliasChoices, Field

from data2rdf.models.graph import PropertyGraph, QuantityGraph
from data2rdf.utils import make_prefix
@@ -120,10 +120,16 @@ class CSVABoxParser(ABoxBaseParser):
)
metadata_length: int = Field(..., description="Length of the metadata")
dataframe_sep: Optional[str] = Field(
- None, description="Column separator of the time series header"
+ None,
+ description="Column separator of the dataframe header",
+ alias=AliasChoices("dataframe_sep", "time_series_sep"),
)
dataframe_header_length: int = Field(
- 2, description="Length of header of the time series"
+ 2,
+ description="Length of header of the dataframe",
+ alias=AliasChoices(
+     "dataframe_header_length", "time_series_header_length"
+ ),
)
fillna: Optional[Any] = Field(
"", description="Value to fill NaN values in the parsed dataframe."
@@ -148,7 +154,7 @@ def json_ld(self) -> "Dict[str, Any]":
Returns a JSON-LD representation of the CSV data in ABox mode.
This method generates a JSON-LD object that describes the CSV data,
- including its metadata, time series data, and relationships between them.
+ including its metadata, dataframe data, and relationships between them.
The returned JSON-LD object is in the format of a csvw:TableGroup,
which contains one or more csvw:Table objects. Each csvw:Table object
@@ -212,7 +218,7 @@ def json_ld(self) -> "Dict[str, Any]":
tables += [
{
"@type": "csvw:Table",
"rdfs:label": "Time series data",
"rdfs:label": "Dataframe",
"csvw:tableSchema": column_schema,
}
]
@@ -302,7 +308,7 @@ def _run_parser(
mapping: "List[ABoxBaseMapping]",
) -> None:
"""
- This function is responsible for parsing metadata, time series metadata, and time series data from a CSV file.
+ This function is responsible for parsing metadata, dataframe metadata, and dataframe data from a CSV file.
It takes in three parameters:
- `self`: The CSVParser instance.
@@ -311,8 +317,8 @@ def _run_parser(
The function returns None, but it populates the following instance variables:
- `self._general_metadata`: A list of PropertyGraph or QuantityGraph instances representing the general metadata.
- - `self._dataframe_metadata`: A list of QuantityGraph instances representing the time series metadata.
- - `self._dataframe`: A pandas DataFrame containing the time series data.
+ - `self._dataframe_metadata`: A list of QuantityGraph instances representing the dataframe metadata.
+ - `self._dataframe`: A pandas DataFrame containing the dataframe data.
The function also raises ValueError if the `metadata_length` is greater than 0 but `metadata_sep` is not set.
It raises TypeError if the unit for a key is not a string.
@@ -398,7 +404,7 @@ def _run_parser(
MappingMissmatchWarning,
)

- # parse time series data and meta data
+ # parse dataframe data and meta data
self._dataframe_metadata = []
self._dataframe = {}

@@ -441,7 +447,7 @@ def _run_parser(
# append model
self.dataframe_metadata.append(model)

- # assign time series data
+ # assign dataframe data
self._dataframe[model.suffix] = dataframe[key][
self.dataframe_header_length - 1 :
].to_list()
@@ -451,7 +457,7 @@ def _run_parser(
f"No match found in mapping for key `{key}`",
MappingMissmatchWarning,
)
- # set time series as pd dataframe
+ # set dataframe as pd dataframe
self._dataframe = pd.DataFrame.from_dict(
self._dataframe, orient="index"
).transpose()
@@ -485,7 +491,7 @@ def _parse_dataframe(
]
else:
warnings.warn(
"`dataframe_sep` is not set. Any potential time series in the data file will be skipped.",
"`dataframe_sep` is not set. Any potential dataframe in the data file will be skipped.",
ParserWarning,
)
response = []
14 changes: 7 additions & 7 deletions data2rdf/parsers/excel.py
@@ -145,8 +145,8 @@ def json_ld(self) -> Dict[str, Any]:
"""
Returns the JSON-LD representation of the data in ABox mode.
- The JSON-LD is constructed based on the metadata and time series data.
- If the file description is not suppressed, it includes the metadata and time series data tables.
+ The JSON-LD is constructed based on the metadata and dataframe data.
+ If the file description is not suppressed, it includes the metadata and dataframe data tables.
Otherwise, it returns a list of JSON-LD representations of the individual models.
:return: A dictionary representing the JSON-LD data.
@@ -194,7 +194,7 @@ def json_ld(self) -> Dict[str, Any]:
tables += [
{
"@type": "csvw:Table",
"rdfs:label": "Time series data",
"rdfs:label": "Dataframe",
"csvw:tableSchema": column_schema,
}
]
@@ -284,7 +284,7 @@ def _run_parser(
mapping: "List[ABoxExcelMapping]",
) -> None:
"""
- Parses the metadata, time series metadata, and time series from an Excel file.
+ Parses the metadata, dataframe metadata, and dataframe from an Excel file.
Args:
self (ExcelABoxParser): The instance of the ExcelABoxParser class.
@@ -326,7 +326,7 @@ def _run_parser(
are set. Only one of them must be set."""
)

- # find data for time series
+ # find data for dataframe
if datum.dataframe_start:
column_name = datum.dataframe_start.rstrip("0123456789")
dataframe_end = f"{column_name}{worksheet.max_row}"
@@ -338,7 +338,7 @@ def _run_parser(
]
else:
message = f"""Concept with key `{datum.key}`
- does not have a time series from `{datum.dataframe_start}`
+ does not have a dataframe from `{datum.dataframe_start}`
until `{dataframe_end}` .
Concept will be omitted in graph.
"""
@@ -476,7 +476,7 @@ def _run_parser(
"""
warnings.warn(message, MappingMissmatchWarning)

- # set time series as pd dataframe
+ # set dataframe as pd dataframe
self._dataframe = pd.DataFrame.from_dict(
self._dataframe, orient="index"
).transpose()
16 changes: 8 additions & 8 deletions data2rdf/parsers/json.py
@@ -139,7 +139,7 @@ def json_ld(self) -> Dict[str, Any]:
This method generates the JSON-LD representation of the parser's data,
including the context, id, type, and members. The members are generated
- based on the general metadata and time series metadata.
+ based on the general metadata and dataframe metadata.
The method returns a dictionary containing the JSON-LD representation.
@@ -271,8 +271,8 @@ def _run_parser(
mapping: "List[ABoxBaseMapping]",
) -> None:
"""
- Class method for parsing metadata, time series metadata,
- and time series from a given data file and mapping.
+ Class method for parsing metadata, dataframe metadata,
+ and dataframe from a given data file and mapping.
Args:
self: An instance of JsonABoxParser.
@@ -359,8 +359,8 @@ def _run_parser(

# if we have a series and a unit and we are *not* expanding:
# * make a QuantityGraph with the unit
- # * add the graph to the time series metadata
- # * add the values of the series to the time series array
+ # * add the graph to the dataframe metadata
+ # * add the values of the series to the dataframe array
if (
isinstance(value, list)
and unit
@@ -388,8 +388,8 @@ def _run_parser(
self._general_metadata.append(model)
# if we have a series and *no* unit and we are *not* expanding:
# * make a PropertyGraph
- # * add the graph to the time series metadata
- # * add the values of the series to the time series array
+ # * add the graph to the dataframe metadata
+ # * add the values of the series to the dataframe array
elif (
isinstance(value, list)
and not unit
@@ -466,7 +466,7 @@ def _run_parser(
relation, subdataset, datum, suffix
)

- # set time series as pd dataframe
+ # set dataframe as pd dataframe
self._dataframe = pd.DataFrame.from_dict(
self._dataframe, orient="index"
).transpose()
4 changes: 2 additions & 2 deletions data2rdf/pipelines/main.py
@@ -243,7 +243,7 @@ def general_metadata(self) -> "List[BasicConceptMapping]":

@property
def dataframe_metadata(self) -> "List[BasicConceptMapping]":
"""Return list object with time series metadata"""
"""Return list object with dataframe metadata"""
if self.mode == PipelineMode.ABOX:
return self.parser.abox.dataframe_metadata
else:
@@ -253,7 +253,7 @@ def dataframe_metadata(self) -> "List[BasicConceptMapping]":

@property
def dataframe(self) -> "Dict[str, Any]":
"""Return time series"""
"""Return dataframe"""
if self.mode == PipelineMode.ABOX:
return self.parser.abox.dataframe
else:
2 changes: 1 addition & 1 deletion docs/config.md
@@ -11,7 +11,7 @@ The configuration of the package is crucial for the correct parsing and transfor
| prefix_name | str | Prefix used referencing the base_iri in the context of the graph | fileid | No |
| separator | str | Separator between base IRI and suffix | / | No |
| encoding | str | Encoding used while parsing | utf-8 | No |
- | data_download_uri | AnyUrl | General base iri for downloading the time series after uploading | https://www.example.org/download | No |
+ | data_download_uri | AnyUrl | General base iri for downloading the dataframe after uploading | https://www.example.org/download | No |
| graph_identifier | Optional[str, AnyUrl] | Identifier of the graph to be produced | None | No |
| namespace_placeholder | Union[str, AnyUrl] | Placeholder of the extra triples to be replaced with the base_iri during the pipeline run | http://abox-namespace-placeholder.org/ | No |
| remove_from_unit | List[str] | Characters which should be removed from the input value for the unit | ["[", "]", '"', " "] | No |
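
For context, the renamed setting documented in this table can be overridden like any other field of the `Config` class changed in `data2rdf/config.py` above. A minimal sketch, assuming the import path matches the file location and the remaining defaults are left as-is:

```python
# Sketch: pointing the download base IRI at a custom endpoint.
# Assumes `Config` is importable from data2rdf.config, as in the diff above.
from data2rdf.config import Config

config = Config(data_download_uri="https://my-server.org/download")
print(config.data_download_uri)
```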