Skip to content

Commit

Permalink
rename time_series to dataframe
Browse files Browse the repository at this point in the history
  • Loading branch information
MBueschelberger committed Jan 17, 2025
1 parent 5a852c9 commit c9c513e
Show file tree
Hide file tree
Showing 34 changed files with 297 additions and 280 deletions.
2 changes: 1 addition & 1 deletion data2rdf/models/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def validate_model(cls, self: "ABoxBaseMapping") -> "ABoxBaseMapping":
class ABoxExcelMapping(ABoxBaseMapping):
"""A special model for mapping from excel files to semantic concepts in the ABox"""

time_series_start: Optional[str] = Field(
dataframe_start: Optional[str] = Field(
None,
description="Cell location for the start of the time series quantity",
)
Expand Down
24 changes: 12 additions & 12 deletions data2rdf/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,23 +171,23 @@ class ABoxBaseParser(AnyBoxBaseParser):
"""Basic Parser for ABox mode"""

_general_metadata: Any = PrivateAttr()
_time_series_metadata: Any = PrivateAttr()
_time_series: Any = PrivateAttr()
_dataframe_metadata: Any = PrivateAttr()
_dataframe: Any = PrivateAttr()

@property
def general_metadata(self) -> "List[BasicConceptMapping]":
"""Return list object with general metadata"""
return self._general_metadata

@property
def time_series_metadata(self) -> "List[BasicConceptMapping]":
def dataframe_metadata(self) -> "List[BasicConceptMapping]":
"""Return list object with time series metadata"""
return self._time_series_metadata
return self._dataframe_metadata

@property
def time_series(self) -> "pd.DataFrame":
def dataframe(self) -> "pd.DataFrame":
"""Return time series found in the data as pd.DataFrame"""
return self._time_series
return self._dataframe

@property
def plain_metadata(self) -> List[Dict[str, Any]]:
Expand Down Expand Up @@ -343,23 +343,23 @@ def general_metadata(cls) -> "List[BasicConceptMapping]":
)

@property
def time_series_metadata(cls) -> "List[BasicConceptMapping]":
def dataframe_metadata(cls) -> "List[BasicConceptMapping]":
"""Return time series metadata"""
if cls.mode == PipelineMode.ABOX:
return cls.abox.time_series_metadata
return cls.abox.dataframe_metadata
else:
raise NotImplementedError(
"`time_series_metadata` is not available in `tbox`-mode."
"`dataframe_metadata` is not available in `tbox`-mode."
)

@property
def time_series(cls) -> "Dict[str, Any]":
def dataframe(cls) -> "Dict[str, Any]":
"""Return time series"""
if cls.mode == PipelineMode.ABOX:
return cls.abox.time_series
return cls.abox.dataframe
else:
raise NotImplementedError(
"`time_series` is not available in `tbox`-mode."
"`dataframe` is not available in `tbox`-mode."
)

@property
Expand Down
48 changes: 24 additions & 24 deletions data2rdf/parsers/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,10 @@ class CSVABoxParser(ABoxBaseParser):
None, description="Metadata column separator"
)
metadata_length: int = Field(..., description="Length of the metadata")
time_series_sep: Optional[str] = Field(
dataframe_sep: Optional[str] = Field(
None, description="Column separator of the time series header"
)
time_series_header_length: int = Field(
dataframe_header_length: int = Field(
2, description="Length of header of the time series"
)
fillna: Optional[Any] = Field(
Expand Down Expand Up @@ -207,7 +207,7 @@ def json_ld(cls) -> "Dict[str, Any]":
)
tables += [meta_table]

if cls.time_series_metadata:
if cls.dataframe_metadata:
column_schema = {"@type": "csvw:Schema", "csvw:column": []}
tables += [
{
Expand All @@ -216,7 +216,7 @@ def json_ld(cls) -> "Dict[str, Any]":
"csvw:tableSchema": column_schema,
}
]
for idx, mapping in enumerate(cls.time_series_metadata):
for idx, mapping in enumerate(cls.dataframe_metadata):
if isinstance(mapping, QuantityGraph):
entity = {"qudt:quantity": mapping.json_ld}
elif isinstance(mapping, PropertyGraph):
Expand Down Expand Up @@ -289,7 +289,7 @@ def json_ld(cls) -> "Dict[str, Any]":
else:
json_ld = {
"@graph": [model.json_ld for model in cls.general_metadata]
+ [model.json_ld for model in cls.time_series_metadata]
+ [model.json_ld for model in cls.dataframe_metadata]
}
return json_ld

Expand All @@ -311,8 +311,8 @@ def _run_parser(
The function returns None, but it populates the following instance variables:
- `self._general_metadata`: A list of PropertyGraph or QuantityGraph instances representing the general metadata.
- `self._time_series_metadata`: A list of QuantityGraph instances representing the time series metadata.
- `self._time_series`: A pandas DataFrame containing the time series data.
- `self._dataframe_metadata`: A list of QuantityGraph instances representing the time series metadata.
- `self._dataframe`: A pandas DataFrame containing the time series data.
The function also raises ValueError if the `metadata_length` is greater than 0 but `metadata_sep` is not set.
It raises TypeError if the unit for a key is not a string.
Expand All @@ -327,11 +327,11 @@ def _run_parser(

mapping = {model.key: model for model in mapping}

time_series: Union[pd.DataFrame, List[None]] = cls._parse_time_series(
dataframe: Union[pd.DataFrame, List[None]] = cls._parse_dataframe(
self, datafile
)
if self.dropna:
time_series.dropna(inplace=True)
dataframe.dropna(inplace=True)
datafile.seek(0)

# iterate over general metadata
Expand Down Expand Up @@ -399,10 +399,10 @@ def _run_parser(
)

# parse time series data and meta data
self._time_series_metadata = []
self._time_series = {}
self._dataframe_metadata = []
self._dataframe = {}

for key in time_series:
for key in dataframe:
# get matching mapping
mapping_match = mapping.get(key)

Expand All @@ -411,8 +411,8 @@ def _run_parser(
unit = (
mapping_match.unit
or (
time_series[key].iloc[0]
if self.time_series_header_length == 2
dataframe[key].iloc[0]
if self.dataframe_header_length == 2
else None
)
or None
Expand All @@ -439,11 +439,11 @@ def _run_parser(
model.unit_relation = mapping_match.unit_relation

# append model
self.time_series_metadata.append(model)
self.dataframe_metadata.append(model)

# assign time series data
self._time_series[model.suffix] = time_series[key][
self.time_series_header_length - 1 :
self._dataframe[model.suffix] = dataframe[key][
self.dataframe_header_length - 1 :
].to_list()

else:
Expand All @@ -452,12 +452,12 @@ def _run_parser(
MappingMissmatchWarning,
)
# set time series as pd dataframe
self._time_series = pd.DataFrame.from_dict(
self._time_series, orient="index"
self._dataframe = pd.DataFrame.from_dict(
self._dataframe, orient="index"
).transpose()
# check if drop na:
if self.dropna:
self._time_series.dropna(how="all", inplace=True)
self._dataframe.dropna(how="all", inplace=True)

# OVERRIDE
@classmethod
Expand All @@ -466,14 +466,14 @@ def _load_data_file(cls, self: "CSVABoxParser") -> StringIO:
return _load_data_file(self)

@classmethod
def _parse_time_series(
def _parse_dataframe(
cls, self: "CSVParser", datafile: "StringIO"
) -> Union[pd.DataFrame, List[None]]:
if self.time_series_sep:
if self.dataframe_sep:
response = pd.read_csv(
datafile,
encoding=self.config.encoding,
sep=self.time_series_sep,
sep=self.dataframe_sep,
skiprows=self.metadata_length,
)
response = response.map(
Expand All @@ -485,7 +485,7 @@ def _parse_time_series(
]
else:
warnings.warn(
"`time_series_sep` is not set. Any potential time series in the data file will be skipped.",
"`dataframe_sep` is not set. Any potential time series in the data file will be skipped.",
ParserWarning,
)
response = []
Expand Down
42 changes: 20 additions & 22 deletions data2rdf/parsers/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def json_ld(cls) -> Dict[str, Any]:
)
tables += [meta_table]

if cls.time_series_metadata:
if cls.dataframe_metadata:
column_schema = {"@type": "csvw:Schema", "csvw:column": []}
tables += [
{
Expand All @@ -198,7 +198,7 @@ def json_ld(cls) -> Dict[str, Any]:
"csvw:tableSchema": column_schema,
}
]
for idx, mapping in enumerate(cls.time_series_metadata):
for idx, mapping in enumerate(cls.dataframe_metadata):
if isinstance(mapping, QuantityGraph):
entity = {"qudt:quantity": mapping.json_ld}
elif isinstance(mapping, PropertyGraph):
Expand Down Expand Up @@ -271,7 +271,7 @@ def json_ld(cls) -> Dict[str, Any]:
else:
json_ld = {
"@graph": [model.json_ld for model in cls.general_metadata]
+ [model.json_ld for model in cls.time_series_metadata]
+ [model.json_ld for model in cls.dataframe_metadata]
}
return json_ld

Expand Down Expand Up @@ -301,8 +301,8 @@ def _run_parser(
datafile.seek(0)

self._general_metadata = []
self._time_series_metadata = []
self._time_series = {}
self._dataframe_metadata = []
self._dataframe = {}

for datum in mapping:
worksheet = workbook[datum.worksheet]
Expand All @@ -320,28 +320,26 @@ def _run_parser(
suffix = quote(suffix)

if not datum.custom_relations:
if datum.value_location and datum.time_series_start:
if datum.value_location and datum.dataframe_start:
raise RuntimeError(
"""Both, `value_location` and `time_series_start
"""Both, `value_location` and `dataframe_start
are set. Only one of them must be set."""
)

# find data for time series
if datum.time_series_start:
column_name = datum.time_series_start.rstrip("0123456789")
time_series_end = f"{column_name}{worksheet.max_row}"
if datum.dataframe_start:
column_name = datum.dataframe_start.rstrip("0123456789")
dataframe_end = f"{column_name}{worksheet.max_row}"

column = worksheet[
datum.time_series_start : time_series_end
]
column = worksheet[datum.dataframe_start : dataframe_end]
if column:
self._time_series[suffix] = [
self._dataframe[suffix] = [
cell[0].value for cell in column
]
else:
message = f"""Concept with key `{datum.key}`
does not have a time series from `{datum.time_series_start}`
until `{time_series_end}` .
does not have a time series from `{datum.dataframe_start}`
until `{dataframe_end}` .
Concept will be omitted in graph.
"""
warnings.warn(message, MappingMissmatchWarning)
Expand Down Expand Up @@ -391,7 +389,7 @@ def _run_parser(
"config": self.config,
}

if datum.value_location and not datum.time_series_start:
if datum.value_location and not datum.dataframe_start:
value = worksheet[datum.value_location].value

if model_data.get("unit") and _value_exists(value):
Expand All @@ -409,7 +407,7 @@ def _run_parser(

value_exists = _value_exists(value)

if value_exists or suffix in self.time_series:
if value_exists or suffix in self.dataframe:
if datum.value_relation:
model_data["value_relation"] = datum.value_relation
if model_data.get("unit"):
Expand All @@ -426,7 +424,7 @@ def _run_parser(
if value_exists:
self._general_metadata.append(model)
else:
self._time_series_metadata.append(model)
self._dataframe_metadata.append(model)

else:
for relation in datum.custom_relations:
Expand Down Expand Up @@ -479,12 +477,12 @@ def _run_parser(
warnings.warn(message, MappingMissmatchWarning)

# set time series as pd dataframe
self._time_series = pd.DataFrame.from_dict(
self._time_series, orient="index"
self._dataframe = pd.DataFrame.from_dict(
self._dataframe, orient="index"
).transpose()
# check if drop na:
if self.dropna:
self._time_series.dropna(how="all", inplace=True)
self._dataframe.dropna(how="all", inplace=True)

# OVERRIDE
@classmethod
Expand Down
Loading

0 comments on commit c9c513e

Please sign in to comment.