diff --git a/egsim/smtk/flatfile/__init__.py b/egsim/smtk/flatfile/__init__.py
index 0c5b777f..b29907d1 100644
--- a/egsim/smtk/flatfile/__init__.py
+++ b/egsim/smtk/flatfile/__init__.py
@@ -14,8 +14,9 @@
 from openquake.hazardlib.contexts import RuptureContext
 
 from .columns import (ColumnDtype, get_rupture_param_columns,
-                      get_dtypes_and_defaults, get_column_names,
-                      get_intensity_measure_columns)
+                      get_dtypes_and_defaults, get_all_names_of,
+                      get_intensity_measure_columns, MissingColumn,
+                      InvalidDataInColumn, InvalidColumnName, ConflictingColumns)
 from .. import get_SA_period
 from ...smtk.trellis.configure import vs30_to_z1pt0_cy14, vs30_to_z2pt5_cb14
 
@@ -246,7 +247,7 @@ def read_csv(filepath_or_buffer: Union[str, IO],
             invalid_columns.append(col)
 
     if invalid_columns:
-        raise ValueError(f'Invalid values in column(s): {", ".join(invalid_columns)}')
+        raise InvalidDataInColumn(*invalid_columns)
 
     # set defaults:
     invalid_defaults = []
@@ -260,6 +261,8 @@
             pass
         invalid_defaults.append(col)
 
+    if not isinstance(dfr.index, pd.RangeIndex):
+        dfr.reset_index(drop=True, inplace=True)
     return dfr
 
 
@@ -362,7 +365,7 @@ def get_column_name(flatfile:pd.DataFrame, column:str) -> Union[str, None]:
     Returns None if no column is found, raise `ConflictingColumns` if more than
     a matching column is found"""
     ff_cols = set(flatfile.columns)
-    cols = get_column_names(column) & ff_cols
+    cols = get_all_names_of(column) & ff_cols
     if len(cols) > 1:
         raise ConflictingColumns(*cols)
     elif len(cols) == 0:
@@ -396,30 +399,44 @@ def get_station_id_column_names(flatfile: pd.DataFrame) -> list[str, ...]:
 
 
 def prepare_for_residuals(flatfile: pd.DataFrame,
-                          gsims: Iterable[GMPE], imts: Iterable[str]) -> pd.Dataframe:
+                          gsims: Iterable[GMPE], imts: Iterable[str]) -> pd.DataFrame:
     """Return a new dataframe with all columns required to compute residuals
     from the given models (`gsim`) and intensity measures (`imts`) given with
     periods, when needed (e.g. "SA(0.2)")
"SA(0.2)") """ - new_flatfile = pd.DataFrame(index=flatfile.index) + new_dataframes = [] # prepare the flatfile for the required ground motion properties: + props_flatfile = pd.DataFrame(index=flatfile.index) for prop in get_required_ground_motion_properties(gsims): - new_flatfile[prop] = \ + props_flatfile[prop] = \ get_ground_motion_property_values(flatfile, prop) + if not props_flatfile.empty: + new_dataframes.append(props_flatfile) # validate imts: imts = set(imts) - non_sa = {_ for _ in imts if not get_SA_period(_) is None} + non_sa_imts = {_ for _ in imts if get_SA_period(_) is None} # get supported imts but does not allow 'SA' alone to be valid: - supported_imts = get_intensity_measure_columns() - {'SA'} - if non_sa - supported_imts: - raise InvalidColumn(*{non_sa - supported_imts}) + if non_sa_imts: + supported_imts = get_intensity_measure_columns() - {'SA'} + if non_sa_imts - supported_imts: + raise InvalidColumnName(*list(non_sa_imts - supported_imts)) + # raise if some imts are not in the flatfile: + if non_sa_imts - set(flatfile.columns): + raise MissingColumn(*list(non_sa_imts - set(flatfile.columns))) + # add non SA imts: + new_dataframes.append(flatfile[sorted(non_sa_imts)]) # prepare the flatfile for SA (create new columns by interpolation if necessary): - sa = imts - non_sa - sa_dataframe = _prepare_for_sa(flatfile, sa) - if not sa_dataframe.empty: - new_flatfile[list(sa_dataframe.columns)] = sa_dataframe + sa_imts = imts - non_sa_imts + if sa_imts: + sa_dataframe = _prepare_for_sa(flatfile, sa_imts) + if not sa_dataframe.empty: + new_dataframes.append(sa_dataframe) + + if not new_dataframes: + return pd.DataFrame(columns=flatfile.columns) # empty dataframe + + return pd.concat(new_dataframes, axis=1) - return new_flatfile def get_required_ground_motion_properties(gsims: Iterable[GMPE]) -> set[str]: @@ -528,10 +545,12 @@ def fill_na(flatfile:pd.DataFrame, def _prepare_for_sa(flatfile: pd.DataFrame, sa_imts: Iterable[str]) -> pd.DataFrame: - """Modify inplace the flatfile assuring the SA columns in `sa_imts` (e.g. "SA(0.2)") - are present. The SA column of the flatfile will be used to obtain - the target SA via interpolation, and removed if not necessary. + """Return a new Dataframe with the SA columns defined in `sa_imts` + The returned DataFrame will have all strings supplied in `sa_imts` as columns, + with relative values copied (or inferred via interpolation) from the given flatfile + :param flatfile: the flatfile + :param sa_imts: Iterable of strings denoting SA (e.g. 
"SA(0.2)") Return the newly created Sa columns, as tuple of strings """ src_sa = [] @@ -552,16 +571,18 @@ def _prepare_for_sa(flatfile: pd.DataFrame, sa_imts: Iterable[str]) -> pd.DataFr if p not in source_sa: tgt_sa.append((p, i)) if invalid_sa: - raise InvalidColumn(*invalid_sa) + raise InvalidDataInColumn(*invalid_sa) # source_sa: period [float] -> mapped to the relative column: target_sa: dict[float, str] = {p: c for p, c in sorted(tgt_sa, key=lambda t: t[0])} - if not source_sa or not target_sa: - return pd.DataFrame(index=flatfile.index, data=[]) + source_sa_flatfile = flatfile[list(source_sa.values())] + + if not target_sa: + return source_sa_flatfile # Take the log10 of all SA: - source_spectrum = np.log10(flatfile[list(source_sa.values())]) + source_spectrum = np.log10(source_sa_flatfile) # we need to interpolate row wise # build the interpolation function: interp = interp1d(list(source_sa), source_spectrum, axis=1) @@ -618,46 +639,6 @@ def __getattr__(self, column_name): return values -class InvalidColumn(Exception): - """ - General flatfile column(s) error. See subclasses for details - """ - def __init__(self, *names, sep=', '): - super().__init__(*names) - self._sep = sep - - def __str__(self): - """Make str(self) more clear""" - prefix = self.__class__.__name__ - # replace upper cases with space + lower case letter - prefix = re.sub("([A-Z])", " \\1", prefix).strip().capitalize() - names = self.args - suffix = self._sep.join(repr(_) for _ in names) - return f"{prefix}{'s' if len(names) > 1 else ''} {suffix}" - - def __repr__(self): - return self.__str__() - - -class MissingColumn(InvalidColumn, AttributeError, KeyError): - """MissingColumnError. It inherits also from AttributeError and - KeyError to be compliant with pandas and OpenQuake""" - - def __init__(self, name): - sorted_names = get_column_names(name, sort=True) - suffix_str = repr(sorted_names[0] or name) - if len(sorted_names) > 1: - suffix_str += " (alias" if len(sorted_names) == 2 else " (aliases" - suffix_str += f": {', '.join(repr(_) for _ in sorted_names[1:])})" - super().__init__(suffix_str) - - -class ConflictingColumns(InvalidColumn): - - def __init__(self, *names): - InvalidColumn.__init__(self, *names, sep=" vs. ") - - # FIXME REMOVE LEGACY STUFF CHECK WITH GW: # FIXME: remove columns checks will be done when reading the flatfile and diff --git a/egsim/smtk/flatfile/columns.py b/egsim/smtk/flatfile/columns.py index 8ca18e43..35f05d89 100644 --- a/egsim/smtk/flatfile/columns.py +++ b/egsim/smtk/flatfile/columns.py @@ -2,19 +2,20 @@ module containing all column metadata information stored in the associated YAML file """ +import re from datetime import datetime, date from enum import Enum from os.path import join, dirname -from typing import Union, Any, Iterable +from typing import Union, Any # try to speed up yaml.safe_load (https://pyyaml.org/wiki/PyYAMLDocumentation): from yaml import load as yaml_load try: - from yaml import CSafeLoader as default_yaml_loader # faster, if available + from yaml import CSafeLoader as SafeLoader # faster, if available except ImportError: - from yaml import SafeLoader as default_yaml_loader # same as using yaml.safe_load + from yaml import SafeLoader # same as using yaml.safe_load import numpy as np import pandas as pd @@ -70,21 +71,15 @@ def get_intensity_measure_columns() -> set[str]: _alias: dict[str, set[str]] = None # noqa -def get_column_names(column, sort=False) -> Union[set[str], list[str]]: - """Return all possible names of the given column, as set of strings. 
-    True, a list is returned where the first element is the primary column name (one of
-    the top-level keys defined in the YAML dict)
+def get_all_names_of(column) -> set[str]:
+    """Return all possible names of the given column, as set of strings. The set
+    will be empty if `column` does not denote a flatfile column
     """
     global _alias
     if _alias is None:
         _alias = {}
         _extract_from_columns(load_from_yaml(), alias=_alias)
-    names = _alias.get(column, set())
-    if not sort:
-        return names
-    else:
-        return [n for n in names if n in _columns] + \
-               [n for n in names if n in _columns]
+    return _alias.get(column, set())
 
 
 def get_dtypes_and_defaults() -> \
@@ -98,6 +93,81 @@ def get_dtypes_and_defaults() -> \
     return _dtype, _default
 
 
+class InvalidColumn(Exception):
+    """
+    General flatfile column(s) error. See subclasses for details
+    """
+    def __init__(self, *names, sep=', ', plural_suffix='s'):
+        super().__init__(*names)
+        self._sep = sep
+        self._plural_suffix = plural_suffix
+
+    @property
+    def names(self):
+        """Return the names (usually column names) that raised this Exception,
+        as passed in `__init__`"""
+        return [repr(_) for _ in self.args]
+
+    def __str__(self):
+        """Make str(self) clearer"""
+        # get prefix (e.g. 'Missing column(s)'):
+        prefix = self.__class__.__name__
+        # replace upper cases with space + lower case letter
+        prefix = re.sub("([A-Z])", " \\1", prefix).strip().capitalize()
+        names = self.names
+        if len(names) != 1:
+            prefix += self._plural_suffix
+        # return full string:
+        return f"{prefix} {self._sep.join(names)}"
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class MissingColumn(InvalidColumn, AttributeError, KeyError):
+    """MissingColumnError. It inherits also from AttributeError and
+    KeyError to be compliant with pandas and OpenQuake"""
+
+    @property
+    def names(self):
+        """Return the names with their alias(es), if any"""
+        _names = []
+        for name in self.args:
+            sorted_names = self.get_all_names_of(name)
+            suffix_str = repr(sorted_names[0])
+            if len(sorted_names) > 1:
+                suffix_str += f" (or {', '.join(repr(_) for _ in sorted_names[1:])})"
+            _names.append(suffix_str)
+        return _names
+
+    @classmethod
+    def get_all_names_of(cls, col_name) -> list[str]:
+        """Return a list of all column names of the argument, with the first element
+        being the flatfile primary name. Return `[col_name]` if the argument does not
+        denote any flatfile column"""
+        names = get_all_names_of(col_name)
+        if len(names) <= 1:
+            return [col_name]
+        global _columns  # not strictly needed (read-only access), just a reminder
+        return [n for n in names if n in _columns] + \
+               [n for n in names if n not in _columns]
+
+
+class ConflictingColumns(InvalidColumn):
+
+    def __init__(self, name1, name2, *other_names):
+        InvalidColumn.__init__(self, name1, name2, *other_names,
+                               sep=" vs. ", plural_suffix='')
", plural_suffix='') + + +class InvalidDataInColumn(InvalidColumn, ValueError, TypeError): + pass + + +class InvalidColumnName(InvalidColumn): + pass + + # YAML file path: _ff_metadata_path = join(dirname(__file__), 'columns.yaml') # cache storage of the data in the YAML: @@ -116,7 +186,7 @@ def load_from_yaml(cache=True) -> dict[str, dict[str, Any]]: if cache and _columns: return _columns with open(_ff_metadata_path) as fpt: - _cols = yaml_load(fpt, default_yaml_loader) + _cols = yaml_load(fpt, SafeLoader) if cache: _columns = _cols return _cols diff --git a/egsim/smtk/flatfile/columns.yaml b/egsim/smtk/flatfile/columns.yaml index b93f6a12..18ea637d 100644 --- a/egsim/smtk/flatfile/columns.yaml +++ b/egsim/smtk/flatfile/columns.yaml @@ -6,21 +6,22 @@ # possible values of the column data (which must be all the same dtype). # null or missing means "not specified" and will skip data type checks. # The dtype determines also how to fill missing data (See `default` below -# for details), whereas data that is not parsable -# to the given dtype (e.g.: 'x' for float) will always raise Exceptions +# for details), whereas data that is not parsable to the given dtype +# (e.g.: 'x' for float) will always raise Exceptions # default: The default value to replace missing data (e.g. '', null, NaN). When a # default is not provided, it will be `dtype` dependent: NaN (for dtype # float), NaT (datetime), None/null (str or categorical), 0 (int), False # (bool). As such, please note that int and bool do not really support # missing data, as 0 and False also denote valid values for the dtype # alias: The column alias(es), as string or list of strings. You can put here -# any additional name, e.g., depending on the column type (see below) the -# name used in OpenQuake, if different. Columns of type i (intensity -# measure) must be typed exactly as in OpenQuake and cannot have aliases -# type: The column type (optional). Type either rupture (rupture parameter), sites -# (sites parameter), intensity (intensity measure) or distance (distance -# measure). If a type is given, the column name or any of its aliases -# must match the parameter / measure name implemented in OpenQuake +# any additional name, e.g., the name of the ground motion property defined +# in OpenQuake that corresponds to this column, if the names are different. +# Note that intensity measure columns (see `type` below) must be spelled +# exactly as in OpenQuake and cannot have aliases +# type: The column type (optional): rupture, sites, distance, intensity (rupture +# parameter, sites parameter, distance measure, intensity measure, +# respectively). If a type is given, the column name or any of its aliases +# must match the relative OpenQuake name # help: The field description, used for help. Depending on the application, a # default might be provided (e.g. "") # ">" (with quotation marks because > and < are special characters in YAML) @@ -262,60 +263,3 @@ ev_time: alias: event_time dtype: datetime help: the event time (as ISO formatted string, e.g. 2006-03-31T00:12:24) - - -# FIXME: REMOVE: - -# Here the collection of the parameters required by all OpenQuake models, mapped to their -# corresponding flatfile column. With this information programs can tell, for any given -# flatfile, which Ground motion models can be used with it. **As such, a parameter with -# no mapping (or mapped to null) will programmatically discard all models requiring it**. 
-
-#openquake_models_parameters:
-#  distance_measure:  # OpenQuake model attribute: REQUIRES_DISTANCES
-#    azimuth: azimuth
-#    repi: repi
-#    rrup: rrup
-#    rjb: rjb
-#    rhypo: rhypo
-#    rx: rx
-#    ry0: ry0
-#    rvolc: rvolc
-#    rcdpp:
-#  rupture_parameter:  # OpenQuake model attribute: REQUIRES_RUPTURE_PARAMETERS
-#    hypo_lat: event_latitude
-#    hypo_lon: event_longitude
-#    hypo_depth: event_depth
-#    mag: magnitude
-#    ztor: depth_top_of_rupture
-#    width: rupture_width
-#    dip: dip
-#    rake: rake
-#    in_cshm:
-#  site_parameter:  # OpenQuake model attribute: REQUIRES_SITES_PARAMETERS
-#    z1pt0: z1
-#    z2pt5: z2pt5
-#    vs30: vs30
-#    backarc: backarc
-#    lat: station_latitude
-#    lon: station_longitude
-#    vs30measured: vs30measured
-#    xvf: xvf
-#    fpeak: fpeak
-#    region: region
-#    geology: geology
-#    slope: slope
-#    f0:
-#    z1pt4:
-#    ec8_p18:
-#    bas:
-#    siteclass:
-#    h800:
-#    ec8:
-#    soiltype:
-#    THV:
-#    PHV:
-#    T_15:
-#    F_15:
-#    D50_15:
-#    freeface_ratio:
\ No newline at end of file
diff --git a/egsim/smtk/residuals.py b/egsim/smtk/residuals.py
index e54197ee..b7e4e0af 100644
--- a/egsim/smtk/residuals.py
+++ b/egsim/smtk/residuals.py
@@ -65,7 +65,7 @@ def get_residuals(gsims: Iterable[str], imts: Iterable[str],
     flatfile2[ev_cols] = flatfile[ev_cols]
     # copy station columns (for the moment not used, so skip if no station columns)
     try:
-        st_cols = get_station_id_column_names()
+        st_cols = get_station_id_column_names(flatfile)
         flatfile2[st_cols] = flatfile[st_cols]
     except InvalidColumn:
         pass
diff --git a/tests/smtk/flatfile/test_flatfile_exceptions.py b/tests/smtk/flatfile/test_flatfile_exceptions.py
new file mode 100644
index 00000000..0324c113
--- /dev/null
+++ b/tests/smtk/flatfile/test_flatfile_exceptions.py
@@ -0,0 +1,76 @@
+"""
+Created on 16 Feb 2018
+
+@author: riccardo
+"""
+import pytest
+
+from egsim.smtk.flatfile import columns
+
+
+def test_flatfile_exceptions():
+    tested_classes = []
+
+    for cols in [['hypo_lat'], ['unknown'], ['hypo_lat', 'unknown'],
+                 ['st_lon', 'hypo_lat', 'unknown']]:
+
+        tested_classes.append(columns.InvalidColumn)
+        exc = tested_classes[-1](*cols)
+        if len(cols) == 1:
+            assert str(exc) == 'Invalid column ' + repr(cols[0])
+        else:
+            assert str(exc).startswith('Invalid columns ' + repr(cols[0]) + ', ')
+
+        tested_classes.append(columns.MissingColumn)
+        exc = tested_classes[-1](*cols)
+        c_names = exc.get_all_names_of(cols[0])
+        if len(cols) == 1:
+            assert str(exc).startswith('Missing column ' + repr(c_names[0]))
+        else:
+            assert str(exc).startswith('Missing columns ' + repr(c_names[0]))
+        if cols[0] == 'hypo_lat':
+            assert f"{repr(c_names[0])} (or " in str(exc)
+            assert all(repr(_) in str(exc) for _ in c_names)
+
+        tested_classes.append(columns.InvalidDataInColumn)
+        exc = tested_classes[-1](*cols)
+        if len(cols) == 1:
+            assert str(exc).startswith('Invalid data in column ' + repr(cols[0]))
+        else:
+            assert str(exc).startswith('Invalid data in columns ' + repr(cols[0]))
+
+        tested_classes.append(columns.InvalidColumnName)
+        exc = tested_classes[-1](*cols)
+        if len(cols) == 1:
+            assert str(exc).startswith('Invalid column name ' + repr(cols[0]))
+        else:
+            assert str(exc).startswith('Invalid column names ' + repr(cols[0]))
+
+        # test ConflictingColumns last (with a single column it raises, and the
+        # `continue` below would otherwise skip the remaining checks of this loop):
+        tested_classes.append(columns.ConflictingColumns)
+        if len(cols) <= 1:
+            with pytest.raises(TypeError):
+                # conflicting columns need at least two arguments:
+                exc = tested_classes[-1](*cols)
+            continue
+        exc = tested_classes[-1](*cols)
+        assert str(exc).startswith('Conflicting columns ' + repr(cols[0]) + ' vs. ')
+
+    # check that we tested all exception types:
+    excs = set(tested_classes)
+    found = 0
+    for exc in dir(columns):
+        cls = getattr(columns, exc, None)
+        try:
+            is_subcls = issubclass(cls, columns.InvalidColumn)
+        except TypeError:
+            is_subcls = False
+        if is_subcls:
+            found += 1
+            if cls not in excs:
+                raise ValueError(f'Not tested: {str(cls)}')
+
+    if found != len(excs):
+        raise ValueError(f'Expected {len(excs)} InvalidColumn subclasses '
+                         f'in module {str(columns)}, found {found}')
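
A minimal usage sketch (not part of the patch) of how the exception hierarchy introduced in columns.py is meant to be consumed via prepare_for_residuals. The flatfile below is hypothetical, and it is assumed here that 'PGA' is a supported intensity measure column:

import pandas as pd

from egsim.smtk.flatfile import prepare_for_residuals
from egsim.smtk.flatfile.columns import (InvalidColumn, MissingColumn,
                                         ConflictingColumns)

# hypothetical flatfile: a magnitude column only, no intensity measures:
flatfile = pd.DataFrame({'mag': [4.5, 5.0]})

try:
    # no models for brevity; 'PGA' is not a column of `flatfile`:
    prepare_for_residuals(flatfile, gsims=[], imts=['PGA'])
except MissingColumn as exc:
    # e.g. "Missing column 'PGA'". Note that MissingColumn also inherits from
    # AttributeError and KeyError (for pandas / OpenQuake compliance):
    print(exc)
except ConflictingColumns as exc:
    # e.g. "Conflicting columns 'mag' vs. 'magnitude'" (two aliases of the
    # same column present at once):
    print(exc)
except InvalidColumn as exc:
    # base class, catches any other column-related error:
    print(exc)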