diff --git a/egsim/smtk/residuals.py b/egsim/smtk/residuals.py deleted file mode 100644 index 56c24cd7..00000000 --- a/egsim/smtk/residuals.py +++ /dev/null @@ -1,625 +0,0 @@ -""" -Residuals module -""" - -from collections.abc import Iterable, Collection - -from pandas import RangeIndex -from typing import Union - -from math import sqrt, ceil - -import numpy as np -import pandas as pd -from pandas.core.indexes.numeric import IntegerIndex - -from scipy.special import erf -from scipy.stats import norm -from scipy.linalg import solve - -from openquake.hazardlib.gsim.base import GMPE -from openquake.hazardlib import imt, const -from openquake.hazardlib.contexts import RuptureContext - -from . import check_gsim_list, get_gsim_name, get_SA_period #, convert_accel_units -from .flatfile.preparation import (get_event_id_column_names, - get_station_id_column_names, - setup_flatfile_for_residuals) -from .flatfile.columns import InvalidColumn, MissingColumn, get_rupture_params - - -def get_sa_limits(gsim: GMPE) -> Union[tuple[float, float], None]: - pers = None - for c in dir(gsim): - if 'COEFFS' in c: - pers = [sa.period for sa in getattr(gsim, c).sa_coeffs] - break - return (min(pers), max(pers)) if pers is not None else None - - -def check_sa_limits(gsim: GMPE, im: str): # FIXME remove? is it used? - """Return False if the period defined for the SA given in `im` does not match the - SA limits defined for the given model (`gsim`). Return True in any other case - (period within limits, gsim not defining any SA limit, `im` not representing SA) - """ - period = get_SA_period(im) - if period is None: - return True - limits = get_sa_limits(gsim) - if limits is None: - return True - return limits[0] < period < limits[1] - - -def get_residuals(gsims: Iterable[str], imts: Collection[str], - flatfile: pd.DataFrame, nodal_plane_index=1, - component="Geometric", compute_lh=False, - normalise=True) -> pd.DataFrame: # FIXME remove unused arguments? - """ - Calculate the residuals from a given flatfile gsim(s) and imt(s). This function - modifies the passed flatfile inplace (e.g. by adding all residuals-related computed - columns, see :ref:`c_labels` for details) - - :poram: imts iterable of strings denoting intensity measures (Sa must be - given with period, e.g. "SA(0.2)") - :return: the passed flatfile with all residuals computed columns added - """ - gsims = check_gsim_list(gsims) - flatfile2 = setup_flatfile_for_residuals(flatfile, gsims.values(), imts) - # copy event columns (raises if columns not found): - ev_cols = get_event_id_column_names(flatfile) - flatfile2[ev_cols] = flatfile[ev_cols] - # copy station columns (for the moment not used, so skip if no station columns) - try: - st_cols = get_station_id_column_names(flatfile) - flatfile2[st_cols] = flatfile[st_cols] - except InvalidColumn: - pass - # compute residuals: - residuals = calculate_flatfile_residuals(gsims, imts, flatfile2, - normalise=normalise) - # concatenate expected in flatfile (add new columns): - flatfile[list(residuals.columns)] = residuals - if compute_lh: - get_residuals_likelihood(gsims, imts, flatfile) - return flatfile - - -def calculate_flatfile_residuals(gsims: dict[str, GMPE], imts: Iterable[str], - flatfile: pd.DataFrame, normalise=True) -> pd.DataFrame: - residuals:pd.DataFrame = pd.DataFrame(index=flatfile.index) - imts = list(imts) - # computget the observations (compute the log for all once here): - observations = pd.DataFrame(index=flatfile.index, - columns=imts, - data=np.log(flatfile[imts])) - for context in yield_event_contexts(flatfile): - # Get the expected ground motions by event - expected = calculate_expected_motions(gsims.values(), imts, context) - # Get residuals: - res = calculate_residuals(gsims, imts, observations.loc[expected.index, :], - expected, normalise=normalise) - if residuals.empty: - for col in list(expected.columns) + list(res.columns): - residuals[col] = np.nan - # copy values: - residuals.loc[expected.index, expected.columns] = expected - residuals.loc[res.index, res.columns] = res - - return residuals - - -class EventContext(RuptureContext): - """A RuptureContext accepting a flatfile (pandas DataFrame) as input""" - - rupture_params:set[str] = None - - def __init__(self, flatfile: pd.DataFrame): - super().__init__() - if not isinstance(flatfile.index, IntegerIndex): - raise ValueError('flatfile index should be made of unique integers') - self._flatfile = flatfile - if self.__class__.rupture_params is None: - # get rupture params once for all instances the first time only: - self.__class__.rupture_params = get_rupture_params() - - def __eq__(self, other): # FIXME: legacy code, is it still used? - assert isinstance(other, EventContext) and \ - self._flatfile is other._flatfile - - @property - def sids(self) -> IntegerIndex: - """Return the ids (iterable of integers) of the records (or sites) used to build - this context. The returned pandas `IntegerIndex` must have unique values so that - the records (flatfile rows) can always be retrieved from the source flatfile via - `flatfile.loc[self.sids, :]` - """ - # note that this attribute is used also when calculating `len(self)` so do not - # delete or rename. See superclass for details - return self._flatfile.index - - - def __getattr__(self, column_name): - """Return a non-found Context attribute by searching in the underlying - flatfile column. Raises AttributeError (as usual) if `item` is not found - """ - try: - values = self._flatfile[column_name].values - except KeyError: - raise MissingColumn(column_name) - if column_name in self.rupture_params: - values = values[0] - return values - - -def yield_event_contexts(flatfile: pd.DataFrame) -> Iterable[EventContext]: - """Group the flatfile by events, and yield `EventContext`s objects, one for - each event""" - # assure each row has a unique int id from 0 until row_count-1: - if not isinstance(flatfile.index, RangeIndex): - flatfile.reset_index(drop=True, inplace=True) - - # check event id column or use the event location to group events: - # group flatfile by events. Use ev. id (_EVENT_COLUMNS[0]) or, when - # no ID found, event spatio-temporal coordinates (_EVENT_COLUMNS[1:]) - ev_sub_flatfiles = flatfile.groupby(get_event_id_column_names(flatfile)) - - for ev_id, dfr in ev_sub_flatfiles: - if not dfr.empty: # for safety ... - yield EventContext(dfr) - - -def calculate_expected_motions(gsims: Iterable[GMPE], imts: Iterable[str], - ctx: EventContext) -> pd.DataFrame: - """ - Calculate the expected ground motions from the context - """ - expected:pd.DataFrame = pd.DataFrame(index=ctx.sids) - label_of = { - const.StdDev.TOTAL: c_labels.total, - const.StdDev.INTER_EVENT: c_labels.inter_ev, - const.StdDev.INTRA_EVENT: c_labels.intra_ev - } - imts_dict = {i: imt.from_string(i) for i in imts} - # Period range for GSIM - for gsim in gsims: - types = gsim.DEFINED_FOR_STANDARD_DEVIATION_TYPES - for imt_name, imtx in imts_dict.items(): - period = get_SA_period(imtx) - if period is not None: - sa_period_limits = get_sa_limits(gsim) - if sa_period_limits is not None and not \ - (sa_period_limits[0] < period < sa_period_limits[1]): - continue - mean, stddev = gsim.get_mean_and_stddevs(ctx, ctx, ctx, imtx, types) - expected[column_label(gsim, imt_name, c_labels.mean)] = mean - for std_type, values in zip(types, stddev): - expected[column_label(gsim, imt_name, label_of[std_type])] = values - return expected - - -def calculate_residuals(gsims: Iterable[str], imts: Iterable[str], - observations: pd.DataFrame, expected_motions: pd.DataFrame, - normalise=True): - """ - Calculate the residual terms, modifies `flatfile` inplace - - :param observations: the flatfile of the (natural logarithm of) the - observed ground motions - """ - residuals: pd.DataFrame = pd.DataFrame(index=observations.index) - for gsim in gsims: - for imtx in imts: - obs = observations.get(imtx) - if obs is None: - continue - mean = expected_motions.get(column_label(gsim, imtx, c_labels.mean)) - total_stddev = expected_motions.get(column_label(gsim, imtx, c_labels.total)) - if mean is None or total_stddev is None: - continue - residuals[column_label(gsim, imtx, c_labels.total_res)] = \ - (obs - mean) / total_stddev - inter_ev = expected_motions.get(column_label(gsim, imtx, c_labels.inter_ev)) - intra_ev = expected_motions.get(column_label(gsim, imtx, c_labels.intra_ev)) - if inter_ev is None or intra_ev is None: - continue - inter, intra = _get_random_effects_residuals(obs, mean, inter_ev, - intra_ev, normalise) - residuals[column_label(gsim, imtx, c_labels.inter_ev_res)] = inter - residuals[column_label(gsim, imtx, c_labels.intra_ev_res)] = intra - return residuals - - -def _get_random_effects_residuals(obs, mean, inter, intra, normalise=True): - """ - Calculate the random effects residuals using the inter-event - residual formula described in Abrahamson & Youngs (1992) Eq. 10 - """ - nvals = float(len(mean)) - inter_res = ((inter ** 2.) * sum(obs - mean)) /\ - (nvals * (inter ** 2.) + (intra ** 2.)) - intra_res = obs - (mean + inter_res) - if normalise: - return inter_res / inter, intra_res / intra - return inter_res, intra_res - - -class c_labels: - """Flatfile computed column labels (e.g. expected motion, residual)""" - mean = "Mean" - total = const.StdDev.TOTAL.capitalize() - inter_ev = const.StdDev.INTER_EVENT.replace(" ", "-").capitalize() - intra_ev = const.StdDev.INTRA_EVENT.replace(" ", "-").capitalize() - expected_motion_column = {total, inter_ev, intra_ev} - res = "residual" - total_res = total + f" {res}" - inter_ev_res = inter_ev + f" {res}" - intra_ev_res = intra_ev + f" {res}" - residuals_columns = {total_res, inter_ev_res, intra_ev_res} - lh = "LH" # likelihood - total_lh = total + f" {lh}" - inter_ev_lh = inter_ev + f" {lh}" - intra_ev_lh = intra_ev + f" {lh}" - lh_columns = {total_res, inter_ev_res, intra_ev_res} - - -def column_label(gsim: Union[str, GMPE], imtx: str, c_label: str): - """ - Return the column label for the given Gsim, imt and type (e.g. "Mean", - "Total-residuals", see :ref:`labels`). If residuals is True, the column is - supposed to denote a residual computed with the observed IMT stored in another column - """ - if isinstance(gsim, GMPE): - gsim = get_gsim_name(gsim) - return f"{gsim} {imtx} {c_label}" - - -def get_computed_columns(gsims: Union[None, Iterable[str]], - imts: Union[None, Iterable[str]], - flatfile: pd.DataFrame): - """Yield tuples from columns of `flatfile` denoting computed columns - (expected motions, residuals) for the given `gsims` and `imts`. A computed column - name has the form: `model + " "+ imt + " " + label` (See :ref:`column_label`). - Each yielded tuple is: - `column name, gsim name, imt name, residual label` - """ - if gsims is not None: - gsims = set(gsims) - if imts is not None: - imts = set(imts) - for col in flatfile.columns: - chunks = col.split(" ", 2) - if len(chunks) == 3 and \ - (gsims is None or chunks[0] in gsims) and \ - (imts is None or chunks[1] in imts): - yield col, chunks[0], chunks[1], chunks[2] - - -def get_residuals_stats(gsim_names: Iterable[str], imt_names: Iterable[str], - flatfile: pd.DataFrame) -> \ - dict[str, dict[str, dict[str, float]]]: - """ - Retrieve the mean and standard deviation values of the residuals - - :flatfile: the result of :ref:`get_residuals` - """ - stats = {} - for col, gsim, imtx, label in get_computed_columns(gsim_names, imt_names, flatfile): - if label not in c_labels.residuals_columns: - continue - if gsim not in stats: - stats[gsim] = {} - if imtx not in stats[gsim]: - stats[gsim][imtx] = {} - stats[gsim][imtx][f"{label} Mean"] = flatfile[col].mean() - stats[gsim][imtx][f"{label} Stddev"] = flatfile[col].std() - return stats - - -def get_residuals_likelihood(gsims: Iterable[str], imts: Iterable[str], - flatfile: pd.DataFrame) -> \ - dict[str, dict[str, dict[str, float]]]: - """ - Return the likelihood values for the residuals column found in `flatfile` - (e.g. Total, inter- intra-event) according to Equation 9 of Scherbaum et al (2004) - - :param flatfile: a pandas DataFrame resulting from :ref:`get_residuals`. - NOTE: the flatfile might be modified inplace (new likelihood columns added) - """ - result = {} - for col, gsim, imtx, label in get_computed_columns(gsims, imts, flatfile): - if label in c_labels.residuals_columns: - values = get_likelihood(flatfile[col]) - # get LH label corresponding to residuals: - lh_label = c_labels.total_lh - if label == c_labels.inter_ev_res: - lh_label = c_labels.inter_ev_lh - elif label == c_labels.intra_ev_res: - lh_label = c_labels.intra_ev_lh - # add new flatfile column: - result[column_label(gsim, imtx, lh_label)] = values - elif label in c_labels.lh_columns: - lh_label = label - values = flatfile[col] - else: - continue - if gsim not in result: - result[gsim] = {} - if imtx not in result[gsim]: - result[gsim][imtx] = {} - p25, p50, p75 = np.nanpercentile(values, [25, 50, 75]) - result[f"{lh_label} Median"] = p50 - result[f"{lh_label} IQR"] = p75 - p25 - return result - - -def get_likelihood(values: Union[np.ndarray, pd.Series]) -> Union[np.ndarray, pd.Series]: - """ - Returns the likelihood of the given values according to Equation 9 of - Scherbaum et al (2004) - """ - zvals = np.fabs(values) - return 1.0 - erf(zvals / sqrt(2.)) - - -def get_residuals_loglikelihood(gsims: Iterable[str], imts: Iterable[str], - flatfile: pd.DataFrame) -> \ - dict[str, dict[str, float]]: - """ - Return the loglikelihood fit for the "Total residuals" columns found in `flatfile` - using the loglikehood (LLH) function described in Scherbaum et al. (2009) - Scherbaum, F., Delavaud, E., Riggelsen, C. (2009) "Model Selection in - Seismic Hazard Analysis: An Information-Theoretic Perspective", - Bulletin of the Seismological Society of America, 99(6), 3234-3247 - - :param flatfile: a pandas DataFrame resulting from :ref:`get_residuals` - """ - result = {} - log_residuals = {} - for col, gsim, imtx, label in get_computed_columns(gsims, imts, flatfile): - if label != c_labels.total_res: - continue - asll = np.log2(norm.pdf(flatfile[col], 0., 1.0)) - if gsim not in result: - result[gsim] = {} - result[imtx] = -(1.0 / len(asll)) * np.sum(asll) - log_residuals[gsim] = np.hstack([log_residuals[gsim], asll]) - for gsim, asll in log_residuals.items(): - result[gsim]["All"] = -(1.0 / len(asll)) * np.sum(asll) - return result - - -def get_residuals_edr_values(gsims: Iterable[str], imts: Iterable[str], - flatfile: pd.DataFrame, bandwidth=0.01, - multiplier=3.0) -> \ - dict[str, dict[str, float]]: - """ - Calculates the EDR values for each Gsim found in `flatfile` with computed residuals - according to the Euclidean Distance Ranking method of Kale & Akkar (2013) - - Kale, O., and Akkar, S. (2013) "A New Procedure for Selecting and - Ranking Ground Motion Predicion Equations (GMPEs): The Euclidean - Distance-Based Ranking Method", Bulletin of the Seismological Society - of America, 103(2A), 1069 - 1084. - - :param flatfile: a pandas DataFrame resulting from :ref:`get_residuals` - :param float bandwidth: Discretisation width - :param float multiplier: "Multiplier of standard deviation (equation 8 of Kale - and Akkar) - """ - result = {} - for gsim in gsims: - obs, expected, stddev = _get_edr_gsim_information(gsim, imts, flatfile) - results = get_edr(obs, expected, stddev, bandwidth, multiplier) - if gsim not in result: - result[gsim] = {} - result[gsim]["MDE Norm"] = float(results[0]) - result[gsim]["sqrt Kappa"] = float(results[1]) - result[gsim]["EDR"] = float(results[2]) - return result - - -def _get_edr_gsim_information(gsim: str, imts: Iterable[str], flatfile:pd.DataFrame) -> \ - tuple[np.ndarray, np.ndarray, np.ndarray]: - """ - Extract the observed ground motions, expected and total standard - deviation for the GMPE (aggregating over all IMS) - - Note: `get_residuals` must have been called on `flatfile` before this function - """ - obs = np.array([], dtype=float) - expected = np.array([], dtype=float) - stddev = np.array([], dtype=float) - for col, gsim, imtx, label in get_computed_columns([gsim], imts, flatfile): - if label != c_labels.total: - continue - _stddev = flatfile[column_label(gsim, imtx, c_labels.total)] - _obs = flatfile.get(imtx) - _expected = flatfile.get(column_label(gsim, imtx, c_labels.mean)) - if _expected is not None and _obs is not None: - obs = np.hstack([obs, np.log(flatfile[imtx])]) - expected = np.hstack([expected, _expected]) - stddev = np.hstack([stddev, _stddev]) - - return obs, expected, stddev - - -def get_edr(obs: Union[np.ndarray, pd.Series], - expected: Union[np.ndarray, pd.Series], - stddev: Union[np.ndarray, pd.Series], - bandwidth=0.01, multiplier=3.0) -> tuple[float, float, float]: - """ - Calculated the Euclidean Distanced-Based Rank for a set of - observed and expected values from a particular Gsim - """ - finite = np.isfinite(obs) & np.isfinite(expected) & np.isfinite(stddev) - if not finite.any(): - return np.nan, np.nan, np.nan - elif not finite.all(): - obs, expected, stddev = obs[finite], expected[finite], stddev[finite] - nvals = len(obs) - min_d = bandwidth / 2. - kappa = _get_edr_kappa(obs, expected) - mu_d = obs - expected - d1c = np.fabs(obs - (expected - (multiplier * stddev))) - d2c = np.fabs(obs - (expected + (multiplier * stddev))) - dc_max = ceil(np.max(np.array([np.max(d1c), np.max(d2c)]))) - num_d = len(np.arange(min_d, dc_max, bandwidth)) - mde = np.zeros(nvals) - for iloc in range(0, num_d): - d_val = (min_d + (float(iloc) * bandwidth)) * np.ones(nvals) - d_1 = d_val - min_d - d_2 = d_val + min_d - p_1 = norm.cdf((d_1 - mu_d) / stddev) -\ - norm.cdf((-d_1 - mu_d) / stddev) - p_2 = norm.cdf((d_2 - mu_d) / stddev) -\ - norm.cdf((-d_2 - mu_d) / stddev) - mde += (p_2 - p_1) * d_val - inv_n = 1.0 / float(nvals) - mde_norm = np.sqrt(inv_n * np.sum(mde ** 2.)) - edr = np.sqrt(kappa * inv_n * np.sum(mde ** 2.)) - return float(mde_norm), float(np.sqrt(kappa)), float(edr) - - -def _get_edr_kappa(obs: Union[np.ndarray, pd.Series], - expected: Union[np.ndarray, pd.Series]) -> np.floating: - """ - Returns the correction factor kappa - """ - mu_a = np.mean(obs) - mu_y = np.mean(expected) - b_1 = np.sum((obs - mu_a) * (expected - mu_y)) /\ - np.sum((obs - mu_a) ** 2.) - b_0 = mu_y - b_1 * mu_a - y_c = expected - ((b_0 + b_1 * obs) - obs) - de_orig = np.sum((obs - expected) ** 2.) - de_corr = np.sum((obs - y_c) ** 2.) - return de_orig / de_corr - - -# FIXME: from here on multivariate_llh need to be fixed with GW - - -# Mak et al multivariate LLH functions -def get_residuals_multivariate_loglikelihood(gsim_names: list[str], imt_names: list[str], - flatfile: pd.DataFrame, contexts=None): - """ - Calculates the multivariate LLH for a set of GMPEs and IMTS according - to the approach described in Mak et al. (2017) - - Mak, S., Clements, R. A. and Schorlemmer, D. (2017) "Empirical - Evaluation of Hierarchical Ground-Motion Models: Score Uncertainty - and Model Weighting", Bulletin of the Seismological Society of America, - 107(2), 949-965 - - """ - raise NotImplementedError('this method is not supported and needs verification') - result = pd.DataFrame(index=flatfile.index) - for col, gsim, imt, label in residuals_columns(gsim_names, imt_names, flatfile): - result[col + "-multivariate-loglikelihood"] = _get_multivariate_ll( - contexts, gsim, imt) - return result - - -def _get_multivariate_ll(contexts, gmpe, imt): - """ - Returns the multivariate loglikelihood, as described om equation 7 of - Mak et al. (2017) - """ - observations, v_mat, expected_mat, neqs, nrecs = _build_matrices( - contexts, gmpe, imt) - sign, logdetv = np.linalg.slogdet(v_mat) - b_mat = observations - expected_mat - # `solve` below needs only finite values (see doc), but unfortunately raises - # in case. In order to silently skip non-finite values: - # 1. Check v_mat (square matrix), by simply returning nan if any cell value - # is nan (FIXME: handle better?): - if not np.isfinite(v_mat).all(): - return np.nan - # 2. Check b_mat (array) by removing finite values: - b_finite = np.isfinite(b_mat) - if not b_finite.all(): - if not b_finite.any(): - return np.nan - b_mat = b_mat[b_finite] - v_mat = v_mat[b_finite, :][:, b_finite] - # call `solve(...check_finite=False)` because the check is already done. - # The function shouldn't raise anymore: - return (float(nrecs) * np.log(2.0 * np.pi) + logdetv + - (b_mat.T.dot(solve(v_mat, b_mat, check_finite=False)))) / 2. - - -# The following methods are used for the MultivariateLLH function -def _build_matrices(contexts, gmpe, imtx): - """ - Constructs the R and Z_G matrices (based on the implementation - in the supplement to Mak et al (2017) - """ - neqs = len(contexts) - nrecs = sum(ctxt["Num. Sites"] for ctxt in contexts) - - r_mat = np.zeros(nrecs, dtype=float) - z_g_mat = np.zeros([nrecs, neqs], dtype=float) - expected_mat = np.zeros(nrecs, dtype=float) - # Get observations - observations = np.zeros(nrecs) - i = 0 - # Determine the total number of records and pass the log of the - # observations to the observations dictionary - for ctxt in contexts: - n_s = ctxt["Num. Sites"] - observations[i:(i + n_s)] = np.log(ctxt["Observations"][imtx]) - i += n_s - - i = 0 - for j, ctxt in enumerate(contexts): - if not("Intra event" in ctxt["Expected"][gmpe][imtx]) and\ - not("Inter event" in ctxt["Expected"][gmpe][imtx]): - # Only the total sigma exists - # Total sigma is used as intra-event sigma (from S. Mak) - n_r = len(ctxt["Expected"][gmpe][imtx]["Total"]) - r_mat[i:(i + n_r)] = ctxt["Expected"][gmpe][imtx]["Total"] - expected_mat[i:(i + n_r)] = ctxt["Expected"][gmpe][imtx]["Mean"] - # Inter-event sigma is set to 0 - i += n_r - continue - n_r = len(ctxt["Expected"][gmpe][imtx]["Intra event"]) - r_mat[i:(i + n_r)] = ctxt["Expected"][gmpe][imtx]["Intra event"] - # Get expected mean - expected_mat[i:(i + n_r)] = ctxt["Expected"][gmpe][imtx]["Mean"] - if len(ctxt["Expected"][gmpe][imtx]["Inter event"]) == 1: - # Single inter event residual - z_g_mat[i:(i + n_r), j] =\ - ctxt["Expected"][gmpe][imtx]["Inter event"][0] - else: - # inter-event residual given at a vector - z_g_mat[i:(i + n_r), j] =\ - ctxt["Expected"][gmpe][imtx]["Inter event"] - i += n_r - - v_mat = np.diag(r_mat ** 2.) + z_g_mat.dot(z_g_mat.T) - return observations, v_mat, expected_mat, neqs, nrecs - - -# ============================================================ - - -# GSIM_LIST = AVAILABLE_GSIMS -# GSIM_KEYS = set(GSIM_LIST) -# -# # SCALAR_IMTS = ["PGA", "PGV", "PGD", "CAV", "Ia"] -# SCALAR_IMTS = ["PGA", "PGV"] -# STDDEV_KEYS = ["Mean", "Total", "Inter event", "Intra event"] - - -GSIM_MODEL_DATA_TESTS = { - "Residuals": lambda residuals, config: - residuals.get_residual_statistics(), - "Likelihood": lambda residuals, config: residuals.get_likelihood_values(), - "LLH": lambda residuals, config: residuals.get_loglikelihood_values( - config.get("LLH IMTs", [imt for imt in residuals.imts])), - "MultivariateLLH": lambda residuals, config: - residuals.get_multivariate_loglikelihood_values(), - "EDR": lambda residuals, config: residuals.get_edr_values( - config.get("bandwidth", 0.01), config.get("multiplier", 3.0)) -} diff --git a/egsim/smtk/residuals/__init__.py b/egsim/smtk/residuals/__init__.py new file mode 100644 index 00000000..6ae05faf --- /dev/null +++ b/egsim/smtk/residuals/__init__.py @@ -0,0 +1,335 @@ +""" +Residuals module +""" + +from collections.abc import Iterable, Collection + +from pandas import RangeIndex +from typing import Union + +from math import sqrt + +import numpy as np +import pandas as pd +from pandas.core.indexes.numeric import IntegerIndex + +from scipy.special import erf + +from openquake.hazardlib.gsim.base import GMPE +from openquake.hazardlib import imt, const +from openquake.hazardlib.contexts import RuptureContext + +from .. import check_gsim_list, get_gsim_name, get_SA_period #, convert_accel_units +from ..flatfile.preparation import (get_event_id_column_names, + get_station_id_column_names, + setup_flatfile_for_residuals) +from ..flatfile.columns import InvalidColumn, MissingColumn, get_rupture_params + + +def get_sa_limits(gsim: GMPE) -> Union[tuple[float, float], None]: + pers = None + for c in dir(gsim): + if 'COEFFS' in c: + pers = [sa.period for sa in getattr(gsim, c).sa_coeffs] + break + return (min(pers), max(pers)) if pers is not None else None + + +def check_sa_limits(gsim: GMPE, im: str): # FIXME remove? is it used? + """Return False if the period defined for the SA given in `im` does not match the + SA limits defined for the given model (`gsim`). Return True in any other case + (period within limits, gsim not defining any SA limit, `im` not representing SA) + """ + period = get_SA_period(im) + if period is None: + return True + limits = get_sa_limits(gsim) + if limits is None: + return True + return limits[0] < period < limits[1] + + +def get_residuals(gsims: Iterable[str], imts: Collection[str], + flatfile: pd.DataFrame, nodal_plane_index=1, + component="Geometric", likelihood=False, + normalise=True) -> pd.DataFrame: # FIXME remove unused arguments? + """ + Calculate the residuals from a given flatfile gsim(s) and imt(s). This function + modifies the passed flatfile inplace (e.g. by adding all residuals-related computed + columns, see :ref:`c_labels` for details) + + :poram: imts iterable of strings denoting intensity measures (Sa must be + given with period, e.g. "SA(0.2)") + :param likelihood: boolean telling if also the likelihood of the residuals + (according to Equation 9 of Scherbaum et al (2004)) should be computed + """ + gsims = check_gsim_list(gsims) + flatfile2 = setup_flatfile_for_residuals(flatfile, gsims.values(), imts) + # copy event columns (raises if columns not found): + ev_cols = get_event_id_column_names(flatfile) + flatfile2[ev_cols] = flatfile[ev_cols] + # copy station columns (for the moment not used, so skip if no station columns) + try: + st_cols = get_station_id_column_names(flatfile) + flatfile2[st_cols] = flatfile[st_cols] + except InvalidColumn: + pass + # compute residuals: + residuals = calculate_flatfile_residuals(gsims, imts, flatfile2, + normalise=normalise) + # concatenate expected in flatfile (add new columns): + flatfile[list(residuals.columns)] = residuals + if likelihood: + get_residuals_likelihood(gsims, imts, flatfile) + return flatfile + + +def calculate_flatfile_residuals(gsims: dict[str, GMPE], imts: Iterable[str], + flatfile: pd.DataFrame, normalise=True) -> pd.DataFrame: + residuals:pd.DataFrame = pd.DataFrame(index=flatfile.index) + imts = list(imts) + # computget the observations (compute the log for all once here): + observations = pd.DataFrame(index=flatfile.index, + columns=imts, + data=np.log(flatfile[imts])) + for context in yield_event_contexts(flatfile): + # Get the expected ground motions by event + expected = calculate_expected_motions(gsims.values(), imts, context) + # Get residuals: + res = calculate_residuals(gsims, imts, observations.loc[expected.index, :], + expected, normalise=normalise) + if residuals.empty: + for col in list(expected.columns) + list(res.columns): + residuals[col] = np.nan + # copy values: + residuals.loc[expected.index, expected.columns] = expected + residuals.loc[res.index, res.columns] = res + + return residuals + + +class EventContext(RuptureContext): + """A RuptureContext accepting a flatfile (pandas DataFrame) as input""" + + rupture_params:set[str] = None + + def __init__(self, flatfile: pd.DataFrame): + super().__init__() + if not isinstance(flatfile.index, IntegerIndex): + raise ValueError('flatfile index should be made of unique integers') + self._flatfile = flatfile + if self.__class__.rupture_params is None: + # get rupture params once for all instances the first time only: + self.__class__.rupture_params = get_rupture_params() + + def __eq__(self, other): # FIXME: legacy code, is it still used? + assert isinstance(other, EventContext) and \ + self._flatfile is other._flatfile + + @property + def sids(self) -> IntegerIndex: + """Return the ids (iterable of integers) of the records (or sites) used to build + this context. The returned pandas `IntegerIndex` must have unique values so that + the records (flatfile rows) can always be retrieved from the source flatfile via + `flatfile.loc[self.sids, :]` + """ + # note that this attribute is used also when calculating `len(self)` so do not + # delete or rename. See superclass for details + return self._flatfile.index + + + def __getattr__(self, column_name): + """Return a non-found Context attribute by searching in the underlying + flatfile column. Raises AttributeError (as usual) if `item` is not found + """ + try: + values = self._flatfile[column_name].values + except KeyError: + raise MissingColumn(column_name) + if column_name in self.rupture_params: + values = values[0] + return values + + +def yield_event_contexts(flatfile: pd.DataFrame) -> Iterable[EventContext]: + """Group the flatfile by events, and yield `EventContext`s objects, one for + each event""" + # assure each row has a unique int id from 0 until row_count-1: + if not isinstance(flatfile.index, RangeIndex): + flatfile.reset_index(drop=True, inplace=True) + + # check event id column or use the event location to group events: + # group flatfile by events. Use ev. id (_EVENT_COLUMNS[0]) or, when + # no ID found, event spatio-temporal coordinates (_EVENT_COLUMNS[1:]) + ev_sub_flatfiles = flatfile.groupby(get_event_id_column_names(flatfile)) + + for ev_id, dfr in ev_sub_flatfiles: + if not dfr.empty: # for safety ... + yield EventContext(dfr) + + +def calculate_expected_motions(gsims: Iterable[GMPE], imts: Iterable[str], + ctx: EventContext) -> pd.DataFrame: + """ + Calculate the expected ground motions from the context + """ + expected:pd.DataFrame = pd.DataFrame(index=ctx.sids) + label_of = { + const.StdDev.TOTAL: c_labels.total, + const.StdDev.INTER_EVENT: c_labels.inter_ev, + const.StdDev.INTRA_EVENT: c_labels.intra_ev + } + imts_dict = {i: imt.from_string(i) for i in imts} + # Period range for GSIM + for gsim in gsims: + types = gsim.DEFINED_FOR_STANDARD_DEVIATION_TYPES + for imt_name, imtx in imts_dict.items(): + period = get_SA_period(imtx) + if period is not None: + sa_period_limits = get_sa_limits(gsim) + if sa_period_limits is not None and not \ + (sa_period_limits[0] < period < sa_period_limits[1]): + continue + mean, stddev = gsim.get_mean_and_stddevs(ctx, ctx, ctx, imtx, types) + expected[column_label(gsim, imt_name, c_labels.mean)] = mean + for std_type, values in zip(types, stddev): + expected[column_label(gsim, imt_name, label_of[std_type])] = values + return expected + + +def calculate_residuals(gsims: Iterable[str], imts: Iterable[str], + observations: pd.DataFrame, expected_motions: pd.DataFrame, + normalise=True): + """ + Calculate the residual terms, modifies `flatfile` inplace + + :param observations: the flatfile of the (natural logarithm of) the + observed ground motions + """ + residuals: pd.DataFrame = pd.DataFrame(index=observations.index) + for gsim in gsims: + for imtx in imts: + obs = observations.get(imtx) + if obs is None: + continue + mean = expected_motions.get(column_label(gsim, imtx, c_labels.mean)) + total_stddev = expected_motions.get(column_label(gsim, imtx, c_labels.total)) + if mean is None or total_stddev is None: + continue + residuals[column_label(gsim, imtx, c_labels.total_res)] = \ + (obs - mean) / total_stddev + inter_ev = expected_motions.get(column_label(gsim, imtx, c_labels.inter_ev)) + intra_ev = expected_motions.get(column_label(gsim, imtx, c_labels.intra_ev)) + if inter_ev is None or intra_ev is None: + continue + inter, intra = _get_random_effects_residuals(obs, mean, inter_ev, + intra_ev, normalise) + residuals[column_label(gsim, imtx, c_labels.inter_ev_res)] = inter + residuals[column_label(gsim, imtx, c_labels.intra_ev_res)] = intra + return residuals + + +def _get_random_effects_residuals(obs, mean, inter, intra, normalise=True): + """ + Calculate the random effects residuals using the inter-event + residual formula described in Abrahamson & Youngs (1992) Eq. 10 + """ + nvals = float(len(mean)) + inter_res = ((inter ** 2.) * sum(obs - mean)) /\ + (nvals * (inter ** 2.) + (intra ** 2.)) + intra_res = obs - (mean + inter_res) + if normalise: + return inter_res / inter, intra_res / intra + return inter_res, intra_res + + +class c_labels: + """Flatfile computed column labels (e.g. expected motion, residual)""" + mean = "Mean" + total = const.StdDev.TOTAL.capitalize() + inter_ev = const.StdDev.INTER_EVENT.replace(" ", "-").capitalize() + intra_ev = const.StdDev.INTRA_EVENT.replace(" ", "-").capitalize() + expected_motion_column = {total, inter_ev, intra_ev} + res = "residual" + total_res = total + f" {res}" + inter_ev_res = inter_ev + f" {res}" + intra_ev_res = intra_ev + f" {res}" + residuals_columns = {total_res, inter_ev_res, intra_ev_res} + lh = "LH" # likelihood + total_lh = total + f" {lh}" + inter_ev_lh = inter_ev + f" {lh}" + intra_ev_lh = intra_ev + f" {lh}" + lh_columns = {total_res, inter_ev_res, intra_ev_res} + + +def column_label(gsim: Union[str, GMPE], imtx: str, c_label: str): + """ + Return the column label for the given Gsim, imt and type (e.g. "Mean", + "Total-residuals", see :ref:`labels`). If residuals is True, the column is + supposed to denote a residual computed with the observed IMT stored in another column + """ + if isinstance(gsim, GMPE): + gsim = get_gsim_name(gsim) + return f"{gsim} {imtx} {c_label}" + + +def get_computed_columns(gsims: Union[None, Iterable[str]], + imts: Union[None, Iterable[str]], + flatfile: pd.DataFrame): + """Yield tuples from columns of `flatfile` denoting computed columns + (expected motions, residuals) for the given `gsims` and `imts`. A computed column + name has the form: `model + " "+ imt + " " + label` (See :ref:`column_label`). + Each yielded tuple is: + `column name, gsim name, imt name, residual label` + """ + if gsims is not None: + gsims = set(gsims) + if imts is not None: + imts = set(imts) + for col in flatfile.columns: + chunks = col.split(" ", 2) + if len(chunks) == 3 and \ + (gsims is None or chunks[0] in gsims) and \ + (imts is None or chunks[1] in imts): + yield col, chunks[0], chunks[1], chunks[2] + + +def get_residuals_likelihood(gsims: Iterable[str], imts: Iterable[str], + flatfile: pd.DataFrame): + """ + Return the likelihood values for the residuals column found in `flatfile` + (e.g. Total, inter- intra-event) according to Equation 9 of Scherbaum et al (2004) + + :param flatfile: a pandas DataFrame resulting from :ref:`get_residuals`. + NOTE: the flatfile might be modified inplace (new likelihood columns added) + """ + ev_id_cols = None + for col, gsim, imtx, label in get_computed_columns(gsims, imts, flatfile): + if label not in c_labels.residuals_columns: + continue + # build label: + lh_label = c_labels.total_lh + if label == c_labels.inter_ev_res: + lh_label = c_labels.inter_ev_lh + elif label == c_labels.intra_ev_res: + lh_label = c_labels.intra_ev_lh + dest_label = column_label(gsim, imtx, lh_label) + # compute values + if c_labels.inter_ev in label: # inter event, group by events + flatfile[dest_label] = np.nan + if ev_id_cols is None: # lazy load + ev_id_cols = get_event_id_column_names(flatfile) + for ev_id, sub_flatfile in flatfile.groupby(ev_id_cols): + lh_values = get_likelihood(sub_flatfile[col]) + flatfile.loc[sub_flatfile.index, dest_label] = lh_values + else: + flatfile[dest_label] = get_likelihood(flatfile[col]) + + +def get_likelihood(values: Union[np.ndarray, pd.Series]) -> Union[np.ndarray, pd.Series]: + """ + Returns the likelihood of the given values according to Equation 9 of + Scherbaum et al (2004) + """ + zvals = np.fabs(values) + return 1.0 - erf(zvals / sqrt(2.)) diff --git a/egsim/smtk/residuals/stats.py b/egsim/smtk/residuals/stats.py new file mode 100644 index 00000000..04cedbbb --- /dev/null +++ b/egsim/smtk/residuals/stats.py @@ -0,0 +1,315 @@ +""" +Residuals module +""" + +from collections.abc import Iterable +from typing import Union +from math import ceil + +import numpy as np +import pandas as pd + +from scipy.stats import norm +from scipy.linalg import solve + +from egsim.smtk.residuals import get_computed_columns, c_labels, column_label + + +def get_residuals_stats(gsim_names: Iterable[str], imt_names: Iterable[str], + flatfile: pd.DataFrame) -> \ + dict[str, dict[str, dict[str, float]]]: + """ + Retrieve the mean and standard deviation values of the residuals + + :flatfile: the result of :ref:`get_residuals` + """ + stats = {} + for col, gsim, imtx, label in get_computed_columns(gsim_names, imt_names, flatfile): + if label not in c_labels.residuals_columns: + continue + if gsim not in stats: + stats[gsim] = {} + if imtx not in stats[gsim]: + stats[gsim][imtx] = {} + stats[gsim][imtx][f"{label} Mean"] = flatfile[col].mean() + stats[gsim][imtx][f"{label} Stddev"] = flatfile[col].std() + return stats + + +def get_residuals_likelihood(gsims: Iterable[str], imts: Iterable[str], + flatfile: pd.DataFrame) -> \ + dict[str, dict[str, dict[str, float]]]: + """ + Return the likelihood values for the residuals column found in `flatfile` + (e.g. Total, inter- intra-event) according to Equation 9 of Scherbaum et al (2004) + + :param flatfile: a pandas DataFrame resulting from :ref:`get_residuals`. + NOTE: the flatfile might be modified inplace (new likelihood columns added) + """ + result = {} + for col, gsim, imtx, label in get_computed_columns(gsims, imts, flatfile): + if label not in c_labels.lh_columns: + continue + values = flatfile[col] + if gsim not in result: + result[gsim] = {} + if imtx not in result[gsim]: + result[gsim][imtx] = {} + p25, p50, p75 = np.nanpercentile(values, [25, 50, 75]) + result[f"{label} Median"] = p50 + result[f"{label} IQR"] = p75 - p25 + return result + + +def get_residuals_loglikelihood(gsims: Iterable[str], imts: Iterable[str], + flatfile: pd.DataFrame) -> \ + dict[str, dict[str, float]]: + """ + Return the loglikelihood fit for the "Total residuals" columns found in `flatfile` + using the loglikehood (LLH) function described in Scherbaum et al. (2009) + Scherbaum, F., Delavaud, E., Riggelsen, C. (2009) "Model Selection in + Seismic Hazard Analysis: An Information-Theoretic Perspective", + Bulletin of the Seismological Society of America, 99(6), 3234-3247 + + :param flatfile: a pandas DataFrame resulting from :ref:`get_residuals` + """ + result = {} + log_residuals = {} + for col, gsim, imtx, label in get_computed_columns(gsims, imts, flatfile): + if label != c_labels.total_res: + continue + asll = np.log2(norm.pdf(flatfile[col], 0., 1.0)) + if gsim not in result: + result[gsim] = {} + result[imtx] = -(1.0 / len(asll)) * np.sum(asll) + log_residuals[gsim] = np.hstack([log_residuals[gsim], asll]) + for gsim, asll in log_residuals.items(): + result[gsim]["All"] = -(1.0 / len(asll)) * np.sum(asll) + return result + + +def get_residuals_edr_values(gsims: Iterable[str], imts: Iterable[str], + flatfile: pd.DataFrame, bandwidth=0.01, + multiplier=3.0) -> \ + dict[str, dict[str, float]]: + """ + Calculates the EDR values for each Gsim found in `flatfile` with computed residuals + according to the Euclidean Distance Ranking method of Kale & Akkar (2013) + + Kale, O., and Akkar, S. (2013) "A New Procedure for Selecting and + Ranking Ground Motion Predicion Equations (GMPEs): The Euclidean + Distance-Based Ranking Method", Bulletin of the Seismological Society + of America, 103(2A), 1069 - 1084. + + :param flatfile: a pandas DataFrame resulting from :ref:`get_residuals` + :param float bandwidth: Discretisation width + :param float multiplier: "Multiplier of standard deviation (equation 8 of Kale + and Akkar) + """ + result = {} + for gsim in gsims: + obs, expected, stddev = _get_edr_gsim_information(gsim, imts, flatfile) + results = get_edr(obs, expected, stddev, bandwidth, multiplier) + if gsim not in result: + result[gsim] = {} + result[gsim]["MDE Norm"] = float(results[0]) + result[gsim]["sqrt Kappa"] = float(results[1]) + result[gsim]["EDR"] = float(results[2]) + return result + + +def _get_edr_gsim_information(gsim: str, imts: Iterable[str], flatfile:pd.DataFrame) -> \ + tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Extract the observed ground motions, expected and total standard + deviation for the GMPE (aggregating over all IMS) + + Note: `get_residuals` must have been called on `flatfile` before this function + """ + obs = np.array([], dtype=float) + expected = np.array([], dtype=float) + stddev = np.array([], dtype=float) + for col, gsim, imtx, label in get_computed_columns([gsim], imts, flatfile): + if label != c_labels.total: + continue + _stddev = flatfile[column_label(gsim, imtx, c_labels.total)] + _obs = flatfile.get(imtx) + _expected = flatfile.get(column_label(gsim, imtx, c_labels.mean)) + if _expected is not None and _obs is not None: + obs = np.hstack([obs, np.log(flatfile[imtx])]) + expected = np.hstack([expected, _expected]) + stddev = np.hstack([stddev, _stddev]) + + return obs, expected, stddev + + +def get_edr(obs: Union[np.ndarray, pd.Series], + expected: Union[np.ndarray, pd.Series], + stddev: Union[np.ndarray, pd.Series], + bandwidth=0.01, multiplier=3.0) -> tuple[float, float, float]: + """ + Calculated the Euclidean Distanced-Based Rank for a set of + observed and expected values from a particular Gsim + """ + finite = np.isfinite(obs) & np.isfinite(expected) & np.isfinite(stddev) + if not finite.any(): + return np.nan, np.nan, np.nan + elif not finite.all(): + obs, expected, stddev = obs[finite], expected[finite], stddev[finite] + nvals = len(obs) + min_d = bandwidth / 2. + kappa = _get_edr_kappa(obs, expected) + mu_d = obs - expected + d1c = np.fabs(obs - (expected - (multiplier * stddev))) + d2c = np.fabs(obs - (expected + (multiplier * stddev))) + dc_max = ceil(np.max(np.array([np.max(d1c), np.max(d2c)]))) + num_d = len(np.arange(min_d, dc_max, bandwidth)) + mde = np.zeros(nvals) + for iloc in range(0, num_d): + d_val = (min_d + (float(iloc) * bandwidth)) * np.ones(nvals) + d_1 = d_val - min_d + d_2 = d_val + min_d + p_1 = norm.cdf((d_1 - mu_d) / stddev) -\ + norm.cdf((-d_1 - mu_d) / stddev) + p_2 = norm.cdf((d_2 - mu_d) / stddev) -\ + norm.cdf((-d_2 - mu_d) / stddev) + mde += (p_2 - p_1) * d_val + inv_n = 1.0 / float(nvals) + mde_norm = np.sqrt(inv_n * np.sum(mde ** 2.)) + edr = np.sqrt(kappa * inv_n * np.sum(mde ** 2.)) + return float(mde_norm), float(np.sqrt(kappa)), float(edr) + + +def _get_edr_kappa(obs: Union[np.ndarray, pd.Series], + expected: Union[np.ndarray, pd.Series]) -> np.floating: + """ + Returns the correction factor kappa + """ + mu_a = np.mean(obs) + mu_y = np.mean(expected) + b_1 = np.sum((obs - mu_a) * (expected - mu_y)) /\ + np.sum((obs - mu_a) ** 2.) + b_0 = mu_y - b_1 * mu_a + y_c = expected - ((b_0 + b_1 * obs) - obs) + de_orig = np.sum((obs - expected) ** 2.) + de_corr = np.sum((obs - y_c) ** 2.) + return de_orig / de_corr + + +# FIXME REMOVE: +GSIM_MODEL_DATA_TESTS = { + "Residuals": lambda residuals, config: + residuals.get_residual_statistics(), + "Likelihood": lambda residuals, config: residuals.get_likelihood_values(), + "LLH": lambda residuals, config: residuals.get_loglikelihood_values( + config.get("LLH IMTs", [imt for imt in residuals.imts])), + "MultivariateLLH": lambda residuals, config: + residuals.get_multivariate_loglikelihood_values(), + "EDR": lambda residuals, config: residuals.get_edr_values( + config.get("bandwidth", 0.01), config.get("multiplier", 3.0)) +} + + +# FIXME: from here on multivariate_llh need to be fixed with GW + + +# Mak et al multivariate LLH functions +def get_residuals_multivariate_loglikelihood(gsim_names: list[str], imt_names: list[str], + flatfile: pd.DataFrame, contexts=None): + """ + Calculates the multivariate LLH for a set of GMPEs and IMTS according + to the approach described in Mak et al. (2017) + + Mak, S., Clements, R. A. and Schorlemmer, D. (2017) "Empirical + Evaluation of Hierarchical Ground-Motion Models: Score Uncertainty + and Model Weighting", Bulletin of the Seismological Society of America, + 107(2), 949-965 + + """ + raise NotImplementedError('this method is not supported and needs verification') + result = pd.DataFrame(index=flatfile.index) + for col, gsim, imt, label in residuals_columns(gsim_names, imt_names, flatfile): + result[col + "-multivariate-loglikelihood"] = _get_multivariate_ll( + contexts, gsim, imt) + return result + + +def _get_multivariate_ll(contexts, gmpe, imt): + """ + Returns the multivariate loglikelihood, as described om equation 7 of + Mak et al. (2017) + """ + observations, v_mat, expected_mat, neqs, nrecs = _build_matrices( + contexts, gmpe, imt) + sign, logdetv = np.linalg.slogdet(v_mat) + b_mat = observations - expected_mat + # `solve` below needs only finite values (see doc), but unfortunately raises + # in case. In order to silently skip non-finite values: + # 1. Check v_mat (square matrix), by simply returning nan if any cell value + # is nan (FIXME: handle better?): + if not np.isfinite(v_mat).all(): + return np.nan + # 2. Check b_mat (array) by removing finite values: + b_finite = np.isfinite(b_mat) + if not b_finite.all(): + if not b_finite.any(): + return np.nan + b_mat = b_mat[b_finite] + v_mat = v_mat[b_finite, :][:, b_finite] + # call `solve(...check_finite=False)` because the check is already done. + # The function shouldn't raise anymore: + return (float(nrecs) * np.log(2.0 * np.pi) + logdetv + + (b_mat.T.dot(solve(v_mat, b_mat, check_finite=False)))) / 2. + + +# The following methods are used for the MultivariateLLH function +def _build_matrices(contexts, gmpe, imtx): + """ + Constructs the R and Z_G matrices (based on the implementation + in the supplement to Mak et al (2017) + """ + neqs = len(contexts) + nrecs = sum(ctxt["Num. Sites"] for ctxt in contexts) + + r_mat = np.zeros(nrecs, dtype=float) + z_g_mat = np.zeros([nrecs, neqs], dtype=float) + expected_mat = np.zeros(nrecs, dtype=float) + # Get observations + observations = np.zeros(nrecs) + i = 0 + # Determine the total number of records and pass the log of the + # observations to the observations dictionary + for ctxt in contexts: + n_s = ctxt["Num. Sites"] + observations[i:(i + n_s)] = np.log(ctxt["Observations"][imtx]) + i += n_s + + i = 0 + for j, ctxt in enumerate(contexts): + if not("Intra event" in ctxt["Expected"][gmpe][imtx]) and\ + not("Inter event" in ctxt["Expected"][gmpe][imtx]): + # Only the total sigma exists + # Total sigma is used as intra-event sigma (from S. Mak) + n_r = len(ctxt["Expected"][gmpe][imtx]["Total"]) + r_mat[i:(i + n_r)] = ctxt["Expected"][gmpe][imtx]["Total"] + expected_mat[i:(i + n_r)] = ctxt["Expected"][gmpe][imtx]["Mean"] + # Inter-event sigma is set to 0 + i += n_r + continue + n_r = len(ctxt["Expected"][gmpe][imtx]["Intra event"]) + r_mat[i:(i + n_r)] = ctxt["Expected"][gmpe][imtx]["Intra event"] + # Get expected mean + expected_mat[i:(i + n_r)] = ctxt["Expected"][gmpe][imtx]["Mean"] + if len(ctxt["Expected"][gmpe][imtx]["Inter event"]) == 1: + # Single inter event residual + z_g_mat[i:(i + n_r), j] =\ + ctxt["Expected"][gmpe][imtx]["Inter event"][0] + else: + # inter-event residual given at a vector + z_g_mat[i:(i + n_r), j] =\ + ctxt["Expected"][gmpe][imtx]["Inter event"] + i += n_r + + v_mat = np.diag(r_mat ** 2.) + z_g_mat.dot(z_g_mat.T) + return observations, v_mat, expected_mat, neqs, nrecs + diff --git a/tests/smtk/residuals/data/residual_tests_esm_data_old_smtk_lh.json b/tests/smtk/residuals/data/residual_tests_esm_data_old_smtk_lh.json new file mode 100644 index 00000000..adcfc5e4 --- /dev/null +++ b/tests/smtk/residuals/data/residual_tests_esm_data_old_smtk_lh.json @@ -0,0 +1 @@ +{"AkkarEtAlRjb2014 PGA Intra-event": [0.9432758663061641, 0.901784597869652, 0.7198865170114233, 0.8409408796310929, 0.6469686737641258, 0.3755685032011885, 0.09320197013531961, 0.5994844310971033, 0.04001928227045637, 0.9915373218325031, 0.7358744908391497, 0.32856767900402495, 0.39375492258554057, 0.4192540780004672, 0.022393354492776685, 0.3465333424757585, 0.28379445141588133, 0.0051551711584706394, 0.16751204369679473, 1.779132025370167e-06, 0.0023802292747613363, 0.7150185451012525, 0.4195726584641716, 0.1634628258868196, 0.06170342827815034, 0.001255236722237818, 0.4994081303322332, 0.9351866422339886, 0.5005180557947706, 0.00040356795373652776, 0.05222624227550121, 1.3719994850669437e-05, 0.18972386138242248, 0.01186619932858457, 0.0017295138867793325, 0.36174240970651106, 0.04421074514457579, 0.0008462120913038662, 0.07476185472999042, 0.0008190659333798811, 0.13465259174398914], "AkkarEtAlRjb2014 PGA Total": [0.9272620133082103, 0.9377225239040163, 0.7326463077675612, 0.8843710406180743, 0.4330093534716114, 0.6992302058068377, 0.281658790578032, 0.9425651547143586, 0.029744668329502066, 0.6932246591743074, 0.9270893100367101, 0.6415549563985631, 0.2593675529282834, 0.7504007020601124, 0.017604832442079554, 0.22821909751066083, 0.044719197595025406, 0.000448597296645703, 0.47908806926340186, 0.0002466855260324641, 0.031437802036273776, 0.8601425204931507, 0.8346562726783862, 0.47193664949739866, 0.25726674284145323, 0.020609014068615816, 0.27912013245543177, 0.6720558346869798, 0.9263068254236199, 0.009695207170996922, 0.2315768484176629, 1.8636533498095353e-05, 0.10181783237813913, 0.007249589260197475, 0.0012709125073009364, 0.1976228316958416, 0.02468709249915202, 0.0006733989515657068, 0.04077238525550697, 0.0006542457450717798, 0.07235785688795537], "AkkarEtAlRjb2014 PGA Inter-event": [0.952428259176097, 0.43327859967581506, 0.02893363943194549, 0.31485430053945895], "AkkarEtAlRjb2014 SA(1.0) Intra-event": [0.9160598376164533, 0.4971576495008867, 0.2919038831197217, 0.22840927537710454, 0.3412073457592769, 0.8225633833910679, 0.5752089693095781, 0.9689263009353758, 0.21767572922743372, 0.18658448217084145, 0.8344590510157097, 0.6750191490041288, 0.10173437206290115, 0.18641835477653335, 0.37732583929466723, 0.2899489678174624, 0.5770302940460592, 0.2637695131588378, 0.3972396305666289, 1.4514307987711916e-05, 0.12743320247487488, 0.8968541501618451, 0.05969751942426227, 0.05447018291599748, 0.20328728867789514, 0.003656575178249799, 0.9971783230953928, 0.31641645610953373, 0.9735186797983651, 0.0014582195036469958, 0.7380826554956423, 0.00017017061115509602, 0.5458849643631433, 0.04583697082479443, 0.0006914674660498665, 0.15508140907872037, 0.10000066715339662, 0.017462834526808635, 0.08312495645559392, 0.0010654619187255854, 0.1134797985357281], "AkkarEtAlRjb2014 SA(1.0) Total": [0.3275067858567504, 0.1401864581120218, 0.07196563403120881, 0.053671434535368046, 0.22532358258493634, 0.559612136720945, 0.9242818951673825, 0.6721497567186597, 0.14555248209409277, 0.4517525779377186, 0.5685283571581163, 0.45202351016463005, 0.07114509447613271, 0.4514936422778888, 0.24896128815868013, 0.5992315805590713, 0.33146878517194167, 0.14561390791468287, 0.9783444904730114, 0.0023322456685758164, 0.5397663840138038, 0.5533426857450267, 0.3558303878321609, 0.33807491751210383, 0.692682666973091, 0.07057671649231556, 0.4790017188031864, 0.8717649497517468, 0.49894663306365694, 0.040630517092114515, 0.677621183044874, 7.631703035704263e-05, 0.21980087951394411, 0.015036361114498908, 0.00027410015805140464, 0.05308279415566863, 0.03341153066693736, 0.0057799507161852, 0.027582011518052507, 0.0004083819160703994, 0.038134924690509386], "AkkarEtAlRjb2014 SA(1.0) Inter-event": [0.07710287763840151, 0.43810246606202297, 0.3304233826773241, 0.1605775466455649], "ChiouYoungs2014 PGA Intra-event": [0.5284574493276828, 0.7370568721392079, 0.20636342234537441, 0.06811532523422781, 0.7583908676793628, 0.17917733782806844, 0.40101212284886, 0.6995289354140112, 0.08336041505147307, 0.951239129302205, 0.6727345520028106, 0.515081916180564, 0.7896812000640892, 0.7814400746776625, 0.0984703644236482, 0.8123274496544293, 0.0284810839666777, 0.0006654190534475246, 0.039753939083030376, 3.803918814637708e-05, 0.022359024855189147, 0.33167622959301946, 0.239412033218786, 0.03546361291432176, 0.08647192591511599, 0.00024837968018687206, 0.9474432969851513, 0.4720131634634286, 0.21360221968727533, 0.00028449696311161343, 0.3162564956180579, 7.329567120950564e-05, 0.37152916544697023, 0.00594486912483938, 0.005636890600152822, 0.26265922201859604, 0.1157499282373804, 6.063828836389007e-06, 0.1507047153380936, 1.1548193167398857e-05, 0.004856848860126828], "ChiouYoungs2014 PGA Total": [0.09817770383141278, 0.40409619603720526, 0.028461449886419965, 0.00766775446774981, 0.4013077945604038, 0.5673433121573618, 0.8874119021099403, 0.364789874933946, 0.03973058987679601, 0.6002322431614093, 0.3485384257601367, 0.9841394229879274, 0.4211687817106423, 0.7344842217655331, 0.04671795904332221, 0.43487067979181215, 0.00028045604251802914, 3.0742682560758183e-06, 0.17957374962382788, 0.0016948603440359866, 0.12334172276333444, 0.6918619009115023, 0.5646050784882162, 0.16683068079548657, 0.29624974569581175, 0.006094012037503593, 0.612471229622002, 0.8592619147037661, 0.5256574873454245, 0.0066825033775869125, 0.6713666726143914, 9.310220431624217e-05, 0.21960989062805403, 0.004447600542296315, 0.004236157171040311, 0.15433678212973345, 0.06886895924969372, 1.1379601348782131e-05, 0.089018904769126, 1.9662382260032807e-05, 0.003712435873383657], "ChiouYoungs2014 PGA Inter-event": [0.03463700082362675, 0.034637015420995954, 0.034637015420995954, 0.03463701362908278, 0.26667085145802527, 0.2666430211177653, 0.26667085145802527, 0.26667085145802527, 0.26666303009710335, 0.2666678865125174, 0.26667085145802527, 0.2666651701063568, 0.2666147714512864, 0.266670335250608, 0.2666530040747984, 0.2646871397089935, 0.0006807086608876523, 0.0007117721544299682, 0.35879750280884815, 0.36107843372386383, 0.3612152105613795, 0.3596868404524953, 0.36105542331063567, 0.35597532401341314, 0.3611452695010974, 0.360833464059791, 0.36031862004711757, 0.36004145665880294, 0.3612052799723121, 0.36088830306154074, 0.3612095313240301, 0.3573549506527166, 0.36038182712824873, 0.36117957695322644, 0.3608901092616399, 0.36100957191280103, 0.36040958129391876, 0.3611465626613014, 0.3608158595768489, 0.36119992819240576, 0.3612071128639598], "ChiouYoungs2014 SA(1.0) Intra-event": [0.9429020335824827, 0.46384310901718195, 0.31624997461257287, 0.17726144413883949, 0.2953642760544616, 0.8702290022605412, 0.7080031147913697, 0.7976588669008282, 0.2680970229890377, 0.14861901692034762, 0.6901904227978771, 0.7478729237551798, 0.09361259032593439, 0.17517164507837835, 0.4066016930261739, 0.26881994866857806, 0.9942012670089714, 0.6097162121864451, 0.24500739403036942, 1.9594723680183e-05, 0.27760392152415836, 0.658020164557445, 0.03343293550013404, 0.015246763055204338, 0.20504701905177936, 0.0014053365880938307, 0.8101040587583358, 0.18877160288653871, 0.9276138792599368, 0.0007725670647158456, 0.9652437818341886, 0.00022332925585277774, 0.6709429016753564, 0.033562016147477, 0.0007349785954865462, 0.12775282174964053, 0.13637433504548213, 0.0045336658449455225, 0.0991415827973181, 0.00017481577628475353, 0.033811747457316854], "ChiouYoungs2014 SA(1.0) Total": [0.2826418598226462, 0.08162413238299182, 0.04941595761383011, 0.02427128029800707, 0.15273476505195227, 0.4875240665626861, 0.8055950269687543, 0.44026934410337815, 0.1386642470446784, 0.5187816066868686, 0.3733463571526666, 0.4088336526868471, 0.05058495615480063, 0.5677518069141817, 0.21146786315556154, 0.7166215029097998, 0.8581516002621861, 0.5418227014824892, 0.7202447426955784, 0.002519400801520244, 0.7694105890868588, 0.7963062981678427, 0.23595301376964473, 0.14947389552448587, 0.6536441646582014, 0.03580676196631982, 0.666161149656373, 0.6250884261853165, 0.5759814739467881, 0.024843764465444895, 0.5000324422011444, 0.00014309079168961514, 0.3165916439903287, 0.013968148181202844, 0.0004156470057304906, 0.05211376761273989, 0.055686960198916946, 0.0021465590323813677, 0.04034548993158471, 0.00011687163464813022, 0.014068422490479793], "ChiouYoungs2014 SA(1.0) Inter-event": [0.042426287364842996, 0.04242629291176403, 0.04242629291176403, 0.0424262912106923, 0.31291713255285725, 0.3129145687696453, 0.31291713255285725, 0.31291713255285725, 0.31291606953611506, 0.31291672510060375, 0.31291713255285725, 0.31291587105384955, 0.3129129554617117, 0.3129170334667667, 0.31291557583486873, 0.3126611045343962, 0.738299043243531, 0.7383299792158522, 0.2152782828622124, 0.21648105044476784, 0.21662676192404606, 0.21626279374036816, 0.21656530246803884, 0.2151133625924193, 0.21658836182841734, 0.2164249548721684, 0.2163861667830218, 0.21616282817841215, 0.2166251082870333, 0.2163959147780602, 0.21662337095164774, 0.21484625151692305, 0.21626002349730955, 0.2166126427068753, 0.2165599173179623, 0.21650044600502594, 0.21644527491698395, 0.2165675428784114, 0.21647320217946653, 0.21661788958322115, 0.21662127546622512]} \ No newline at end of file diff --git a/tests/smtk/residuals/test_residuals.py b/tests/smtk/residuals/test_residuals.py index 9de39e11..564bfda0 100644 --- a/tests/smtk/residuals/test_residuals.py +++ b/tests/smtk/residuals/test_residuals.py @@ -87,9 +87,14 @@ def test_residuals_execution(self): """ Tests basic execution of residuals - not correctness of values """ - # residuals = res.Residuals(self.gsims, self.imts) - # residuals.get_residuals(self.database, component="Geometric") - res_dict = residuals.get_residuals(self.gsims, self.imts, self.database) + # when comparing new and old data, try np.allclose and if it fails, retry + # with relaxed conditions: + RTOL = 0.016 # relative tolerance defining "closeness": abs diff element-wise + QTL = 0.95 # quantile defining how much data must be "close enough" (