From 239bfd399fa2dd1ed2818dd31b3902c10e91d72a Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Mon, 4 Mar 2024 19:46:45 +0100 Subject: [PATCH 01/48] minimize converter --- skfda/preprocessing/conversion/__init__.py | 0 .../conversion/_mixed_effects.py | 605 ++++++++++++++++++ skfda/preprocessing/conversion/_to_basis.py | 34 + skfda/tests/test_mixed_effects_converter.py | 282 ++++++++ 4 files changed, 921 insertions(+) create mode 100644 skfda/preprocessing/conversion/__init__.py create mode 100644 skfda/preprocessing/conversion/_mixed_effects.py create mode 100644 skfda/preprocessing/conversion/_to_basis.py create mode 100644 skfda/tests/test_mixed_effects_converter.py diff --git a/skfda/preprocessing/conversion/__init__.py b/skfda/preprocessing/conversion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/skfda/preprocessing/conversion/_mixed_effects.py b/skfda/preprocessing/conversion/_mixed_effects.py new file mode 100644 index 000000000..175def5ad --- /dev/null +++ b/skfda/preprocessing/conversion/_mixed_effects.py @@ -0,0 +1,605 @@ +# -*- coding: utf-8 -*- +"""Mixed effects converters. + +This module contains the class for converting irregular data to basis +representation using the mixed effects model. + +#TODO: Add references ? 
(laird & ware) + +""" +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import ( + Any, + Callable, + Optional, + List, +) + +import numpy as np +import scipy +from typing_extensions import Final, Self + +from ...representation import FDataBasis, FDataIrregular +from ...representation.basis import Basis +from ...typing._numpy import NDArrayFloat +from ._to_basis import _ToBasisConverter + + +_SCIPY_MINIMIZATION_METHODS = [ + "BFGS", # no hessian + "Powell", # no jacobian + "L-BFGS-B", + "trust-constr", + "Nelder-Mead", # no jacobian + "COBYLA", # no jacobian + "SLSQP", + "CG", # no hessian + "trust-ncg", + "trust-exact", + "trust-krylov", + "TNC", + "dogleg", + "Newton-CG", # requires jacobian +] + +_EM_MINIMIZATION_METHODS = [ + "params", + "square-error", + "square-error-big", + "prop-offset", + "loglikelihood" +] + + +def _get_values_list( + fdatairregular: FDataIrregular, +) -> List[NDArrayFloat]: + assert fdatairregular.dim_domain == 1 + assert fdatairregular.dim_codomain == 1 + return np.split( + fdatairregular.values.reshape(-1), + fdatairregular.start_indices[1:], + ) + + +def _get_basis_evaluations_list( + fdatairregular: FDataIrregular, + basis: Basis, +) -> List[NDArrayFloat]: + assert fdatairregular.dim_domain == 1 + assert fdatairregular.dim_codomain == 1 + return np.split( + basis(fdatairregular.points)[:, :, 0].T, + fdatairregular.start_indices[1:], + ) + + +def _minimize( + fun: Callable[[NDArrayFloat], float], + x0: NDArrayFloat, + minimization_methods: str | List[str] | None = None, +) -> scipy.optimize.OptimizeResult: + """Minimize a scalar function of one or more variables.""" + if isinstance(minimization_methods, str): + minimization_methods = [minimization_methods] + + if minimization_methods is None: + minimization_methods = _SCIPY_MINIMIZATION_METHODS + else: + for method in minimization_methods: + if method not in _SCIPY_MINIMIZATION_METHODS: + raise 
ValueError(f"Invalid method: \"{method}\".") + + for method in minimization_methods: + result = scipy.optimize.minimize( + fun=fun, + x0=x0, + method=method, + options={ + # "disp": True, + # "maxiter": 1000, + }, + ) + if result.success is True: + # print( + # f"[MEEstimator info]: Minimization method {method} succeeded.", + # ) + return result + # else: + # print(f"[MEEstimator info]: Minimization method {method} failed.") + return result # even if it failed + + +def _linalg_solve( + a: NDArrayFloat, b: NDArrayFloat, *, assume_a: str = 'gen' +) -> NDArrayFloat: + """Solve a linear system of equations: a @ x = b""" + try: + return scipy.linalg.solve(a=a, b=b, assume_a=assume_a) # type: ignore + except scipy.linalg.LinAlgError: + # TODO: is the best way to handle this ? + # print("Warning: scipy.linalg.solve failed, using scipy.linalg.lstsq") + return scipy.linalg.lstsq(a=a, b=b)[0] # type: ignore + + +def sum_mahalanobis( + r_list: List[NDArrayFloat], + cov_mat_list: List[NDArrayFloat], + r_list2: Optional[List[NDArrayFloat]] = None, +) -> NDArrayFloat: + """sum_k ( r_list[k]^T @ cov_mat_list[k]^{-1} @ r_list2[k] ) + + Arguments: + r_list: List of residuals (could be matrices). + cov_mat_list: List of covariance matrices. + r_list2: List of residuals (right side) -- if None, r_list is used. 
+ + Returns: + sum_k ( r_list[k]^T @ cov_mat_list[k]^{-1} @ r_list2[k] ) + """ + if r_list2 is None: + r_list2 = r_list + return sum( + r1.T @ _linalg_solve(cov_mat, r2, assume_a="pos") + for r1, cov_mat, r2 in zip(r_list, cov_mat_list, r_list2) + ) # type: ignore + + +class _MixedEffectsCovParams(ABC): + """Covariance params of the mixed effects model for irregular data.""" + + @abstractmethod + def covariance(self) -> NDArrayFloat: + """Covariance of the mixed effects.""" + pass + + @abstractmethod + def covariance_div_sigmasq(self) -> NDArrayFloat: + """Covariance of the mixed effects.""" + pass + + @abstractmethod + def sigmasq(self) -> float: + """Variance of the residuals.""" + pass + + +class _MixedEffectsParams(_MixedEffectsCovParams): + """Params of the mixed effects model for irregular data.""" + + @abstractmethod + def mean(self) -> NDArrayFloat: + """Fixed effects.""" + pass + + +@dataclass +class _MixedEffectsParamsResult(_MixedEffectsParams): + """Basic mixed effects params implementation.""" + _mean: NDArrayFloat + _covariance: NDArrayFloat + _sigmasq: float + + def covariance(self) -> NDArrayFloat: + return self._covariance + + def mean(self) -> NDArrayFloat: + return self._mean + + def sigmasq(self) -> float: + return self._sigmasq + + def covariance_div_sigmasq(self) -> NDArrayFloat: + return self._covariance / self._sigmasq + + +class _MinimizeMixedEffectsParams(_MixedEffectsParams): + """Default class to represent the mixed effects parameters. + + Used to implement the optimization of loglikelihood as suggested in + Mary J. Lindstrom & Douglas M. Bates (1988). + + Args: + _L: (_L @ _L.T) is the Cholesky decomposition of covariance/sigmasq. + _has_mean: Whether the mean is fixed or estimated with ML estimator. + _mean: Fixed effects (will be none iff _has_mean=False). + _model: Mixed effects model to use for the estimation of the mean in + case _has_mean=False (will be None otherwise). 
+ """ + + _L: NDArrayFloat + _mean: Optional[NDArrayFloat] + _has_mean: bool + _model: Optional[_MixedEffectsModel] + + def __init__( + self, + L: NDArrayFloat, + mean: Optional[NDArrayFloat], + has_mean: bool = True, + model: Optional[_MixedEffectsModel] = None, + ) -> None: + self._L = L + self._mean = mean + self._has_mean = has_mean + self._model = model + if has_mean: + assert mean is not None + else: + assert mean is None + assert model is not None + + def mean(self) -> NDArrayFloat: + if self._has_mean: + assert self._mean is not None # TODO: remove + return self._mean + assert self._model is not None, "model is required" + values_covariances = self._model._values_covariances( + self, div_sigmasq=True, + ) + return _linalg_solve( + a=sum_mahalanobis( + self._model.basis_evaluations, + values_covariances, + self._model.basis_evaluations, + ), + b=sum_mahalanobis( + self._model.basis_evaluations, + values_covariances, + self._model.values, + ), + assume_a="pos", + ) + + def covariance_div_sigmasq(self) -> NDArrayFloat: + return self._L @ self._L.T + + def covariance(self) -> NDArrayFloat: + return self.covariance_div_sigmasq() * self.sigmasq() + + def sigmasq(self) -> float: + assert self._model is not None, "Model is required" + return sum_mahalanobis( + self._model._partial_residuals_list(self), + self._model._values_covariances(self, div_sigmasq=True), + ) / self._model._n_measurements # type: ignore + + @classmethod + def from_vec( + cls, + vec: NDArrayFloat, + dim_effects: int, + model: Optional[_MixedEffectsModel] = None, + has_mean: bool = True, + ) -> Self: + """Create Params from vectorized parameters.""" + mean = vec[:dim_effects] if has_mean else None + L_vec_len = dim_effects * (dim_effects + 1) // 2 + L = np.zeros((dim_effects, dim_effects)) + L[np.tril_indices(dim_effects)] = vec[-L_vec_len:] + return cls(mean=mean, L=L, model=model, has_mean=has_mean) + + def to_vec(self) -> NDArrayFloat: + """Vectorize parameters.""" + return np.concatenate([ + 
self._mean if self._has_mean else np.array([]), + self._L[np.tril_indices(self._L.shape[0])] + ]) + + @classmethod + def initial_params( + cls, + dim_effects: int, + has_mean: bool, + model: _MixedEffectsModel, + ) -> Self: + """Generic initial parameters .""" + return cls( + mean=np.zeros(dim_effects) if has_mean else None, + L=np.eye(dim_effects), + has_mean=has_mean, + model=model, + ) + + +class _EMMixedEffectsParams(_MixedEffectsCovParams): + """Mixed effects parameters for the EM algorithm.""" + _sigmasq: float + _covariance: NDArrayFloat + # _model: _MixedEffectsModel + + def __init__( + self, + sigmasq: float, + covariance: NDArrayFloat, + # model: _MixedEffectsModel, + ) -> None: + self._sigmasq = sigmasq + self._covariance = covariance + # self._model = model + + def covariance(self) -> NDArrayFloat: + """Covariance of the mixed effects.""" + return self._covariance + + def covariance_div_sigmasq(self) -> NDArrayFloat: + """Covariance of the mixed effects.""" + return self._covariance / self._sigmasq + + def sigmasq(self) -> float: + """Variance of the residuals.""" + return self._sigmasq + + def mean(self) -> NDArrayFloat: + raise NotImplementedError() + + def to_vec(self) -> NDArrayFloat: + return np.concatenate([ + np.array([self._sigmasq]), + self._covariance[np.tril_indices(self._covariance.shape[0])], + ]) + + def len_vec(self) -> int: + dim_effects = self._covariance.shape[0] + return 1 + dim_effects * (dim_effects + 1) // 2 + + +class _MixedEffectsModel: + """Mixed effects model. + + Class representing the mixed effects model for irregular data. + + Model: + + + values[k] = basis_evaluations[k] @ (mean + mixed_effects[k]) + error[k] + + Args: + values: List of the values of each curve. + basis_evaluations: List of the basis evaluations corresponding to the + points where the curves are evaluated. 
+ """ + + values: List[NDArrayFloat] + basis_evaluations: List[NDArrayFloat] + _n_measurements: int + _profile_loglikelihood_additive_constants: float + + def __init__( + self, + fdatairregular: FDataIrregular, + basis: Basis, + ) -> None: + self.values = _get_values_list(fdatairregular) + self.basis_evaluations = _get_basis_evaluations_list( + fdatairregular, basis, + ) + self._n_measurements = len(fdatairregular.points) + self._profile_loglikelihood_additive_constants = ( + + self._n_measurements / 2 * np.log(self._n_measurements) + - self._n_measurements / 2 * np.log(2 * np.pi) + - self._n_measurements / 2 + ) + + def _dim_effects(self) -> int: + """Dimension of the mixed and of the fixed effects.""" + return self.basis_evaluations[0].shape[1] + + def _partial_residuals_list( + self, + params: _MixedEffectsParams, + ) -> List[NDArrayFloat]: + """Residuals of the mixed effects model. + + r[k] = value[k] - basis_evaluations[k] @ mean + """ + mean = params.mean() + return [ + value - basis_evaluation @ mean + for value, basis_evaluation in zip( + self.values, self.basis_evaluations, + ) + ] + + def _values_covariances( + self, + params: _MixedEffectsParams, + div_sigmasq: bool, + ) -> List[NDArrayFloat]: + """Covariance of the values. + + values_covariances[k] = ( + sigmasq * I + + basis_evaluations[k] @ covariance @ basis_evaluations[k].T + ) + + If div_sigmasq is True, then the results will be divided by sigmasq. + div_sigmasq = True for the model from Lindstrom & Bates (1988). 
+ """ + if div_sigmasq: + cov_div_sigmasq = params.covariance_div_sigmasq() + return [ + np.eye(basis_evaluation.shape[0]) + + basis_evaluation @ cov_div_sigmasq @ basis_evaluation.T + for basis_evaluation in self.basis_evaluations + ] + + sigmasq = params.sigmasq() + params_covariance = params.covariance() + + return [ + sigmasq * np.eye(basis_evaluation.shape[0]) + + basis_evaluation @ params_covariance @ basis_evaluation.T + for basis_evaluation in self.basis_evaluations + ] + + def _mixed_effects_estimate( + self, + params: _MixedEffectsParams, + ) -> NDArrayFloat: + """Estimates of the mixed effects (generalized least squares) + + mixed_effects_estimate[k] = ( + covariance @ basis_evaluations[k].T + @ values_covariances[k]^{-1} @ partial_residuals[k] + ) + """ + covariance = params.covariance() + partial_residuals_list = self._partial_residuals_list(params) + values_cov_list = self._values_covariances(params, div_sigmasq=False) + + return np.array([ + covariance @ basis_eval.T @ _linalg_solve( + value_cov, r, assume_a="pos", + ) + for basis_eval, value_cov, r in zip( + self.basis_evaluations, + values_cov_list, + partial_residuals_list, + ) + ]) + + def profile_loglikelihood( + self, + params: _MinimizeMixedEffectsParams | NDArrayFloat, + has_mean: bool = True, + ) -> float: + """Profile loglikelihood.""" + if isinstance(params, np.ndarray): + params = _MinimizeMixedEffectsParams.from_vec( + params, self._dim_effects(), model=self, has_mean=has_mean, + ) + + r_list = self._partial_residuals_list(params) + V_list = self._values_covariances(params, div_sigmasq=True) + + # slogdet_V_list = [np.linalg.slogdet(V) for V in V_list] + # if any(slogdet_V[0] <= 0 for slogdet_V in slogdet_V_list): + # return -np.inf + # TODO remove check sign? 
+ + # sum_logdet_V: float = sum( + # slogdet_V[1] for slogdet_V in slogdet_V_list + # ) + sum_logdet_V: float = sum(np.linalg.slogdet(V)[1] for V in V_list) + sum_mahalanobis_ = sum_mahalanobis(r_list, V_list) + log_sum_mahalanobis: float = np.log(sum_mahalanobis_) # type: ignore + + return ( + - sum_logdet_V / 2 + - self._n_measurements / 2 * log_sum_mahalanobis + + self._profile_loglikelihood_additive_constants + ) + + +class MixedEffectsConverter(_ToBasisConverter[FDataIrregular]): + """Mixed effects to-basis-converter.""" + + basis: Basis + + # after fitting: + fitted_model: Optional[_MixedEffectsModel] + fitted_params: Optional[_MixedEffectsParams] + result: Any + + def __init__( + self, + basis: Basis, + ) -> None: + self.basis = basis + self.fitted_model = None + self.fitted_params = None + + def transform( + self, + X: FDataIrregular, + ) -> FDataBasis: + if self.fitted_params is None: # or self.model is None: + raise ValueError("The converter has not been fitted.") + + X_model = _MixedEffectsModel(X, self.basis) + mean = self.fitted_params.mean() + gamma_estimates = X_model._mixed_effects_estimate(self.fitted_params) + + coefficients = mean[np.newaxis, :] + gamma_estimates + + return FDataBasis( + basis=self.basis, + coefficients=coefficients, + ) + + +class MinimizeMixedEffectsConverter(MixedEffectsConverter): + """Mixed effects to-basis-converter using scipy.optimize. + + Minimizes the profile loglikelihood of the mixed effects model as proposed + by Lindstrom & Bates (1988). + """ + + def fit( + self, + X: FDataIrregular, + y: object = None, + *, + initial_params: Optional[ + _MinimizeMixedEffectsParams | NDArrayFloat + ] = None, + minimization_method: Optional[str] = None, + has_mean: bool = True, + ) -> Self: + """Fit the model. + + Args: + X: irregular data to fit. + y: ignored. + initial_params: initial params of the model. + minimization_methods: scipy.optimize.minimize method to be used for + the minimization of the loglikelihood of the model. 
+ + Returns: + self after fit + """ + dim_effects = self.basis.n_basis + if isinstance(initial_params, _MinimizeMixedEffectsParams): + # assert has_beta == initial_params.has_beta + initial_params_vec = initial_params.to_vec() + elif initial_params is not None: + initial_params_vec = initial_params + else: + initial_params_vec = _MinimizeMixedEffectsParams.initial_params( + dim_effects=dim_effects, has_mean=has_mean, model=self, + ).to_vec() + + if minimization_method is None: + minimization_method = _SCIPY_MINIMIZATION_METHODS[0] + + model = _MixedEffectsModel(X, self.basis) + n_samples = X.n_samples + + def objective_function(params: NDArrayFloat) -> float: + return - model.profile_loglikelihood( + params, has_mean=has_mean, + ) / n_samples + + self.result = _minimize( + fun=objective_function, + x0=initial_params_vec, + minimization_methods=minimization_method, + ) + self.fitted_model = model + params = _MinimizeMixedEffectsParams.from_vec( + self.result.x, + dim_effects=dim_effects, + model=model, + has_mean=has_mean, + ) + self.fitted_params = _MixedEffectsParamsResult( + _mean=params.mean(), + _covariance=params.covariance(), + _sigmasq=params.sigmasq(), + ) + + return self diff --git a/skfda/preprocessing/conversion/_to_basis.py b/skfda/preprocessing/conversion/_to_basis.py new file mode 100644 index 000000000..a6f3c1426 --- /dev/null +++ b/skfda/preprocessing/conversion/_to_basis.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +"""To basis converter. + +This module contains the abstract base class for all FData to FDatabasis +converters. + +""" +from __future__ import annotations + +from typing import ( + Generic, + TypeVar, +) + +from ..._utils._sklearn_adapter import BaseEstimator, TransformerMixin +from ...representation import FData, FDataBasis + +Input = TypeVar( + "Input", + bound=FData, + contravariant=True, +) + + +class _ToBasisConverter( + BaseEstimator, + Generic[Input], + TransformerMixin[Input, FDataBasis, object], +): + """To basis converter. 
+ + Abstract base class for all FData to FDataBasis converters. The subclasses + must override ``fit`` and ``transform`` to define the conversion. + """ diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py new file mode 100644 index 000000000..497532ec0 --- /dev/null +++ b/skfda/tests/test_mixed_effects_converter.py @@ -0,0 +1,282 @@ +"""Tests for the mixed effects to-basis-converter.""" +import pytest +import numpy as np +import numpy.typing as npt +from typing import ( + Callable, + Iterable, + Literal, + Tuple, +) +import matplotlib.pyplot as plt + +from skfda import FDataBasis +from skfda.representation import ( + FDataBasis, + FDataIrregular, +) +from skfda.typing._numpy import (NDArrayFloat, NDArrayInt) +from skfda.representation.basis import ( + BSplineBasis, + FourierBasis, +) +from skfda.preprocessing.conversion._mixed_effects import ( + MinimizeMixedEffectsConverter, + _get_values_list, + _get_basis_evaluations_list, + _MixedEffectsModel, +) + +_fdatairregular = FDataIrregular( + start_indices=[0, 1, 5], + values=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), + points=list(range(9)), +) + + +def test_loglikelihood() -> None: + n_measurements = 200 + n_measurements_per_function = 5 + fdatairregular = FDataIrregular( + start_indices=list( + range(0, n_measurements, n_measurements_per_function) + ), + values=list(range(n_measurements)), + points=list(range(n_measurements)), + ) + + basis = FourierBasis(n_basis=5, domain_range=(0, 10)) + model = _MixedEffectsModel(fdatairregular, basis) + + params_loglike_list = [ + (np.array([ + 217.36197672, 111.34775404, 169.8070363, 337.91045293, + 1.88754248, 48.62764831, 268.29963389, 330.34110204, + 54.68263587, 230.03733177, 356.52878172, 83.68084885, + 74.13128782, 43.35075619, 87.87899705, 391.44951388, + 324.67325964, 68.77640509, 326.48989949, 109.62949882, + ]), -1412.9937447885836), + (np.array([ + 172.68167347, 376.01192785, 327.05975151, 134.44478005, + 70.1641815, 
149.13281852, 2.27540294, 100.97054138, + 318.26500339, 6.1019885, 239.53735077, 241.52181562, + 42.05907416, 152.77737798, 14.59042264, 356.16462538, + 392.3683428, 23.97679553, 356.21837789, 230.76059976, + ]), -1333.6585307493442), + (np.array([ + 296.99187564, 252.07357459, 232.73687696, 8.17565281, + 84.01063107, 217.87395127, 307.64606844, 100.27809166, + 114.35827616, 340.95803514, 390.00259744, 353.9413174, + 143.80313757, 239.54357835, 141.91824466, 136.07608615, + 71.2323958, 95.07768345, 17.94491298, 202.17257185, + ]), -1270.1651275382442), + (np.array([ + 150.50098172, 237.12216039, 251.97675023, 57.04012578, + 373.53651979, 378.55195232, 240.91866309, 155.10651213, + 145.27520164, 81.73811075, 110.70602456, 98.61435248, + 69.4432007, 386.64387779, 382.80504014, 239.18947373, + 292.52030122, 136.15408913, 36.82224135, 185.39920757, + ]), -1218.0955679886356), + (np.array([ + 203.4795573, 35.3840692, 211.21408933, 396.8632146, + 158.0143727, 134.23857669, 322.18021493, 301.73959783, + 125.22657664, 253.61467318, 216.16183012, 118.71750035, + 44.31516047, 125.05611915, 182.79165202, 263.57602809, + 101.70300713, 256.44050348, 80.04944289, 263.04992221 + ]), -1231.9562787796967), + (np.array([ + 311.31568618, 311.83935944, 244.13126128, 123.60013941, + 279.09396301, 343.84731829, 250.1295031, 392.96313184, + 390.60005081, 66.67765248, 9.27125459, 64.2978194, + 369.3987301, 381.41993995, 84.39136749, 144.21010033, + 219.75010465, 108.73233967, 184.24064843, 278.46462593 + ]), -1437.3441872940807), + ] + + for params, mixedlm_loglikelihood in params_loglike_list: + model_loglikelihood = model.profile_loglikelihood( + params, + ) + + assert np.allclose(mixedlm_loglikelihood, model_loglikelihood) + + +def test_values_list() -> None: + fdatairregular = _fdatairregular + x_list = _get_values_list(fdatairregular) + expected_x_list = [ + np.array([1]), + np.array([2, 3, 4, 5]), + np.array([6, 7, 8, 9]), + ] + for x, expected_x in zip(x_list, expected_x_list): + 
assert np.all(x == expected_x) + + +def test_basis_evaluations_list() -> None: + fdatairregular = _fdatairregular + basis = FourierBasis(n_basis=3, domain_range=(0, 10)) + phi_list = _get_basis_evaluations_list(fdatairregular, basis) + + def eval_basis(x: float) -> npt.NDArray[np.float_]: + return basis(x).reshape(-1) + + expected_phi = [ + np.array([eval_basis(0)]), + np.array([eval_basis(j) for j in [1, 2, 3, 4]]), + np.array([eval_basis(j) for j in [5, 6, 7, 8]]), + ] + + for phi, expected_phi in zip(phi_list, expected_phi): + np.testing.assert_allclose(phi, expected_phi) + + +def _create_irregular_samples( + funcs: Iterable[ + Callable[[npt.NDArray[np.float_]], npt.NDArray[np.float_]] + ], + points: npt.NDArray[np.float_], + noise_generate_std: float, + *, + start_indices: NDArrayInt | None = None, + n_points: int | None = None, +) -> FDataIrregular: + """Generate samples of functions at points with gaussian noise. + + Args: + funcs: Functions to sample. + points: Points where to sample. + noise_generate_std: Standard deviation of the gaussian noise. + start_indices: Start indices of each sample. + n_points: Number of points of each sample. If not None, start_indices + is ignored. 
+ """ + if n_points is not None: + start_indices = np.arange(0, len(points), n_points) + elif start_indices is None: + raise ValueError("Either n_points or start_indices must be provided") + fun_points = np.split(points, start_indices[1:]) + fun_values = np.concatenate([ + func(point) for func, point in zip(funcs, fun_points) + ]).reshape((-1, 1)) + noise_values = np.random.normal( + 0, noise_generate_std, len(fun_values), + ).reshape((-1, 1)) + return FDataIrregular( + start_indices=start_indices, + points=points, + values=fun_values + noise_values, + ) + + +def _get_points( + domain_range: Tuple[float, float], + n_points: int, + n_samples: int, + type_gen_points: int | Literal["equally_spaced", "random_uniform"], + n_points_per_sample_range: tuple[int, int] = (1, 6), +) -> npt.NDArray[np.float_]: + if type_gen_points == "equally_spaced": + ret_value = np.tile( + np.linspace(*domain_range, n_points).reshape((-1, 1)), + (n_samples, 1), + ) + elif type_gen_points == "random_uniform": + ret_value = np.random.uniform( + *domain_range, size=n_points * n_samples, + ).reshape((-1, 1)) + elif isinstance(type_gen_points, int): + n = type_gen_points + tot_n_points = n_points * n_samples + domain_split = np.linspace(*domain_range, n + 1) + domains = list(zip(domain_split[:-1], domain_split[1:])) + points_list = [ + np.random.uniform( + domain[0] - 0.6 * (domain[1] - domain[0]), + domain[1] + 0.6 * (domain[1] - domain[0]), + size=tot_n_points // n) + for domain in domains + ] + ret_value = np.concatenate(points_list).reshape((-1, 1))[:tot_n_points] + + return ( + ret_value + * (ret_value >= domain_range[0]) + * (ret_value <= domain_range[1]) + + domain_range[0] * (ret_value < domain_range[0]) + + domain_range[1] * (ret_value > domain_range[1]) + ) + + +def test_simple_conversion() -> None: + """Visual test.""" + _max_val = 10 + _domain_range = (0, 10) + n_points = 6 + n_basis = 5 + n_samples = 50 + points = _get_points(_domain_range, n_points, n_samples, 9) + + basis = 
FourierBasis(n_basis=n_basis, domain_range=_domain_range) + # BSplineBasis( + # n_basis=n_basis, domain_range=_domain_range, order=n_basis - 1, + # ) + + sigma = 0.3 + Gamma_sqrt = np.zeros((n_basis, n_basis)) + Gamma_sqrt[np.tril_indices(n_basis)] = np.random.rand( + n_basis * (n_basis + 1) // 2, + ) * _max_val + Gamma = Gamma_sqrt @ Gamma_sqrt.T + beta = np.random.rand(n_basis) * _max_val + fdatabasis_original = FDataBasis( + basis=basis, + coefficients=np.random.multivariate_normal( + mean=beta, cov=Gamma, size=n_samples, + ), + ) + + def fun(i: int) -> Callable[[NDArrayFloat], NDArrayFloat]: + def fi(x: NDArrayFloat) -> NDArrayFloat: + return fdatabasis_original[i](x).reshape(x.shape) + return fi + + funcs = [fun(i) for i in range(n_samples)] + + fdatairregular = _create_irregular_samples( + funcs=funcs, + n_points=n_points, + points=points, + noise_generate_std=sigma, + ) + converter = MinimizeMixedEffectsConverter(basis) + fdatabasis_estimated = converter.fit_transform(fdatairregular) + fdatabasis_basic = fdatairregular.to_basis(basis) + if True: + _ = plt.figure(figsize=(15, 6)) + + axes = plt.subplot(2, 2, 1) + plt.title("Original data") + fdatairregular[:5].plot(axes=axes) + left, right = plt.ylim() + plt.ylim((min(0, left), max(1.4, right))) + + axes = plt.subplot(2, 2, 2) + plt.title("Estimated basis representation.\n") + fdatairregular.scatter(axes=axes) + fdatabasis_estimated[:5].plot(axes=axes) + left, right = plt.ylim() + plt.ylim((min(0, left), max(1.4, right))) + + axes = plt.subplot(2, 2, 4) + plt.title("Original basis representation") + fdatairregular.scatter(axes=axes) + fdatabasis_original[:5].plot(axes=axes) + left, right = plt.ylim() + plt.ylim((min(0, left), max(1.4, right))) + + axes = plt.subplot(2, 2, 3) + plt.title(f"{basis}") + basis.plot(axes=axes) + + plt.show() From 4b917ec6ff8385d443cc4283848a658c137b3faf Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Fri, 8 Mar 2024 14:20:12 +0100 Subject: [PATCH 02/48] Change 
'conversion' module to 'representation' and reorganize internal params classes --- .../conversion/__init__.py | 0 .../conversion/_mixed_effects.py | 446 +++++++++--------- .../conversion/_to_basis.py | 7 +- skfda/tests/test_mixed_effects_converter.py | 153 +++--- 4 files changed, 301 insertions(+), 305 deletions(-) rename skfda/{preprocessing => representation}/conversion/__init__.py (100%) rename skfda/{preprocessing => representation}/conversion/_mixed_effects.py (58%) rename skfda/{preprocessing => representation}/conversion/_to_basis.py (81%) diff --git a/skfda/preprocessing/conversion/__init__.py b/skfda/representation/conversion/__init__.py similarity index 100% rename from skfda/preprocessing/conversion/__init__.py rename to skfda/representation/conversion/__init__.py diff --git a/skfda/preprocessing/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py similarity index 58% rename from skfda/preprocessing/conversion/_mixed_effects.py rename to skfda/representation/conversion/_mixed_effects.py index 175def5ad..a11679d48 100644 --- a/skfda/preprocessing/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -9,13 +9,13 @@ """ from __future__ import annotations -from abc import ABC, abstractmethod from dataclasses import dataclass from typing import ( Any, Callable, - Optional, List, + Optional, + Protocol, ) import numpy as np @@ -125,7 +125,7 @@ def _linalg_solve( return scipy.linalg.lstsq(a=a, b=b)[0] # type: ignore -def sum_mahalanobis( +def _sum_mahalanobis( r_list: List[NDArrayFloat], cov_mat_list: List[NDArrayFloat], r_list2: Optional[List[NDArrayFloat]] = None, @@ -148,203 +148,36 @@ def sum_mahalanobis( ) # type: ignore -class _MixedEffectsCovParams(ABC): - """Covariance params of the mixed effects model for irregular data.""" - - @abstractmethod - def covariance(self) -> NDArrayFloat: - """Covariance of the mixed effects.""" - pass - - @abstractmethod - def covariance_div_sigmasq(self) -> 
NDArrayFloat: - """Covariance of the mixed effects.""" - pass - - @abstractmethod - def sigmasq(self) -> float: - """Variance of the residuals.""" - pass - - -class _MixedEffectsParams(_MixedEffectsCovParams): +class _MixedEffectsParams(Protocol): """Params of the mixed effects model for irregular data.""" - @abstractmethod + @property def mean(self) -> NDArrayFloat: """Fixed effects.""" - pass - - -@dataclass -class _MixedEffectsParamsResult(_MixedEffectsParams): - """Basic mixed effects params implementation.""" - _mean: NDArrayFloat - _covariance: NDArrayFloat - _sigmasq: float - - def covariance(self) -> NDArrayFloat: - return self._covariance - - def mean(self) -> NDArrayFloat: - return self._mean - - def sigmasq(self) -> float: - return self._sigmasq - - def covariance_div_sigmasq(self) -> NDArrayFloat: - return self._covariance / self._sigmasq - - -class _MinimizeMixedEffectsParams(_MixedEffectsParams): - """Default class to represent the mixed effects parameters. - - Used to implement the optimization of loglikelihood as suggested in - Mary J. Lindstrom & Douglas M. Bates (1988). - - Args: - _L: (_L @ _L.T) is the Cholesky decomposition of covariance/sigmasq. - _has_mean: Whether the mean is fixed or estimated with ML estimator. - _mean: Fixed effects (will be none iff _has_mean=False). - _model: Mixed effects model to use for the estimation of the mean in - case _has_mean=False (will be None otherwise). 
- """ - - _L: NDArrayFloat - _mean: Optional[NDArrayFloat] - _has_mean: bool - _model: Optional[_MixedEffectsModel] - - def __init__( - self, - L: NDArrayFloat, - mean: Optional[NDArrayFloat], - has_mean: bool = True, - model: Optional[_MixedEffectsModel] = None, - ) -> None: - self._L = L - self._mean = mean - self._has_mean = has_mean - self._model = model - if has_mean: - assert mean is not None - else: - assert mean is None - assert model is not None - - def mean(self) -> NDArrayFloat: - if self._has_mean: - assert self._mean is not None # TODO: remove - return self._mean - assert self._model is not None, "model is required" - values_covariances = self._model._values_covariances( - self, div_sigmasq=True, - ) - return _linalg_solve( - a=sum_mahalanobis( - self._model.basis_evaluations, - values_covariances, - self._model.basis_evaluations, - ), - b=sum_mahalanobis( - self._model.basis_evaluations, - values_covariances, - self._model.values, - ), - assume_a="pos", - ) - - def covariance_div_sigmasq(self) -> NDArrayFloat: - return self._L @ self._L.T - - def covariance(self) -> NDArrayFloat: - return self.covariance_div_sigmasq() * self.sigmasq() - - def sigmasq(self) -> float: - assert self._model is not None, "Model is required" - return sum_mahalanobis( - self._model._partial_residuals_list(self), - self._model._values_covariances(self, div_sigmasq=True), - ) / self._model._n_measurements # type: ignore - - @classmethod - def from_vec( - cls, - vec: NDArrayFloat, - dim_effects: int, - model: Optional[_MixedEffectsModel] = None, - has_mean: bool = True, - ) -> Self: - """Create Params from vectorized parameters.""" - mean = vec[:dim_effects] if has_mean else None - L_vec_len = dim_effects * (dim_effects + 1) // 2 - L = np.zeros((dim_effects, dim_effects)) - L[np.tril_indices(dim_effects)] = vec[-L_vec_len:] - return cls(mean=mean, L=L, model=model, has_mean=has_mean) - - def to_vec(self) -> NDArrayFloat: - """Vectorize parameters.""" - return np.concatenate([ - 
self._mean if self._has_mean else np.array([]), - self._L[np.tril_indices(self._L.shape[0])] - ]) - - @classmethod - def initial_params( - cls, - dim_effects: int, - has_mean: bool, - model: _MixedEffectsModel, - ) -> Self: - """Generic initial parameters .""" - return cls( - mean=np.zeros(dim_effects) if has_mean else None, - L=np.eye(dim_effects), - has_mean=has_mean, - model=model, - ) - - -class _EMMixedEffectsParams(_MixedEffectsCovParams): - """Mixed effects parameters for the EM algorithm.""" - _sigmasq: float - _covariance: NDArrayFloat - # _model: _MixedEffectsModel - - def __init__( - self, - sigmasq: float, - covariance: NDArrayFloat, - # model: _MixedEffectsModel, - ) -> None: - self._sigmasq = sigmasq - self._covariance = covariance - # self._model = model + ... + @property def covariance(self) -> NDArrayFloat: """Covariance of the mixed effects.""" - return self._covariance + ... + @property def covariance_div_sigmasq(self) -> NDArrayFloat: """Covariance of the mixed effects.""" - return self._covariance / self._sigmasq + ... + @property def sigmasq(self) -> float: """Variance of the residuals.""" - return self._sigmasq + ... 
- def mean(self) -> NDArrayFloat: - raise NotImplementedError() - - def to_vec(self) -> NDArrayFloat: - return np.concatenate([ - np.array([self._sigmasq]), - self._covariance[np.tril_indices(self._covariance.shape[0])], - ]) - def len_vec(self) -> int: - dim_effects = self._covariance.shape[0] - return 1 + dim_effects * (dim_effects + 1) // 2 +@dataclass(frozen=True) +class _MixedEffectsParamsResult: + """Result of the fitting of a mixed effects model for irregular data.""" + mean: NDArrayFloat + covariance: NDArrayFloat + sigmasq: float class _MixedEffectsModel: @@ -388,15 +221,14 @@ def _dim_effects(self) -> int: """Dimension of the mixed and of the fixed effects.""" return self.basis_evaluations[0].shape[1] - def _partial_residuals_list( + def partial_residuals_list( self, - params: _MixedEffectsParams, + mean: NDArrayFloat, ) -> List[NDArrayFloat]: """Residuals of the mixed effects model. r[k] = value[k] - basis_evaluations[k] @ mean """ - mean = params.mean() return [ value - basis_evaluation @ mean for value, basis_evaluation in zip( @@ -404,31 +236,38 @@ def _partial_residuals_list( ) ] - def _values_covariances( + def values_covariances( self, params: _MixedEffectsParams, div_sigmasq: bool, ) -> List[NDArrayFloat]: """Covariance of the values. + If div_sigmasq is False, then the results will be: + values_covariances[k] = ( sigmasq * I + basis_evaluations[k] @ covariance @ basis_evaluations[k].T ) - If div_sigmasq is True, then the results will be divided by sigmasq. - div_sigmasq = True for the model from Lindstrom & Bates (1988). + If div_sigmasq is True, then the results will be (divided by sigmasq): + + values_covariances[k] = ( + I + basis_evaluations[k] @ cov_div_sigmasq @ basis_evaluations[k].T + ) + + div_sigmasq=True for the model from Lindstrom & Bates (1988). 
""" if div_sigmasq: - cov_div_sigmasq = params.covariance_div_sigmasq() + cov_div_sigmasq = params.covariance_div_sigmasq return [ np.eye(basis_evaluation.shape[0]) + basis_evaluation @ cov_div_sigmasq @ basis_evaluation.T for basis_evaluation in self.basis_evaluations ] - sigmasq = params.sigmasq() - params_covariance = params.covariance() + sigmasq = params.sigmasq + params_covariance = params.covariance return [ sigmasq * np.eye(basis_evaluation.shape[0]) @@ -436,7 +275,7 @@ def _values_covariances( for basis_evaluation in self.basis_evaluations ] - def _mixed_effects_estimate( + def mixed_effects_estimate( self, params: _MixedEffectsParams, ) -> NDArrayFloat: @@ -447,9 +286,9 @@ def _mixed_effects_estimate( @ values_covariances[k]^{-1} @ partial_residuals[k] ) """ - covariance = params.covariance() - partial_residuals_list = self._partial_residuals_list(params) - values_cov_list = self._values_covariances(params, div_sigmasq=False) + covariance = params.covariance + partial_residuals_list = self.partial_residuals_list(params.mean) + values_cov_list = self.values_covariances(params, div_sigmasq=False) return np.array([ covariance @ basis_eval.T @ _linalg_solve( @@ -464,17 +303,11 @@ def _mixed_effects_estimate( def profile_loglikelihood( self, - params: _MinimizeMixedEffectsParams | NDArrayFloat, - has_mean: bool = True, + params: _MixedEffectsParams, ) -> float: """Profile loglikelihood.""" - if isinstance(params, np.ndarray): - params = _MinimizeMixedEffectsParams.from_vec( - params, self._dim_effects(), model=self, has_mean=has_mean, - ) - - r_list = self._partial_residuals_list(params) - V_list = self._values_covariances(params, div_sigmasq=True) + r_list = self.partial_residuals_list(params.mean) + V_list = self.values_covariances(params, div_sigmasq=True) # slogdet_V_list = [np.linalg.slogdet(V) for V in V_list] # if any(slogdet_V[0] <= 0 for slogdet_V in slogdet_V_list): @@ -485,8 +318,8 @@ def profile_loglikelihood( # slogdet_V[1] for slogdet_V in 
slogdet_V_list # ) sum_logdet_V: float = sum(np.linalg.slogdet(V)[1] for V in V_list) - sum_mahalanobis_ = sum_mahalanobis(r_list, V_list) - log_sum_mahalanobis: float = np.log(sum_mahalanobis_) # type: ignore + sum_mahalanobis = _sum_mahalanobis(r_list, V_list) + log_sum_mahalanobis: float = np.log(sum_mahalanobis) # type: ignore return ( - sum_logdet_V / 2 @@ -498,20 +331,19 @@ def profile_loglikelihood( class MixedEffectsConverter(_ToBasisConverter[FDataIrregular]): """Mixed effects to-basis-converter.""" - basis: Basis - # after fitting: fitted_model: Optional[_MixedEffectsModel] - fitted_params: Optional[_MixedEffectsParams] - result: Any + fitted_params: Optional[_MixedEffectsParamsResult] + result: Optional[Any] def __init__( self, basis: Basis, ) -> None: - self.basis = basis self.fitted_model = None self.fitted_params = None + self.result = None + super().__init__(basis) def transform( self, @@ -521,8 +353,8 @@ def transform( raise ValueError("The converter has not been fitted.") X_model = _MixedEffectsModel(X, self.basis) - mean = self.fitted_params.mean() - gamma_estimates = X_model._mixed_effects_estimate(self.fitted_params) + mean = self.fitted_params.mean + gamma_estimates = X_model.mixed_effects_estimate(self.fitted_params) coefficients = mean[np.newaxis, :] + gamma_estimates @@ -536,16 +368,119 @@ class MinimizeMixedEffectsConverter(MixedEffectsConverter): """Mixed effects to-basis-converter using scipy.optimize. Minimizes the profile loglikelihood of the mixed effects model as proposed - by Lindstrom & Bates (1988). + by Mary J. Lindstrom & Douglas M. Bates (1988). """ + @dataclass(frozen=True) + class _Params: + """Private class for the parameters of the minimization. + Args: + L: (_L @ _L.T) is the Cholesky decomposition of covariance/sigmasq. + has_mean: Whether the mean is fixed or estimated with ML estimator. + mean: Fixed effects (can be none). 
+ model: Mixed effects model to use for the estimation of the mean in + case mean=None (will be None otherwise). + """ + + L: NDArrayFloat + _mean: Optional[NDArrayFloat] + _model: Optional[_MixedEffectsModel] + + def __init__( + self, + L: NDArrayFloat, + mean: Optional[NDArrayFloat], + model: Optional[_MixedEffectsModel] = None, + ) -> None: + if mean is None: + assert model is not None + + # must use object.__setattr__ due to frozen=True + object.__setattr__(self, "L", L) + object.__setattr__(self, "_mean", mean) + object.__setattr__(self, "_model", model) + + @property + def mean(self) -> NDArrayFloat: + if self._mean is not None: + return self._mean + assert self._model is not None, "model is required" + values_covariances = self._model.values_covariances( + self, div_sigmasq=True, + ) + return _linalg_solve( + a=_sum_mahalanobis( + self._model.basis_evaluations, + values_covariances, + self._model.basis_evaluations, + ), + b=_sum_mahalanobis( + self._model.basis_evaluations, + values_covariances, + self._model.values, + ), + assume_a="pos", + ) + + @property + def covariance_div_sigmasq(self) -> NDArrayFloat: + return self.L @ self.L.T + + @property + def covariance(self) -> NDArrayFloat: + return self.covariance_div_sigmasq * self.sigmasq + + @property + def sigmasq(self) -> float: + assert self._model is not None, "Model is required" + return _sum_mahalanobis( + self._model.partial_residuals_list(self.mean), + self._model.values_covariances(self, div_sigmasq=True), + ) / self._model._n_measurements # type: ignore + + @classmethod + def from_vec( + cls, + vec: NDArrayFloat, + dim_effects: int, + model: Optional[_MixedEffectsModel] = None, + has_mean: bool = True, + ) -> Self: + """Create Params from vectorized parameters.""" + mean = vec[:dim_effects] if has_mean else None + L_vec_len = dim_effects * (dim_effects + 1) // 2 + L = np.zeros((dim_effects, dim_effects)) + L[np.tril_indices(dim_effects)] = vec[-L_vec_len:] + return cls(mean=mean, L=L, model=model) 
+ + def to_vec(self) -> NDArrayFloat: + """Vectorize parameters.""" + return np.concatenate([ + self._mean if self._mean is not None else np.array([]), + self.L[np.tril_indices(self.L.shape[0])] + ]) + + @classmethod + def initial_params( + cls, + dim_effects: int, + has_mean: bool, + model: _MixedEffectsModel, + ) -> Self: + """Generic initial parameters .""" + return cls( + mean=np.zeros(dim_effects) if has_mean else None, + L=np.eye(dim_effects), + model=model, + ) + def fit( self, X: FDataIrregular, y: object = None, *, initial_params: Optional[ - _MinimizeMixedEffectsParams | NDArrayFloat + MinimizeMixedEffectsConverter._Params | NDArrayFloat ] = None, minimization_method: Optional[str] = None, has_mean: bool = True, @@ -563,15 +498,20 @@ def fit( self after fit """ dim_effects = self.basis.n_basis - if isinstance(initial_params, _MinimizeMixedEffectsParams): + if isinstance( + initial_params, + MinimizeMixedEffectsConverter._Params, + ): # assert has_beta == initial_params.has_beta initial_params_vec = initial_params.to_vec() elif initial_params is not None: initial_params_vec = initial_params else: - initial_params_vec = _MinimizeMixedEffectsParams.initial_params( - dim_effects=dim_effects, has_mean=has_mean, model=self, - ).to_vec() + initial_params_vec = ( + MinimizeMixedEffectsConverter._Params.initial_params( + dim_effects=dim_effects, has_mean=has_mean, model=self, + ).to_vec() + ) if minimization_method is None: minimization_method = _SCIPY_MINIMIZATION_METHODS[0] @@ -579,9 +519,11 @@ def fit( model = _MixedEffectsModel(X, self.basis) n_samples = X.n_samples - def objective_function(params: NDArrayFloat) -> float: + def objective_function(params_vec: NDArrayFloat) -> float: return - model.profile_loglikelihood( - params, has_mean=has_mean, + params=MinimizeMixedEffectsConverter._Params.from_vec( + params_vec, dim_effects, model=self, has_mean=has_mean, + ) ) / n_samples self.result = _minimize( @@ -590,16 +532,64 @@ def objective_function(params: 
NDArrayFloat) -> float: minimization_methods=minimization_method, ) self.fitted_model = model - params = _MinimizeMixedEffectsParams.from_vec( + params = MinimizeMixedEffectsConverter._Params.from_vec( self.result.x, dim_effects=dim_effects, model=model, has_mean=has_mean, ) self.fitted_params = _MixedEffectsParamsResult( - _mean=params.mean(), - _covariance=params.covariance(), - _sigmasq=params.sigmasq(), + mean=params.mean, + covariance=params.covariance, + sigmasq=params.sigmasq, ) return self + + +class EMMixedEffectsConverter(MixedEffectsConverter): + """Mixed effects to-basis-converter using the EM algorithm.""" + @dataclass(frozen=True) + class _Params: + """Mixed effects parameters for the EM algorithm.""" + sigmasq: float + covariance: NDArrayFloat + + def covariance_div_sigmasq(self) -> NDArrayFloat: + """Covariance of the mixed effects.""" + return self.covariance / self.sigmasq + + def to_vec(self) -> NDArrayFloat: + return np.concatenate([ + np.array([self.sigmasq]), + self.covariance[np.tril_indices(self.covariance.shape[0])], + ]) + + def len_vec(self) -> int: + dim_effects = self.covariance.shape[0] + return 1 + dim_effects * (dim_effects + 1) // 2 + + def fit( + self, + X: FDataIrregular, + y: object = None, + *, + initial_params: Optional[ + EMMixedEffectsConverter._Params | NDArrayFloat + ] = None, + minimization_method: Optional[str] = None, + has_mean: bool = True, + ) -> Self: + """Fit the model. + + Args: + X: irregular data to fit. + y: ignored. + initial_params: initial params of the model. + minimization_methods: scipy.optimize.minimize method to be used for + the minimization of the loglikelihood of the model. 
+ + Returns: + self after fit + """ + return self # TODO diff --git a/skfda/preprocessing/conversion/_to_basis.py b/skfda/representation/conversion/_to_basis.py similarity index 81% rename from skfda/preprocessing/conversion/_to_basis.py rename to skfda/representation/conversion/_to_basis.py index a6f3c1426..081e97dbb 100644 --- a/skfda/preprocessing/conversion/_to_basis.py +++ b/skfda/representation/conversion/_to_basis.py @@ -14,6 +14,7 @@ from ..._utils._sklearn_adapter import BaseEstimator, TransformerMixin from ...representation import FData, FDataBasis +from ...representation.basis import Basis Input = TypeVar( "Input", @@ -24,7 +25,6 @@ class _ToBasisConverter( BaseEstimator, - Generic[Input], TransformerMixin[Input, FDataBasis, object], ): """To basis converter. @@ -32,3 +32,8 @@ class _ToBasisConverter( Abstract base class for all FData to FDataBasis converters. The subclasses must override ``fit`` and ``transform`` to define the conversion. """ + basis: Basis + + def __init__(self, basis: Basis) -> None: + self.basis = basis + super().__init__() diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py index 497532ec0..a06944b80 100644 --- a/skfda/tests/test_mixed_effects_converter.py +++ b/skfda/tests/test_mixed_effects_converter.py @@ -20,7 +20,7 @@ BSplineBasis, FourierBasis, ) -from skfda.preprocessing.conversion._mixed_effects import ( +from skfda.representation.conversion._mixed_effects import ( MinimizeMixedEffectsConverter, _get_values_list, _get_basis_evaluations_list, @@ -93,10 +93,11 @@ def test_loglikelihood() -> None: ]), -1437.3441872940807), ] - for params, mixedlm_loglikelihood in params_loglike_list: - model_loglikelihood = model.profile_loglikelihood( - params, + for params_vec, mixedlm_loglikelihood in params_loglike_list: + params = MinimizeMixedEffectsConverter._Params.from_vec( + params_vec, basis.n_basis, model, ) + model_loglikelihood = model.profile_loglikelihood(params) assert 
np.allclose(mixedlm_loglikelihood, model_loglikelihood) @@ -208,75 +209,75 @@ def _get_points( ) -def test_simple_conversion() -> None: - """Visual test.""" - _max_val = 10 - _domain_range = (0, 10) - n_points = 6 - n_basis = 5 - n_samples = 50 - points = _get_points(_domain_range, n_points, n_samples, 9) - - basis = FourierBasis(n_basis=n_basis, domain_range=_domain_range) - # BSplineBasis( - # n_basis=n_basis, domain_range=_domain_range, order=n_basis - 1, - # ) - - sigma = 0.3 - Gamma_sqrt = np.zeros((n_basis, n_basis)) - Gamma_sqrt[np.tril_indices(n_basis)] = np.random.rand( - n_basis * (n_basis + 1) // 2, - ) * _max_val - Gamma = Gamma_sqrt @ Gamma_sqrt.T - beta = np.random.rand(n_basis) * _max_val - fdatabasis_original = FDataBasis( - basis=basis, - coefficients=np.random.multivariate_normal( - mean=beta, cov=Gamma, size=n_samples, - ), - ) - - def fun(i: int) -> Callable[[NDArrayFloat], NDArrayFloat]: - def fi(x: NDArrayFloat) -> NDArrayFloat: - return fdatabasis_original[i](x).reshape(x.shape) - return fi - - funcs = [fun(i) for i in range(n_samples)] - - fdatairregular = _create_irregular_samples( - funcs=funcs, - n_points=n_points, - points=points, - noise_generate_std=sigma, - ) - converter = MinimizeMixedEffectsConverter(basis) - fdatabasis_estimated = converter.fit_transform(fdatairregular) - fdatabasis_basic = fdatairregular.to_basis(basis) - if True: - _ = plt.figure(figsize=(15, 6)) - - axes = plt.subplot(2, 2, 1) - plt.title("Original data") - fdatairregular[:5].plot(axes=axes) - left, right = plt.ylim() - plt.ylim((min(0, left), max(1.4, right))) - - axes = plt.subplot(2, 2, 2) - plt.title("Estimated basis representation.\n") - fdatairregular.scatter(axes=axes) - fdatabasis_estimated[:5].plot(axes=axes) - left, right = plt.ylim() - plt.ylim((min(0, left), max(1.4, right))) - - axes = plt.subplot(2, 2, 4) - plt.title("Original basis representation") - fdatairregular.scatter(axes=axes) - fdatabasis_original[:5].plot(axes=axes) - left, right = 
plt.ylim() - plt.ylim((min(0, left), max(1.4, right))) - - axes = plt.subplot(2, 2, 3) - plt.title(f"{basis}") - basis.plot(axes=axes) - - plt.show() +# def test_simple_conversion() -> None: +# """Visual test.""" +# _max_val = 10 +# _domain_range = (0, 10) +# n_points = 6 +# n_basis = 5 +# n_samples = 50 +# points = _get_points(_domain_range, n_points, n_samples, 9) + +# basis = FourierBasis(n_basis=n_basis, domain_range=_domain_range) +# # BSplineBasis( +# # n_basis=n_basis, domain_range=_domain_range, order=n_basis - 1, +# # ) + +# sigma = 0.3 +# Gamma_sqrt = np.zeros((n_basis, n_basis)) +# Gamma_sqrt[np.tril_indices(n_basis)] = np.random.rand( +# n_basis * (n_basis + 1) // 2, +# ) * _max_val +# Gamma = Gamma_sqrt @ Gamma_sqrt.T +# beta = np.random.rand(n_basis) * _max_val +# fdatabasis_original = FDataBasis( +# basis=basis, +# coefficients=np.random.multivariate_normal( +# mean=beta, cov=Gamma, size=n_samples, +# ), +# ) + +# def fun(i: int) -> Callable[[NDArrayFloat], NDArrayFloat]: +# def fi(x: NDArrayFloat) -> NDArrayFloat: +# return fdatabasis_original[i](x).reshape(x.shape) +# return fi + +# funcs = [fun(i) for i in range(n_samples)] + +# fdatairregular = _create_irregular_samples( +# funcs=funcs, +# n_points=n_points, +# points=points, +# noise_generate_std=sigma, +# ) +# converter = MinimizeMixedEffectsConverter(basis) +# fdatabasis_estimated = converter.fit_transform(fdatairregular) +# fdatabasis_basic = fdatairregular.to_basis(basis) +# if True: +# _ = plt.figure(figsize=(15, 6)) + +# axes = plt.subplot(2, 2, 1) +# plt.title("Original data") +# fdatairregular[:5].plot(axes=axes) +# left, right = plt.ylim() +# plt.ylim((min(0, left), max(1.4, right))) + +# axes = plt.subplot(2, 2, 2) +# plt.title("Estimated basis representation.\n") +# fdatairregular.scatter(axes=axes) +# fdatabasis_estimated[:5].plot(axes=axes) +# left, right = plt.ylim() +# plt.ylim((min(0, left), max(1.4, right))) + +# axes = plt.subplot(2, 2, 4) +# plt.title("Original basis 
representation") +# fdatairregular.scatter(axes=axes) +# fdatabasis_original[:5].plot(axes=axes) +# left, right = plt.ylim() +# plt.ylim((min(0, left), max(1.4, right))) + +# axes = plt.subplot(2, 2, 3) +# plt.title(f"{basis}") +# basis.plot(axes=axes) + +# plt.show() From 9d2a2d4db99a5f0d9a5a6e4e5347de0a2e3ef73c Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Fri, 8 Mar 2024 14:44:51 +0100 Subject: [PATCH 03/48] WIP: include EM converter --- .../conversion/_mixed_effects.py | 190 +++++++++++++++--- 1 file changed, 157 insertions(+), 33 deletions(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index a11679d48..4ee444f64 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -14,6 +14,7 @@ Any, Callable, List, + Literal, Optional, Protocol, ) @@ -179,6 +180,21 @@ class _MixedEffectsParamsResult: covariance: NDArrayFloat sigmasq: float + @property + def covariance_div_sigmasq(self) -> NDArrayFloat: + return self.covariance / self.sigmasq + + +def _initial_params( + dim_effects: int, # TODO add X: FDataIrregular, basis: Basis ? +) -> _MixedEffectsParams: + """Generic initial parameters.""" + return _MixedEffectsParamsResult( + mean=np.zeros(dim_effects), + covariance=np.eye(dim_effects), + sigmasq=1, + ) + class _MixedEffectsModel: """Mixed effects model. 
@@ -460,20 +476,6 @@ def to_vec(self) -> NDArrayFloat: self.L[np.tril_indices(self.L.shape[0])] ]) - @classmethod - def initial_params( - cls, - dim_effects: int, - has_mean: bool, - model: _MixedEffectsModel, - ) -> Self: - """Generic initial parameters .""" - return cls( - mean=np.zeros(dim_effects) if has_mean else None, - L=np.eye(dim_effects), - model=model, - ) - def fit( self, X: FDataIrregular, @@ -498,27 +500,23 @@ def fit( self after fit """ dim_effects = self.basis.n_basis - if isinstance( - initial_params, - MinimizeMixedEffectsConverter._Params, - ): - # assert has_beta == initial_params.has_beta + model = _MixedEffectsModel(X, self.basis) + n_samples = X.n_samples + if isinstance(initial_params, MinimizeMixedEffectsConverter._Params): initial_params_vec = initial_params.to_vec() elif initial_params is not None: initial_params_vec = initial_params else: - initial_params_vec = ( - MinimizeMixedEffectsConverter._Params.initial_params( - dim_effects=dim_effects, has_mean=has_mean, model=self, - ).to_vec() - ) + initial_params_generic = _initial_params(dim_effects) + initial_params_vec = MinimizeMixedEffectsConverter._Params( + L=np.linalg.cholesky(initial_params_generic.covariance), + mean=initial_params_generic.mean if has_mean else None, + model=model, + ).to_vec() if minimization_method is None: minimization_method = _SCIPY_MINIMIZATION_METHODS[0] - model = _MixedEffectsModel(X, self.basis) - n_samples = X.n_samples - def objective_function(params_vec: NDArrayFloat) -> float: return - model.profile_loglikelihood( params=MinimizeMixedEffectsConverter._Params.from_vec( @@ -564,6 +562,18 @@ def to_vec(self) -> NDArrayFloat: np.array([self.sigmasq]), self.covariance[np.tril_indices(self.covariance.shape[0])], ]) + + @classmethod + def from_vec( + cls, + vec: NDArrayFloat, + dim_effects: int, + ) -> EMMixedEffectsConverter._Params: + """Create Params from vectorized parameters.""" + sigmasq = vec[0] + covariance = np.zeros((dim_effects, dim_effects)) + 
covariance[np.tril_indices(dim_effects)] = vec[1:] + return cls(sigmasq=sigmasq, covariance=covariance) def len_vec(self) -> int: dim_effects = self.covariance.shape[0] @@ -577,19 +587,133 @@ def fit( initial_params: Optional[ EMMixedEffectsConverter._Params | NDArrayFloat ] = None, - minimization_method: Optional[str] = None, - has_mean: bool = True, + niter: int = 700, + convergence_criterion: Optional[Literal["params"]] = None, + rtol: float = 1e-3, ) -> Self: - """Fit the model. + """Fit the model using the EM algorithm. Args: X: irregular data to fit. y: ignored. initial_params: initial params of the model. - minimization_methods: scipy.optimize.minimize method to be used for - the minimization of the loglikelihood of the model. + niter: maximum number of iterations. + convergence_criterion: convergence criterion to use when fitting. + - "params" to use relative differences between parameters. + # - "square-error" to use the square error of the estimates wrt + # the original data. + # - "prop-offset" to use the criteria proposed by Bates & + # Watts 1981 (A Relative Offset Convergence Criterion for + # Nonlinear Least Squares). + # - "loglikelihood" to use the loglikelihood. + rtol: relative tolerance for convergence. 
Returns: self after fit """ - return self # TODO + model = self.model = _MixedEffectsModel(X, self.basis) + + if initial_params is None: + initial_params_generic = _initial_params(self.basis.n_basis) + next_params = EMMixedEffectsConverter._Params( + sigmasq=initial_params_generic.sigmasq, + covariance=initial_params_generic.covariance, + ) + elif isinstance(initial_params, np.ndarray): + next_params = EMMixedEffectsConverter._Params.from_vec( + initial_params, dim_effects=self.basis.n_basis, + ) + else: + next_params = initial_params + + if convergence_criterion is None: + convergence_criterion = "params" + + assert convergence_criterion in _EM_MINIMIZATION_METHODS + + use_error = convergence_criterion in [ + "square-error", "square-error-big", "prop-offset", + ] + use_big_model = convergence_criterion[-3:] == "big" + + conv_estimate = prev_conv_estimate = None + converged = False + + for iter_number in range(niter): + curr_params = next_params + Sigma_list = self.Sigma_list(model, curr_params) + beta = self.beta(model, Sigma_list) + r_list = self.r_list(model, beta) + random_effects = self._gamma_estimates( + model, curr_params, r_list, Sigma_list, + ) + Sigma_inv_list = [ + # _linalg_solve(Sigma, np.eye(Sigma.shape[0]), assume_a="pos") + np.linalg.pinv(Sigma, hermitian=True) + for Sigma in Sigma_list + ] + next_params = self.next_params( + model, curr_params, r_list, Sigma_inv_list, Sigma_list, random_effects, + ) + + if use_error: + me_params = self.meparams_from_emparams(curr_params, beta) + # error = values - estimates + error = model.error(me_params, use_big_model) + if convergence_criterion == "prop-offset": + conv_estimate = em_prop_offset_conv_estimate( + curr_params, error, model, + ) + elif convergence_criterion in [ + "square-error", "square-error-big", + ]: + conv_estimate = em_square_error_conv_estimate(error) + else: + raise ValueError("Invalid minimization method.") + elif convergence_criterion == "params": + conv_estimate = next_params.to_vec() + 
elif convergence_criterion == "loglikelihood": + me_params = self.meparams_from_emparams(curr_params, beta) + conv_estimate = model.profile_loglikelihood( + me_params, has_beta=True, + ) + else: + raise ValueError("Invalid minimization method.") + + if iter_number > 0: + if convergence_criterion != "prop-offset": + converged = np.allclose( + conv_estimate, prev_conv_estimate, rtol=rtol, + ) + else: + converged = conv_estimate < rtol + if converged: + break + + prev_conv_estimate = conv_estimate + + + if not converged: + message = f"EM algorithm did not converge ({niter=})." + # raise RuntimeError(f"EM algorithm did not converge ({niter=}).") + else: + message = ( + "EM algorithm converged after " + f"{iter_number}/{niter} iterations." + ) + + curr_params = next_params + Sigma_list = self.Sigma_list(model, curr_params) + beta = self.beta(model, Sigma_list) + self.result = {"success": converged, "message": message} + self.params = MEParams( + beta=beta, + model=model, + L=np.linalg.cholesky(curr_params.Gamma/curr_params.sigmasq), + ) + self.params_result = MEParamsResult( + beta=beta, + Gamma=curr_params.Gamma, + sigmasq=curr_params.sigmasq, + ) + return self From c28f4bf85e496619da3a553a56afa2da45642dcd Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Fri, 15 Mar 2024 20:22:56 +0100 Subject: [PATCH 04/48] Add EM converter --- .../conversion/_mixed_effects.py | 358 +++++++++++------- skfda/tests/test_mixed_effects_converter.py | 143 +++++-- 2 files changed, 345 insertions(+), 156 deletions(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index 4ee444f64..3cf005892 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -13,6 +13,7 @@ from typing import ( Any, Callable, + Dict, List, Literal, Optional, @@ -21,7 +22,7 @@ import numpy as np import scipy -from typing_extensions import Final, Self +from typing_extensions import 
Self from ...representation import FDataBasis, FDataIrregular from ...representation.basis import Basis @@ -48,9 +49,7 @@ _EM_MINIMIZATION_METHODS = [ "params", - "square-error", - "square-error-big", - "prop-offset", + "squared-error", "loglikelihood" ] @@ -92,7 +91,7 @@ def _minimize( else: for method in minimization_methods: if method not in _SCIPY_MINIMIZATION_METHODS: - raise ValueError(f"Invalid method: \"{method}\".") + raise ValueError(f"Invalid minimize method: \"{method}\".") for method in minimization_methods: result = scipy.optimize.minimize( @@ -105,12 +104,7 @@ def _minimize( }, ) if result.success is True: - # print( - # f"[MEEstimator info]: Minimization method {method} succeeded.", - # ) - return result - # else: - # print(f"[MEEstimator info]: Minimization method {method} failed.") + break return result # even if it failed @@ -153,13 +147,13 @@ class _MixedEffectsParams(Protocol): """Params of the mixed effects model for irregular data.""" @property - def mean(self) -> NDArrayFloat: - """Fixed effects.""" + def covariance(self) -> NDArrayFloat: + """Covariance of the mixed effects.""" ... @property - def covariance(self) -> NDArrayFloat: - """Covariance of the mixed effects.""" + def sigmasq(self) -> float: + """Variance of the residuals.""" ... @property @@ -168,8 +162,8 @@ def covariance_div_sigmasq(self) -> NDArrayFloat: ... @property - def sigmasq(self) -> float: - """Variance of the residuals.""" + def mean(self) -> NDArrayFloat: + """Fixed effects.""" ... @@ -203,8 +197,7 @@ class _MixedEffectsModel: Model: - - values[k] = basis_evaluations[k] @ (mean + mixed_effects[k]) + error[k] + values[k] = basis_evaluations[k] @ (mean + random_effects[k]) + error[k] Args: values: List of the values of each curve. 
@@ -237,7 +230,7 @@ def _dim_effects(self) -> int: """Dimension of the mixed and of the fixed effects.""" return self.basis_evaluations[0].shape[1] - def partial_residuals_list( + def partial_residuals( self, mean: NDArrayFloat, ) -> List[NDArrayFloat]: @@ -252,78 +245,99 @@ def partial_residuals_list( ) ] - def values_covariances( + def values_covariances_div_sigmasq( self, - params: _MixedEffectsParams, - div_sigmasq: bool, + cov_div_sigmasq: NDArrayFloat, ) -> List[NDArrayFloat]: - """Covariance of the values. + """Covariance of the values divided by sigmasq. - If div_sigmasq is False, then the results will be: - - values_covariances[k] = ( - sigmasq * I - + basis_evaluations[k] @ covariance @ basis_evaluations[k].T + values_covariances_div_sigmasq[k] = ( + I + basis_evaluations[k] @ cov_div_sigmasq @ basis_evaluations[k].T ) - If div_sigmasq is True, then the results will be (divided by sigmasq): + Used for the model from Lindstrom & Bates (1988). + """ + return [ + np.eye(basis_evaluation.shape[0]) + + basis_evaluation @ cov_div_sigmasq @ basis_evaluation.T + for basis_evaluation in self.basis_evaluations + ] + + def values_covariances( + self, + sigmasq: float, + random_effects_covariance: NDArrayFloat, + ) -> List[NDArrayFloat]: + """Covariance of the values. values_covariances[k] = ( - I + basis_evaluations[k] @ cov_div_sigmasq @ basis_evaluations[k].T + sigmasq * I + + basis_evaluations[k] @ random_effects_covariance + @ basis_evaluations[k].T ) - div_sigmasq=True for the model from Lindstrom & Bates (1988). + Args: + sigmasq: Variance of the residuals. + random_effects_covariance: Covariance of the random effects. 
""" - if div_sigmasq: - cov_div_sigmasq = params.covariance_div_sigmasq - return [ - np.eye(basis_evaluation.shape[0]) - + basis_evaluation @ cov_div_sigmasq @ basis_evaluation.T - for basis_evaluation in self.basis_evaluations - ] - - sigmasq = params.sigmasq - params_covariance = params.covariance return [ sigmasq * np.eye(basis_evaluation.shape[0]) - + basis_evaluation @ params_covariance @ basis_evaluation.T + + basis_evaluation @ random_effects_covariance @ basis_evaluation.T for basis_evaluation in self.basis_evaluations ] - def mixed_effects_estimate( + def _random_effects_estimate( self, - params: _MixedEffectsParams, + random_effects_covariance: NDArrayFloat, + values_covariances: List[NDArrayFloat], + partial_residuals: List[NDArrayFloat], ) -> NDArrayFloat: - """Estimates of the mixed effects (generalized least squares) + """Estimates of the random effects (generalized least squares) - mixed_effects_estimate[k] = ( - covariance @ basis_evaluations[k].T + random_effects_estimate[k] = ( + random_effects_covariance @ basis_evaluations[k].T @ values_covariances[k]^{-1} @ partial_residuals[k] ) - """ - covariance = params.covariance - partial_residuals_list = self.partial_residuals_list(params.mean) - values_cov_list = self.values_covariances(params, div_sigmasq=False) + Args: + random_effects_covariance: Covariance of the random effects. + values_covariances: Covariances of the values. + partial_residuals: List of: value - basis_evaluation @ mean. 
+ """ return np.array([ - covariance @ basis_eval.T @ _linalg_solve( + random_effects_covariance @ basis_eval.T @ _linalg_solve( value_cov, r, assume_a="pos", ) for basis_eval, value_cov, r in zip( self.basis_evaluations, - values_cov_list, - partial_residuals_list, + values_covariances, + partial_residuals, ) ]) + def random_effects_estimate( + self, + params: _MixedEffectsParams, + ) -> NDArrayFloat: + """Estimates of the random effects (generalized least squares).""" + return self._random_effects_estimate( + random_effects_covariance=params.covariance, + values_covariances=self.values_covariances( + params.sigmasq, params.covariance, + ), + partial_residuals=self.partial_residuals(params.mean), + ) + def profile_loglikelihood( self, params: _MixedEffectsParams, ) -> float: """Profile loglikelihood.""" - r_list = self.partial_residuals_list(params.mean) - V_list = self.values_covariances(params, div_sigmasq=True) + partial_residuals = self.partial_residuals(params.mean) + values_covariances = self.values_covariances_div_sigmasq( + params.covariance_div_sigmasq, + ) # slogdet_V_list = [np.linalg.slogdet(V) for V in V_list] # if any(slogdet_V[0] <= 0 for slogdet_V in slogdet_V_list): @@ -333,8 +347,12 @@ def profile_loglikelihood( # sum_logdet_V: float = sum( # slogdet_V[1] for slogdet_V in slogdet_V_list # ) - sum_logdet_V: float = sum(np.linalg.slogdet(V)[1] for V in V_list) - sum_mahalanobis = _sum_mahalanobis(r_list, V_list) + sum_logdet_V: float = sum( + np.linalg.slogdet(V)[1] for V in values_covariances + ) + sum_mahalanobis = _sum_mahalanobis( + partial_residuals, values_covariances, + ) log_sum_mahalanobis: float = np.log(sum_mahalanobis) # type: ignore return ( @@ -343,6 +361,11 @@ def profile_loglikelihood( + self._profile_loglikelihood_additive_constants ) + @property + def n_samples(self) -> int: + """Number of samples of the irregular dataset.""" + return len(self.values) + class MixedEffectsConverter(_ToBasisConverter[FDataIrregular]): """Mixed 
effects to-basis-converter.""" @@ -350,7 +373,7 @@ class MixedEffectsConverter(_ToBasisConverter[FDataIrregular]): # after fitting: fitted_model: Optional[_MixedEffectsModel] fitted_params: Optional[_MixedEffectsParamsResult] - result: Optional[Any] + result: Optional[Dict[str, Any] | scipy.optimize.OptimizeResult] def __init__( self, @@ -370,7 +393,7 @@ def transform( X_model = _MixedEffectsModel(X, self.basis) mean = self.fitted_params.mean - gamma_estimates = X_model.mixed_effects_estimate(self.fitted_params) + gamma_estimates = X_model.random_effects_estimate(self.fitted_params) coefficients = mean[np.newaxis, :] + gamma_estimates @@ -409,7 +432,7 @@ def __init__( model: Optional[_MixedEffectsModel] = None, ) -> None: if mean is None: - assert model is not None + assert model is not None, "model is required if mean is None" # must use object.__setattr__ due to frozen=True object.__setattr__(self, "L", L) @@ -420,9 +443,9 @@ def __init__( def mean(self) -> NDArrayFloat: if self._mean is not None: return self._mean - assert self._model is not None, "model is required" - values_covariances = self._model.values_covariances( - self, div_sigmasq=True, + assert self._model is not None, "Model is required" + values_covariances = self._model.values_covariances_div_sigmasq( + self.covariance_div_sigmasq, ) return _linalg_solve( a=_sum_mahalanobis( @@ -450,8 +473,10 @@ def covariance(self) -> NDArrayFloat: def sigmasq(self) -> float: assert self._model is not None, "Model is required" return _sum_mahalanobis( - self._model.partial_residuals_list(self.mean), - self._model.values_covariances(self, div_sigmasq=True), + self._model.partial_residuals(self.mean), + self._model.values_covariances_div_sigmasq( + self.covariance_div_sigmasq, + ), ) / self._model._n_measurements # type: ignore @classmethod @@ -553,6 +578,7 @@ class _Params: sigmasq: float covariance: NDArrayFloat + @property def covariance_div_sigmasq(self) -> NDArrayFloat: """Covariance of the mixed effects.""" 
return self.covariance / self.sigmasq @@ -562,7 +588,7 @@ def to_vec(self) -> NDArrayFloat: np.array([self.sigmasq]), self.covariance[np.tril_indices(self.covariance.shape[0])], ]) - + @classmethod def from_vec( cls, @@ -579,6 +605,69 @@ def len_vec(self) -> int: dim_effects = self.covariance.shape[0] return 1 + dim_effects * (dim_effects + 1) // 2 + def _mean( + self, + model: _MixedEffectsModel, + values_covariances_list: List[NDArrayFloat], + ) -> NDArrayFloat: + """Return the beta estimate.""" + return _linalg_solve( + a=_sum_mahalanobis( + model.basis_evaluations, + values_covariances_list, + model.basis_evaluations, + ), + b=_sum_mahalanobis( + model.basis_evaluations, + values_covariances_list, + model.values, + ), + assume_a="pos", + ) + + def next_params( + self, + model: _MixedEffectsModel, + curr_params: EMMixedEffectsConverter._Params, + partial_residuals: List[NDArrayFloat], + values_cov_inv: List[NDArrayFloat], + vaules_cov: List[NDArrayFloat], + random_effects: NDArrayFloat, + ) -> EMMixedEffectsConverter._Params: + """Return the next parameters of the EM algorithm.""" + residuals = [ + r - basis_eval @ random_effect + for r, basis_eval, random_effect in zip( + partial_residuals, model.basis_evaluations, random_effects, + ) + ] + sum_squared_residuals = sum(np.inner(r, r) for r in residuals) + sum_traces = curr_params.sigmasq * sum( + # np.trace(np.eye(cov_inv.shape[0]) - params.sigmasq * cov_inv) + cov_inv.shape[0] - curr_params.sigmasq * np.trace(cov_inv) + for cov_inv in values_cov_inv + ) + next_sigmasq = ( + (sum_squared_residuals + sum_traces) / model._n_measurements + ) + next_covariance = sum( + np.outer(random_effect, random_effect) + + curr_params.covariance @ ( + np.eye(curr_params.covariance.shape[1]) + - basis_eval.T @ _linalg_solve( + Sigma, basis_eval @ curr_params.covariance, assume_a="pos", + ) + ) + for basis_eval, Sigma, random_effect in zip( + model.basis_evaluations, vaules_cov, random_effects, + ) + ) / model.n_samples + + 
return EMMixedEffectsConverter._Params( + sigmasq=next_sigmasq, + covariance=next_covariance, + ) + def fit( self, X: FDataIrregular, @@ -588,7 +677,9 @@ def fit( EMMixedEffectsConverter._Params | NDArrayFloat ] = None, niter: int = 700, - convergence_criterion: Optional[Literal["params"]] = None, + convergence_criterion: Optional[ + Literal["params", "squared-error", "loglikelihood"] + ] = None, rtol: float = 1e-3, ) -> Self: """Fit the model using the EM algorithm. @@ -600,12 +691,12 @@ def fit( niter: maximum number of iterations. convergence_criterion: convergence criterion to use when fitting. - "params" to use relative differences between parameters. - # - "square-error" to use the square error of the estimates wrt - # the original data. + - "squared-error" to use the square error of the estimates with + respect to the original data. + - "loglikelihood" to use the loglikelihood. # - "prop-offset" to use the criteria proposed by Bates & # Watts 1981 (A Relative Offset Convergence Criterion for # Nonlinear Least Squares). - # - "loglikelihood" to use the loglikelihood. rtol: relative tolerance for convergence. Returns: @@ -629,70 +720,72 @@ def fit( if convergence_criterion is None: convergence_criterion = "params" - assert convergence_criterion in _EM_MINIMIZATION_METHODS + if convergence_criterion not in _EM_MINIMIZATION_METHODS: + raise ValueError( + "Invalid convergence criterion for the EM algorithm: " + f"\"{convergence_criterion}\"." 
+ ) - use_error = convergence_criterion in [ - "square-error", "square-error-big", "prop-offset", - ] - use_big_model = convergence_criterion[-3:] == "big" + use_error = convergence_criterion in ("squared-error",) - conv_estimate = prev_conv_estimate = None - converged = False + if use_error: + big_values = np.concatenate(model.values) + converged = False + convergence_val: Optional[NDArrayFloat | float] = None + prev_convergence_val: Optional[NDArrayFloat | float] = None for iter_number in range(niter): curr_params = next_params - Sigma_list = self.Sigma_list(model, curr_params) - beta = self.beta(model, Sigma_list) - r_list = self.r_list(model, beta) - random_effects = self._gamma_estimates( - model, curr_params, r_list, Sigma_list, + values_cov = model.values_covariances( + curr_params.sigmasq, curr_params.covariance, ) - Sigma_inv_list = [ - # _linalg_solve(Sigma, np.eye(Sigma.shape[0]), assume_a="pos") - np.linalg.pinv(Sigma, hermitian=True) - for Sigma in Sigma_list + mean = self._mean(model, values_cov) + partial_residuals = model.partial_residuals(mean) + random_effects = model._random_effects_estimate( + curr_params.covariance, values_cov, partial_residuals, + ) + values_cov_inv = [ + np.linalg.pinv(cov, hermitian=True) + for cov in values_cov ] next_params = self.next_params( - model, curr_params, r_list, Sigma_inv_list, Sigma_list, random_effects, + model=model, + curr_params=curr_params, + partial_residuals=partial_residuals, + values_cov_inv=values_cov_inv, + vaules_cov=values_cov, + random_effects=random_effects, ) - if use_error: - me_params = self.meparams_from_emparams(curr_params, beta) - # error = values - estimates - error = model.error(me_params, use_big_model) - if convergence_criterion == "prop-offset": - conv_estimate = em_prop_offset_conv_estimate( - curr_params, error, model, + if convergence_criterion == "params": + convergence_val = next_params.to_vec() + elif convergence_criterion == "squared-error": + estimates = np.concatenate([ # 
estimated values + basis_eval @ (mean + random_effect) + for basis_eval, random_effect in zip( + model.basis_evaluations, random_effects, ) - elif convergence_criterion in [ - "square-error", "square-error-big", - ]: - conv_estimate = em_square_error_conv_estimate(error) - else: - raise ValueError("Invalid minimization method.") - elif convergence_criterion == "params": - conv_estimate = next_params.to_vec() + ]) + error = big_values - estimates + convergence_val = np.inner(error, error) # sum of squares elif convergence_criterion == "loglikelihood": - me_params = self.meparams_from_emparams(curr_params, beta) - conv_estimate = model.profile_loglikelihood( - me_params, has_beta=True, - ) - else: - raise ValueError("Invalid minimization method.") - - if iter_number > 0: - if convergence_criterion != "prop-offset": - converged = np.allclose( - conv_estimate, prev_conv_estimate, rtol=rtol, + convergence_val = model.profile_loglikelihood( + _MixedEffectsParamsResult( + mean=mean, + covariance=next_params.covariance, + sigmasq=next_params.sigmasq, ) - else: - converged = conv_estimate < rtol + ) + + if prev_convergence_val is not None: + converged = np.allclose( + convergence_val, prev_convergence_val, rtol=rtol, + ) if converged: break - prev_conv_estimate = conv_estimate + prev_convergence_val = convergence_val - if not converged: message = f"EM algorithm did not converge ({niter=})." # raise RuntimeError(f"EM algorithm did not converge ({niter=}).") @@ -702,18 +795,21 @@ def fit( f"{iter_number}/{niter} iterations." 
) - curr_params = next_params - Sigma_list = self.Sigma_list(model, curr_params) - beta = self.beta(model, Sigma_list) - self.result = {"success": converged, "message": message} - self.params = MEParams( - beta=beta, - model=model, - L=np.linalg.cholesky(curr_params.Gamma/curr_params.sigmasq), + self.result = { + "success": converged, + "message": message, + "nit": iter_number, + } + self.fitted_model = model + + final_params = next_params + values_cov = model.values_covariances( + curr_params.sigmasq, curr_params.covariance, ) - self.params_result = MEParamsResult( - beta=beta, - Gamma=curr_params.Gamma, - sigmasq=curr_params.sigmasq, + final_mean = self._mean(model, values_cov) + self.fitted_params = _MixedEffectsParamsResult( + mean=final_mean, + covariance=final_params.covariance, + sigmasq=final_params.sigmasq, ) return self diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py index a06944b80..ea5f0ebcc 100644 --- a/skfda/tests/test_mixed_effects_converter.py +++ b/skfda/tests/test_mixed_effects_converter.py @@ -3,14 +3,18 @@ import numpy as np import numpy.typing as npt from typing import ( + Any, Callable, Iterable, Literal, + List, Tuple, + Type, ) import matplotlib.pyplot as plt from skfda import FDataBasis +from skfda.misc.scoring import r2_score from skfda.representation import ( FDataBasis, FDataIrregular, @@ -22,6 +26,8 @@ ) from skfda.representation.conversion._mixed_effects import ( MinimizeMixedEffectsConverter, + MixedEffectsConverter, + EMMixedEffectsConverter, _get_values_list, _get_basis_evaluations_list, _MixedEffectsModel, @@ -35,6 +41,7 @@ def test_loglikelihood() -> None: + """Test loglikelihood function comparing it with Statsmodels' MixedLM.""" n_measurements = 200 n_measurements_per_function = 5 fdatairregular = FDataIrregular( @@ -103,6 +110,7 @@ def test_loglikelihood() -> None: def test_values_list() -> None: + """Test conversion from FDataIrregular to ME model: values.""" 
fdatairregular = _fdatairregular x_list = _get_values_list(fdatairregular) expected_x_list = [ @@ -115,6 +123,7 @@ def test_values_list() -> None: def test_basis_evaluations_list() -> None: + """Test conversion from FDataIrregular to ME model: basis evaluations.""" fdatairregular = _fdatairregular basis = FourierBasis(n_basis=3, domain_range=(0, 10)) phi_list = _get_basis_evaluations_list(fdatairregular, basis) @@ -174,31 +183,20 @@ def _get_points( domain_range: Tuple[float, float], n_points: int, n_samples: int, - type_gen_points: int | Literal["equally_spaced", "random_uniform"], - n_points_per_sample_range: tuple[int, int] = (1, 6), + type_gen_points: int, ) -> npt.NDArray[np.float_]: - if type_gen_points == "equally_spaced": - ret_value = np.tile( - np.linspace(*domain_range, n_points).reshape((-1, 1)), - (n_samples, 1), - ) - elif type_gen_points == "random_uniform": - ret_value = np.random.uniform( - *domain_range, size=n_points * n_samples, - ).reshape((-1, 1)) - elif isinstance(type_gen_points, int): - n = type_gen_points - tot_n_points = n_points * n_samples - domain_split = np.linspace(*domain_range, n + 1) - domains = list(zip(domain_split[:-1], domain_split[1:])) - points_list = [ - np.random.uniform( - domain[0] - 0.6 * (domain[1] - domain[0]), - domain[1] + 0.6 * (domain[1] - domain[0]), - size=tot_n_points // n) - for domain in domains - ] - ret_value = np.concatenate(points_list).reshape((-1, 1))[:tot_n_points] + n = type_gen_points + tot_n_points = n_points * n_samples + domain_split = np.linspace(*domain_range, n + 1) + domains = list(zip(domain_split[:-1], domain_split[1:])) + points_list = [ + np.random.uniform( + domain[0] - 0.6 * (domain[1] - domain[0]), + domain[1] + 0.6 * (domain[1] - domain[0]), + size=tot_n_points // n) + for domain in domains + ] + ret_value = np.concatenate(points_list).reshape((-1, 1))[:tot_n_points] return ( ret_value @@ -209,7 +207,7 @@ def _get_points( ) -# def test_simple_conversion() -> None: +# def 
__test_simple_conversion() -> None: # """Visual test.""" # _max_val = 10 # _domain_range = (0, 10) @@ -281,3 +279,98 @@ def _get_points( # basis.plot(axes=axes) # plt.show() + + +def _cmp_estimation_with_original( + n_points: int, + sigma: float, # to generate the noise + domain_range: Tuple[float, float], + funcs: List[Callable[[NDArrayFloat], NDArrayFloat]], + type_gen_points: int, + estimator: MixedEffectsConverter, + fit_kwargs: dict[str, Any], + fdatabasis_original: FDataBasis, +) -> None: + n_samples = len(funcs) + points = _get_points(domain_range, n_points, n_samples, type_gen_points) + fdatairregular = _create_irregular_samples( + funcs=funcs, + points=points, + noise_generate_std=sigma, + n_points=n_points, + ) + + fdatabasis_estimated = estimator.fit_transform( + fdatairregular, **fit_kwargs, + ) + + if ( + isinstance(estimator.result, dict) + and "success" in estimator.result + and not estimator.result["success"] + ): + raise Exception(f"Optimization failed: {estimator.result}") + + assert r2_score(fdatabasis_estimated, fdatabasis_original) > 0.9 + + +def _test_compare_with_original( + estimator_cls: Type[MixedEffectsConverter], + fit_kwargs: dict[str, Any] = {}, +) -> None: + np.random.seed(34285676) + domain_range = (0, 100) + _max_val = 5 + n_points = 7 + n_basis = 3 + n_samples = 40 + + basis = BSplineBasis( + n_basis=n_basis, domain_range=domain_range, order=2, + ) + sigma = 0.1 + fe_cov_sqrt = np.zeros((n_basis, n_basis)) + fe_cov_sqrt[np.tril_indices(n_basis)] = np.random.rand( + n_basis * (n_basis + 1) // 2, + ) * _max_val + fe_cov = fe_cov_sqrt @ fe_cov_sqrt.T + mean = np.array([-15, 20, 6]) + fdatabasis_original = FDataBasis( + basis=basis, + coefficients=np.random.multivariate_normal( + mean=mean, cov=fe_cov, size=n_samples, + ), + ) + + def fun(i: int): + return lambda x: fdatabasis_original[i](x).reshape(x.shape) + + funcs = [fun(i) for i in range(n_samples)] + + _cmp_estimation_with_original( + n_points=n_points, + sigma=sigma, + 
funcs=funcs, + type_gen_points=5, + estimator=estimator_cls(basis=basis), + domain_range=domain_range, + fit_kwargs=fit_kwargs, + fdatabasis_original=fdatabasis_original, + ) + + +# def test_compare_with_statsmodels_minimize() -> None: +# _test_general_compare_with_original( +# MinimizeMixedEffectsConverter, +# ) + + +def test_compare_with_statsmodels_em() -> None: + _test_compare_with_original( + estimator_cls=EMMixedEffectsConverter, + fit_kwargs={ + "niter": 500, + "convergence_criterion": "params", + "rtol": 1e-3, + } + ) From 21c7f174a250fd937bacf6e706160c3124f7a882 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 16 Mar 2024 20:09:36 +0100 Subject: [PATCH 05/48] Naming and comments --- .../conversion/_mixed_effects.py | 105 ++++++++++-------- skfda/tests/test_mixed_effects_converter.py | 13 ++- 2 files changed, 67 insertions(+), 51 deletions(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index 3cf005892..67190e683 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -207,7 +207,7 @@ class _MixedEffectsModel: values: List[NDArrayFloat] basis_evaluations: List[NDArrayFloat] - _n_measurements: int + n_measurements: int _profile_loglikelihood_additive_constants: float def __init__( @@ -219,11 +219,11 @@ def __init__( self.basis_evaluations = _get_basis_evaluations_list( fdatairregular, basis, ) - self._n_measurements = len(fdatairregular.points) + self.n_measurements = len(fdatairregular.points) self._profile_loglikelihood_additive_constants = ( - + self._n_measurements / 2 * np.log(self._n_measurements) - - self._n_measurements / 2 * np.log(2 * np.pi) - - self._n_measurements / 2 + + self.n_measurements / 2 * np.log(self.n_measurements) + - self.n_measurements / 2 * np.log(2 * np.pi) + - self.n_measurements / 2 ) def _dim_effects(self) -> int: @@ -357,7 +357,7 @@ def profile_loglikelihood( return ( - 
sum_logdet_V / 2 - - self._n_measurements / 2 * log_sum_mahalanobis + - self.n_measurements / 2 * log_sum_mahalanobis + self._profile_loglikelihood_additive_constants ) @@ -367,12 +367,12 @@ def n_samples(self) -> int: return len(self.values) -class MixedEffectsConverter(_ToBasisConverter[FDataIrregular]): +class _MixedEffectsConverter(_ToBasisConverter[FDataIrregular]): """Mixed effects to-basis-converter.""" # after fitting: fitted_model: Optional[_MixedEffectsModel] - fitted_params: Optional[_MixedEffectsParamsResult] + fitted_params: Optional[_MixedEffectsParams] result: Optional[Dict[str, Any] | scipy.optimize.OptimizeResult] def __init__( @@ -388,12 +388,13 @@ def transform( self, X: FDataIrregular, ) -> FDataBasis: + """Transform to FDataBasis using the fitted converter.""" if self.fitted_params is None: # or self.model is None: raise ValueError("The converter has not been fitted.") - X_model = _MixedEffectsModel(X, self.basis) + model = _MixedEffectsModel(X, self.basis) mean = self.fitted_params.mean - gamma_estimates = X_model.random_effects_estimate(self.fitted_params) + gamma_estimates = model.random_effects_estimate(self.fitted_params) coefficients = mean[np.newaxis, :] + gamma_estimates @@ -403,7 +404,7 @@ def transform( ) -class MinimizeMixedEffectsConverter(MixedEffectsConverter): +class MinimizeMixedEffectsConverter(_MixedEffectsConverter): """Mixed effects to-basis-converter using scipy.optimize. Minimizes the profile loglikelihood of the mixed effects model as proposed @@ -421,13 +422,13 @@ class _Params: case mean=None (will be None otherwise). 
""" - L: NDArrayFloat + sqrt_cov_div_sigmasq: NDArrayFloat _mean: Optional[NDArrayFloat] _model: Optional[_MixedEffectsModel] def __init__( self, - L: NDArrayFloat, + sqrt_cov_div_sigmasq: NDArrayFloat, mean: Optional[NDArrayFloat], model: Optional[_MixedEffectsModel] = None, ) -> None: @@ -435,7 +436,8 @@ def __init__( assert model is not None, "model is required if mean is None" # must use object.__setattr__ due to frozen=True - object.__setattr__(self, "L", L) + object.__setattr__( + self, "sqrt_cov_div_sigmasq", sqrt_cov_div_sigmasq) object.__setattr__(self, "_mean", mean) object.__setattr__(self, "_model", model) @@ -463,21 +465,24 @@ def mean(self) -> NDArrayFloat: @property def covariance_div_sigmasq(self) -> NDArrayFloat: - return self.L @ self.L.T + """Covariance of the random effects divided by sigmasq.""" + return self.sqrt_cov_div_sigmasq @ self.sqrt_cov_div_sigmasq.T @property def covariance(self) -> NDArrayFloat: + """Covariance of the random effects.""" return self.covariance_div_sigmasq * self.sigmasq @property def sigmasq(self) -> float: + """Variance of the residuals.""" assert self._model is not None, "Model is required" return _sum_mahalanobis( self._model.partial_residuals(self.mean), self._model.values_covariances_div_sigmasq( self.covariance_div_sigmasq, ), - ) / self._model._n_measurements # type: ignore + ) / self._model.n_measurements # type: ignore @classmethod def from_vec( @@ -489,16 +494,24 @@ def from_vec( ) -> Self: """Create Params from vectorized parameters.""" mean = vec[:dim_effects] if has_mean else None - L_vec_len = dim_effects * (dim_effects + 1) // 2 - L = np.zeros((dim_effects, dim_effects)) - L[np.tril_indices(dim_effects)] = vec[-L_vec_len:] - return cls(mean=mean, L=L, model=model) + sqrt_cov_vec_len = dim_effects * (dim_effects + 1) // 2 + sqrt_cov_div_sigmasq = np.zeros((dim_effects, dim_effects)) + sqrt_cov_div_sigmasq[np.tril_indices(dim_effects)] = ( + vec[-sqrt_cov_vec_len:] + ) + return cls( + mean=mean, + 
sqrt_cov_div_sigmasq=sqrt_cov_div_sigmasq, + model=model, + ) def to_vec(self) -> NDArrayFloat: """Vectorize parameters.""" return np.concatenate([ self._mean if self._mean is not None else np.array([]), - self.L[np.tril_indices(self.L.shape[0])] + self.sqrt_cov_div_sigmasq[ + np.tril_indices(self.sqrt_cov_div_sigmasq.shape[0]) + ], ]) def fit( @@ -520,6 +533,8 @@ def fit( initial_params: initial params of the model. minimization_methods: scipy.optimize.minimize method to be used for the minimization of the loglikelihood of the model. + has_mean: Whether the mean is a fixed parameter to be optimized or + estimated with ML estimator from the covariance parameters. Returns: self after fit @@ -534,7 +549,9 @@ def fit( else: initial_params_generic = _initial_params(dim_effects) initial_params_vec = MinimizeMixedEffectsConverter._Params( - L=np.linalg.cholesky(initial_params_generic.covariance), + sqrt_cov_div_sigmasq=np.linalg.cholesky( + initial_params_generic.covariance, + ), mean=initial_params_generic.mean if has_mean else None, model=model, ).to_vec() @@ -570,7 +587,7 @@ def objective_function(params_vec: NDArrayFloat) -> float: return self -class EMMixedEffectsConverter(MixedEffectsConverter): +class EMMixedEffectsConverter(_MixedEffectsConverter): """Mixed effects to-basis-converter using the EM algorithm.""" @dataclass(frozen=True) class _Params: @@ -584,6 +601,7 @@ def covariance_div_sigmasq(self) -> NDArrayFloat: return self.covariance / self.sigmasq def to_vec(self) -> NDArrayFloat: + """Vectorize parameters.""" return np.concatenate([ np.array([self.sigmasq]), self.covariance[np.tril_indices(self.covariance.shape[0])], @@ -602,6 +620,7 @@ def from_vec( return cls(sigmasq=sigmasq, covariance=covariance) def len_vec(self) -> int: + """Length of the vectorized parameters.""" dim_effects = self.covariance.shape[0] return 1 + dim_effects * (dim_effects + 1) // 2 @@ -625,13 +644,12 @@ def _mean( assume_a="pos", ) - def next_params( + def _next_params( self, 
model: _MixedEffectsModel, curr_params: EMMixedEffectsConverter._Params, partial_residuals: List[NDArrayFloat], - values_cov_inv: List[NDArrayFloat], - vaules_cov: List[NDArrayFloat], + values_cov: List[NDArrayFloat], random_effects: NDArrayFloat, ) -> EMMixedEffectsConverter._Params: """Return the next parameters of the EM algorithm.""" @@ -641,6 +659,9 @@ def next_params( partial_residuals, model.basis_evaluations, random_effects, ) ] + values_cov_inv = [ + np.linalg.pinv(cov, hermitian=True) for cov in values_cov + ] sum_squared_residuals = sum(np.inner(r, r) for r in residuals) sum_traces = curr_params.sigmasq * sum( # np.trace(np.eye(cov_inv.shape[0]) - params.sigmasq * cov_inv) @@ -648,7 +669,7 @@ def next_params( for cov_inv in values_cov_inv ) next_sigmasq = ( - (sum_squared_residuals + sum_traces) / model._n_measurements + (sum_squared_residuals + sum_traces) / model.n_measurements ) next_covariance = sum( np.outer(random_effect, random_effect) @@ -659,7 +680,7 @@ def next_params( ) ) for basis_eval, Sigma, random_effect in zip( - model.basis_evaluations, vaules_cov, random_effects, + model.basis_evaluations, values_cov, random_effects, ) ) / model.n_samples @@ -676,7 +697,7 @@ def fit( initial_params: Optional[ EMMixedEffectsConverter._Params | NDArrayFloat ] = None, - niter: int = 700, + maxiter: int = 700, convergence_criterion: Optional[ Literal["params", "squared-error", "loglikelihood"] ] = None, @@ -691,18 +712,18 @@ def fit( niter: maximum number of iterations. convergence_criterion: convergence criterion to use when fitting. - "params" to use relative differences between parameters. - - "squared-error" to use the square error of the estimates with - respect to the original data. - - "loglikelihood" to use the loglikelihood. + - "squared-error" to userelative changes in the squared error + of the estimated values with respect to the original data. + - "loglikelihood" to use relative changes in the loglikelihood. 
# - "prop-offset" to use the criteria proposed by Bates & # Watts 1981 (A Relative Offset Convergence Criterion for # Nonlinear Least Squares). rtol: relative tolerance for convergence. Returns: - self after fit + The converter after fitting. """ - model = self.model = _MixedEffectsModel(X, self.basis) + model = _MixedEffectsModel(X, self.basis) if initial_params is None: initial_params_generic = _initial_params(self.basis.n_basis) @@ -734,7 +755,7 @@ def fit( converged = False convergence_val: Optional[NDArrayFloat | float] = None prev_convergence_val: Optional[NDArrayFloat | float] = None - for iter_number in range(niter): + for iter_number in range(maxiter): curr_params = next_params values_cov = model.values_covariances( curr_params.sigmasq, curr_params.covariance, @@ -744,16 +765,11 @@ def fit( random_effects = model._random_effects_estimate( curr_params.covariance, values_cov, partial_residuals, ) - values_cov_inv = [ - np.linalg.pinv(cov, hermitian=True) - for cov in values_cov - ] - next_params = self.next_params( + next_params = self._next_params( model=model, curr_params=curr_params, partial_residuals=partial_residuals, - values_cov_inv=values_cov_inv, - vaules_cov=values_cov, + values_cov=values_cov, random_effects=random_effects, ) @@ -787,12 +803,11 @@ def fit( prev_convergence_val = convergence_val if not converged: - message = f"EM algorithm did not converge ({niter=})." - # raise RuntimeError(f"EM algorithm did not converge ({niter=}).") + message = f"EM algorithm did not converge ({maxiter=})." else: message = ( "EM algorithm converged after " - f"{iter_number}/{niter} iterations." + f"{iter_number}/{maxiter} iterations." 
) self.result = { diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py index ea5f0ebcc..b4cf1510e 100644 --- a/skfda/tests/test_mixed_effects_converter.py +++ b/skfda/tests/test_mixed_effects_converter.py @@ -26,7 +26,7 @@ ) from skfda.representation.conversion._mixed_effects import ( MinimizeMixedEffectsConverter, - MixedEffectsConverter, + _MixedEffectsConverter, EMMixedEffectsConverter, _get_values_list, _get_basis_evaluations_list, @@ -287,7 +287,7 @@ def _cmp_estimation_with_original( domain_range: Tuple[float, float], funcs: List[Callable[[NDArrayFloat], NDArrayFloat]], type_gen_points: int, - estimator: MixedEffectsConverter, + estimator: _MixedEffectsConverter, fit_kwargs: dict[str, Any], fdatabasis_original: FDataBasis, ) -> None: @@ -315,8 +315,8 @@ def _cmp_estimation_with_original( def _test_compare_with_original( - estimator_cls: Type[MixedEffectsConverter], - fit_kwargs: dict[str, Any] = {}, + estimator_cls: Type[_MixedEffectsConverter], + fit_kwargs: dict[str, Any] = dict(), ) -> None: np.random.seed(34285676) domain_range = (0, 100) @@ -365,11 +365,12 @@ def fun(i: int): # ) -def test_compare_with_statsmodels_em() -> None: +def test_compare_em_with_original() -> None: + """Compare the EM conversion with the original data.""" _test_compare_with_original( estimator_cls=EMMixedEffectsConverter, fit_kwargs={ - "niter": 500, + "maxiter": 500, "convergence_criterion": "params", "rtol": 1e-3, } From 5166b5cc38db80123340bd8b2bcbab27d676e877 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 16 Mar 2024 20:10:45 +0100 Subject: [PATCH 06/48] Fix docstring --- skfda/representation/conversion/_mixed_effects.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index 67190e683..89a319b2b 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ 
b/skfda/representation/conversion/_mixed_effects.py @@ -415,7 +415,9 @@ class MinimizeMixedEffectsConverter(_MixedEffectsConverter): class _Params: """Private class for the parameters of the minimization. Args: - L: (_L @ _L.T) is the Cholesky decomposition of covariance/sigmasq. + sqrt_cov_div_sigmasq: + (sqrt_cov_div_sigmasq @ sqrt_cov_div_sigmasq.T) is the Cholesky + decomposition of covariance/sigmasq. has_mean: Whether the mean is fixed or estimated with ML estimator. mean: Fixed effects (can be none). model: Mixed effects model to use for the estimation of the mean in From 2b7fd7c4fc4ad4b5a5e2aaabb19f05d2c6031678 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 16 Mar 2024 21:10:50 +0100 Subject: [PATCH 07/48] Comment and remove unused import --- skfda/representation/conversion/_mixed_effects.py | 1 + skfda/representation/conversion/_to_basis.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index 89a319b2b..b495f2932 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -445,6 +445,7 @@ def __init__( @property def mean(self) -> NDArrayFloat: + """Estimate the fixed effects (mean of the coefficients).""" if self._mean is not None: return self._mean assert self._model is not None, "Model is required" diff --git a/skfda/representation/conversion/_to_basis.py b/skfda/representation/conversion/_to_basis.py index 081e97dbb..54868e985 100644 --- a/skfda/representation/conversion/_to_basis.py +++ b/skfda/representation/conversion/_to_basis.py @@ -8,7 +8,6 @@ from __future__ import annotations from typing import ( - Generic, TypeVar, ) From ddadefc60ceb8f7760a598205f1b0cb4a111ef36 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 23 Mar 2024 21:40:53 +0100 Subject: [PATCH 08/48] Remove space and adapt FDataIrregular.integrate to correctly override 
FData.integrate --- skfda/representation/conversion/_mixed_effects.py | 2 +- skfda/representation/irregular.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index b495f2932..be3dfa55b 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -415,7 +415,7 @@ class MinimizeMixedEffectsConverter(_MixedEffectsConverter): class _Params: """Private class for the parameters of the minimization. Args: - sqrt_cov_div_sigmasq: + sqrt_cov_div_sigmasq: (sqrt_cov_div_sigmasq @ sqrt_cov_div_sigmasq.T) is the Cholesky decomposition of covariance/sigmasq. has_mean: Whether the mean is fixed or estimated with ML estimator. diff --git a/skfda/representation/irregular.py b/skfda/representation/irregular.py index 1524a02fa..f4cf5226f 100644 --- a/skfda/representation/irregular.py +++ b/skfda/representation/irregular.py @@ -600,6 +600,7 @@ def derivative( def integrate( self: T, + *, domain: Optional[DomainRange] = None, ) -> NDArrayFloat: """Integrate the FDataIrregular object. From 9f17ff3db920f01af6362dd1e828a533aa4015a0 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Mon, 1 Apr 2024 16:24:41 +0200 Subject: [PATCH 09/48] Update signature of FDataIrregular.cov to match superclass' --- skfda/representation/irregular.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/skfda/representation/irregular.py b/skfda/representation/irregular.py index f4cf5226f..dd33e0145 100644 --- a/skfda/representation/irregular.py +++ b/skfda/representation/irregular.py @@ -728,7 +728,11 @@ def var(self: T, correction: int = 0) -> T: sample_names=(None,), ) - def cov(self: T) -> T: + def cov( + self: T, + /, + correction: int = 0, + ) -> T: """Compute the covariance for a FDataIrregular object. 
Returns: From 30e11cec11869cc56d2d7973a58c7ca38d75e8de Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sun, 7 Apr 2024 13:52:14 +0200 Subject: [PATCH 10/48] conversion documentation --- docs/modules/representation.rst | 9 +++++++++ docs/modules/representation/conversion.rst | 20 ++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 docs/modules/representation/conversion.rst diff --git a/docs/modules/representation.rst b/docs/modules/representation.rst index 4f00b3fde..51375d20c 100644 --- a/docs/modules/representation.rst +++ b/docs/modules/representation.rst @@ -132,6 +132,15 @@ interval using extrapolation methods. representation/extrapolation +Conversion +------------ +Convert irregular data to basis representation using mixed effects models. + +.. toctree:: + :maxdepth: 4 + + representation/conversion + Deprecated Classes ---------------------- diff --git a/docs/modules/representation/conversion.rst b/docs/modules/representation/conversion.rst new file mode 100644 index 000000000..d1e1c0694 --- /dev/null +++ b/docs/modules/representation/conversion.rst @@ -0,0 +1,20 @@ +Conversion between representations +================================== + +This module contains classes (converters) for converting between different +representations. Currently only the conversion between :class:`FDataIrregular` +and :class:`FDataBasis` has been implemented via converters. + +:class:`FDataIrregular` to :class:`FDataBasis` +---------------------------------------------- + +The following classes are used for converting irregular functional data to +basis representation using the mixed effects model. + +.. 
autosummary:: + :toctree: autosummary + + skfda.representation.conversion.EMMixedEffectsConverter + skfda.representation.conversion.MinimizeMixedEffectsConverter + skfda.representation.conversion.MixedEffectsConverter + From d3b26b538450ff78d4ae4490ca055ae926db6087 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sun, 7 Apr 2024 13:54:57 +0200 Subject: [PATCH 11/48] Rename classes to make them public and add `FDataIrregular.to_basis` method --- skfda/representation/conversion/__init__.py | 22 ++++++++ .../conversion/_mixed_effects.py | 52 +++++++++++-------- skfda/representation/irregular.py | 51 +++++++++++++++--- skfda/tests/test_mixed_effects_converter.py | 15 +++--- 4 files changed, 105 insertions(+), 35 deletions(-) diff --git a/skfda/representation/conversion/__init__.py b/skfda/representation/conversion/__init__.py index e69de29bb..6f28a04dd 100644 --- a/skfda/representation/conversion/__init__.py +++ b/skfda/representation/conversion/__init__.py @@ -0,0 +1,22 @@ +"""Conversion.""" +from typing import TYPE_CHECKING + +import lazy_loader as lazy + +__getattr__, __dir__, __all__ = lazy.attach( + __name__, + submod_attrs={ + "_mixed_effects": [ + "EMMixedEffectsConverter", + "MixedEffectsConverter", + "MinimizeMixedEffectsConverter" + ], + }, +) + +if TYPE_CHECKING: + from ._mixed_effects import ( + EMMixedEffectsConverter, + MinimizeMixedEffectsConverter, + MixedEffectsConverter, + ) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index be3dfa55b..70de08ae7 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -9,6 +9,7 @@ """ from __future__ import annotations +from abc import ABC from dataclasses import dataclass from typing import ( Any, @@ -149,22 +150,18 @@ class _MixedEffectsParams(Protocol): @property def covariance(self) -> NDArrayFloat: """Covariance of the mixed effects.""" - ... 
@property def sigmasq(self) -> float: """Variance of the residuals.""" - ... @property def covariance_div_sigmasq(self) -> NDArrayFloat: """Covariance of the mixed effects.""" - ... @property def mean(self) -> NDArrayFloat: """Fixed effects.""" - ... @dataclass(frozen=True) @@ -176,6 +173,7 @@ class _MixedEffectsParamsResult: @property def covariance_div_sigmasq(self) -> NDArrayFloat: + """covariance/sigmasq of the mixed effects model.""" return self.covariance / self.sigmasq @@ -367,8 +365,11 @@ def n_samples(self) -> int: return len(self.values) -class _MixedEffectsConverter(_ToBasisConverter[FDataIrregular]): - """Mixed effects to-basis-converter.""" +class MixedEffectsConverter(_ToBasisConverter[FDataIrregular], ABC): + """Abstract class for mixed effects to-basis-converters. + + TODO: explain the model in detail. + """ # after fitting: fitted_model: Optional[_MixedEffectsModel] @@ -401,10 +402,15 @@ def transform( return FDataBasis( basis=self.basis, coefficients=coefficients, + dataset_name=X.dataset_name, + argument_names=X.argument_names, + coordinate_names=X.coordinate_names, + sample_names=X.sample_names, + extrapolation=X.extrapolation, ) -class MinimizeMixedEffectsConverter(_MixedEffectsConverter): +class MinimizeMixedEffectsConverter(MixedEffectsConverter): """Mixed effects to-basis-converter using scipy.optimize. Minimizes the profile loglikelihood of the mixed effects model as proposed @@ -412,8 +418,9 @@ class MinimizeMixedEffectsConverter(_MixedEffectsConverter): """ @dataclass(frozen=True) - class _Params: + class Params: """Private class for the parameters of the minimization. 
+ Args: sqrt_cov_div_sigmasq: (sqrt_cov_div_sigmasq @ sqrt_cov_div_sigmasq.T) is the Cholesky @@ -523,7 +530,7 @@ def fit( y: object = None, *, initial_params: Optional[ - MinimizeMixedEffectsConverter._Params | NDArrayFloat + MinimizeMixedEffectsConverter.Params | NDArrayFloat ] = None, minimization_method: Optional[str] = None, has_mean: bool = True, @@ -545,13 +552,13 @@ def fit( dim_effects = self.basis.n_basis model = _MixedEffectsModel(X, self.basis) n_samples = X.n_samples - if isinstance(initial_params, MinimizeMixedEffectsConverter._Params): + if isinstance(initial_params, MinimizeMixedEffectsConverter.Params): initial_params_vec = initial_params.to_vec() elif initial_params is not None: initial_params_vec = initial_params else: initial_params_generic = _initial_params(dim_effects) - initial_params_vec = MinimizeMixedEffectsConverter._Params( + initial_params_vec = MinimizeMixedEffectsConverter.Params( sqrt_cov_div_sigmasq=np.linalg.cholesky( initial_params_generic.covariance, ), @@ -564,7 +571,7 @@ def fit( def objective_function(params_vec: NDArrayFloat) -> float: return - model.profile_loglikelihood( - params=MinimizeMixedEffectsConverter._Params.from_vec( + params=MinimizeMixedEffectsConverter.Params.from_vec( params_vec, dim_effects, model=self, has_mean=has_mean, ) ) / n_samples @@ -575,7 +582,7 @@ def objective_function(params_vec: NDArrayFloat) -> float: minimization_methods=minimization_method, ) self.fitted_model = model - params = MinimizeMixedEffectsConverter._Params.from_vec( + params = MinimizeMixedEffectsConverter.Params.from_vec( self.result.x, dim_effects=dim_effects, model=model, @@ -590,10 +597,10 @@ def objective_function(params_vec: NDArrayFloat) -> float: return self -class EMMixedEffectsConverter(_MixedEffectsConverter): +class EMMixedEffectsConverter(MixedEffectsConverter): """Mixed effects to-basis-converter using the EM algorithm.""" @dataclass(frozen=True) - class _Params: + class Params: """Mixed effects parameters for the EM 
algorithm.""" sigmasq: float covariance: NDArrayFloat @@ -615,7 +622,7 @@ def from_vec( cls, vec: NDArrayFloat, dim_effects: int, - ) -> EMMixedEffectsConverter._Params: + ) -> EMMixedEffectsConverter.Params: """Create Params from vectorized parameters.""" sigmasq = vec[0] covariance = np.zeros((dim_effects, dim_effects)) @@ -650,11 +657,11 @@ def _mean( def _next_params( self, model: _MixedEffectsModel, - curr_params: EMMixedEffectsConverter._Params, + curr_params: EMMixedEffectsConverter.Params, partial_residuals: List[NDArrayFloat], values_cov: List[NDArrayFloat], random_effects: NDArrayFloat, - ) -> EMMixedEffectsConverter._Params: + ) -> EMMixedEffectsConverter.Params: """Return the next parameters of the EM algorithm.""" residuals = [ r - basis_eval @ random_effect @@ -687,7 +694,7 @@ def _next_params( ) ) / model.n_samples - return EMMixedEffectsConverter._Params( + return EMMixedEffectsConverter.Params( sigmasq=next_sigmasq, covariance=next_covariance, ) @@ -698,7 +705,7 @@ def fit( y: object = None, *, initial_params: Optional[ - EMMixedEffectsConverter._Params | NDArrayFloat + EMMixedEffectsConverter.Params | NDArrayFloat ] = None, maxiter: int = 700, convergence_criterion: Optional[ @@ -714,6 +721,7 @@ def fit( initial_params: initial params of the model. niter: maximum number of iterations. convergence_criterion: convergence criterion to use when fitting. + - "params" to use relative differences between parameters. - "squared-error" to userelative changes in the squared error of the estimated values with respect to the original data. 
@@ -730,12 +738,12 @@ def fit( if initial_params is None: initial_params_generic = _initial_params(self.basis.n_basis) - next_params = EMMixedEffectsConverter._Params( + next_params = EMMixedEffectsConverter.Params( sigmasq=initial_params_generic.sigmasq, covariance=initial_params_generic.covariance, ) elif isinstance(initial_params, np.ndarray): - next_params = EMMixedEffectsConverter._Params.from_vec( + next_params = EMMixedEffectsConverter.Params.from_vec( initial_params, dim_effects=self.basis.n_basis, ) else: diff --git a/skfda/representation/irregular.py b/skfda/representation/irregular.py index dd33e0145..3bb335ec9 100644 --- a/skfda/representation/irregular.py +++ b/skfda/representation/irregular.py @@ -11,7 +11,7 @@ import numbers from typing import ( Any, - Callable, + Literal, Optional, Sequence, Tuple, @@ -24,6 +24,10 @@ import pandas.api.extensions from matplotlib.figure import Figure +from ..representation.conversion import ( + EMMixedEffectsConverter, + MinimizeMixedEffectsConverter, +) from .._utils import _cartesian_product, _check_array_key, _to_grid_points from ..typing._base import ( DomainRange, @@ -34,7 +38,6 @@ ) from ..typing._numpy import ( ArrayLike, - DTypeLike, NDArrayBool, NDArrayFloat, NDArrayInt, @@ -47,6 +50,9 @@ from .interpolation import SplineInterpolation T = TypeVar("T", bound='FDataIrregular') +IrregularToBasisConversionType = Literal[ + "separately", "mixed_effects", "mixed_effects_minimize", +] ###################### # Auxiliary functions# @@ -1052,14 +1058,38 @@ def scatter(self, *args: Any, **kwargs: Any) -> Figure: return ScatterPlotIrregular(self, *args, **kwargs).plot() - def to_basis(self, basis: Basis, **kwargs: Any) -> FDataBasis: + def to_basis( + self, + basis: Basis, + *, + conversion_type: IrregularToBasisConversionType = "separately", + **kwargs: Any, + ) -> FDataBasis: """Return the basis representation of the object. Args: basis (Basis): basis object in which the functional data are going to be represented. 
- kwargs: keyword arguments to be passed to - FDataBasis.from_data(). + conversion_type: method to use for the conversion: + + - "separately": (default) each curve is converted independently + (meaning that only the information of each curve is used + for its conversion) with + :class:`~skfda.preprocessing.smoothing.BasisSmoother`. + - "mixed_effects": all curves are converted jointly (this means + that the information of all curves is used to convert each + one) using the EM algorithm to fit the mixed effects + model: + :class:`~skfda.representation.conversion.EMMixedEffectsConverter`. + - "mixed_effects_minimize": all curves are converted jointly + using the scipy.optimize.minimize to fit the mixed effects + model: + :class:`~skfda.representation.conversion.MinimizeMixedEffectsConverter`. + kwargs: keyword arguments to be passed to FDataBasis.from_data() + in the case of conversion_type="separately". If conversion_type + has another value, the keyword arguments are passed to the fit + method of the + :class:`~skfda.representation.conversion.MixedEffectsConverter`. Raises: ValueError: Incorrect domain dimension @@ -1069,8 +1099,6 @@ def to_basis(self, basis: Basis, **kwargs: Any) -> FDataBasis: FDataBasis: Basis representation of the funtional data object.
""" - from ..preprocessing.smoothing import BasisSmoother - if self.dim_domain != basis.dim_domain: raise ValueError( f"The domain of the function has " @@ -1090,6 +1118,15 @@ def to_basis(self, basis: Basis, **kwargs: Any) -> FDataBasis: if not basis.is_domain_range_fixed(): basis = basis.copy(domain_range=self.domain_range) + if conversion_type != "separately": + converter_class = ( + EMMixedEffectsConverter if conversion_type == "mixed_effects" + else MinimizeMixedEffectsConverter + ) + converter = converter_class(basis) + return converter.fit_transform(self, **kwargs) + + from ..preprocessing.smoothing import BasisSmoother smoother = BasisSmoother( basis=basis, **kwargs, diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py index b4cf1510e..343306232 100644 --- a/skfda/tests/test_mixed_effects_converter.py +++ b/skfda/tests/test_mixed_effects_converter.py @@ -6,8 +6,8 @@ Any, Callable, Iterable, - Literal, List, + Optional, Tuple, Type, ) @@ -26,7 +26,7 @@ ) from skfda.representation.conversion._mixed_effects import ( MinimizeMixedEffectsConverter, - _MixedEffectsConverter, + MixedEffectsConverter, EMMixedEffectsConverter, _get_values_list, _get_basis_evaluations_list, @@ -101,7 +101,7 @@ def test_loglikelihood() -> None: ] for params_vec, mixedlm_loglikelihood in params_loglike_list: - params = MinimizeMixedEffectsConverter._Params.from_vec( + params = MinimizeMixedEffectsConverter.Params.from_vec( params_vec, basis.n_basis, model, ) model_loglikelihood = model.profile_loglikelihood(params) @@ -287,7 +287,7 @@ def _cmp_estimation_with_original( domain_range: Tuple[float, float], funcs: List[Callable[[NDArrayFloat], NDArrayFloat]], type_gen_points: int, - estimator: _MixedEffectsConverter, + estimator: MixedEffectsConverter, fit_kwargs: dict[str, Any], fdatabasis_original: FDataBasis, ) -> None: @@ -315,10 +315,13 @@ def _cmp_estimation_with_original( def _test_compare_with_original( - estimator_cls: 
Type[_MixedEffectsConverter], - fit_kwargs: dict[str, Any] = dict(), + estimator_cls: Type[MixedEffectsConverter], + fit_kwargs: Optional[dict[str, Any]] = None, ) -> None: np.random.seed(34285676) + if fit_kwargs is None: + fit_kwargs = {} + domain_range = (0, 100) _max_val = 5 n_points = 7 From 313a0012223084550f801ddab7cacff520add641 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 13 Apr 2024 12:26:46 +0200 Subject: [PATCH 12/48] Fix imports --- skfda/representation/irregular.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skfda/representation/irregular.py b/skfda/representation/irregular.py index 3bb335ec9..db26ebda6 100644 --- a/skfda/representation/irregular.py +++ b/skfda/representation/irregular.py @@ -24,10 +24,6 @@ import pandas.api.extensions from matplotlib.figure import Figure -from ..representation.conversion import ( - EMMixedEffectsConverter, - MinimizeMixedEffectsConverter, -) from .._utils import _cartesian_product, _check_array_key, _to_grid_points from ..typing._base import ( DomainRange, @@ -1119,6 +1115,10 @@ def to_basis( basis = basis.copy(domain_range=self.domain_range) if conversion_type != "separately": + from ..representation.conversion import ( + EMMixedEffectsConverter, + MinimizeMixedEffectsConverter, + ) converter_class = ( EMMixedEffectsConverter if conversion_type == "mixed_effects" else MinimizeMixedEffectsConverter From 0bb641b9acb5f258af7759563b9d73397edf4f1b Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 13 Apr 2024 17:04:07 +0200 Subject: [PATCH 13/48] irregularly sample FData objects --- skfda/datasets/__init__.py | 6 ++ skfda/datasets/_sample_from_fdata.py | 99 ++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 skfda/datasets/_sample_from_fdata.py diff --git a/skfda/datasets/__init__.py b/skfda/datasets/__init__.py index 8abd0dfda..141d877a0 100644 --- a/skfda/datasets/__init__.py +++ b/skfda/datasets/__init__.py @@ -30,6 +30,9 @@ 
"make_random_warping", "make_sinusoidal_process", ], + "_sample_from_fdata": [ + "irregular_sample", + ], }, ) @@ -58,3 +61,6 @@ make_random_warping as make_random_warping, make_sinusoidal_process as make_sinusoidal_process, ) + from ._sample_from_fdata import ( + irregular_sample + ) diff --git a/skfda/datasets/_sample_from_fdata.py b/skfda/datasets/_sample_from_fdata.py new file mode 100644 index 000000000..fbbe29686 --- /dev/null +++ b/skfda/datasets/_sample_from_fdata.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +from typing import Callable, Iterable +from functools import singledispatch + +import numpy as np +from ..representation import FData, FDataGrid, FDataBasis, FDataIrregular +from ..representation.interpolation import SplineInterpolation +from ..typing._base import DomainRangeLike, GridPointsLike, RandomStateLike +from ..typing._numpy import NDArrayFloat + + +def _irregular_sample_from_callable( + funcs: Iterable[Callable[[NDArrayFloat], NDArrayFloat]], + points_matrix: NDArrayFloat, + noise_stddev: float, +) -> FDataIrregular: + """Sample from a list of functions at irregular points. + + Args: + funcs: List of functions to sample. + points_matrix: of shape (n_funcs, n_points_per_function). Points where + to measure each function sample. + noise_stddev: Standard deviation of the noise. + """ + assert points_matrix.ndim == 2 + n_points_per_curve = points_matrix.shape[1] + total_n_points = points_matrix.shape[0] * n_points_per_curve + return FDataIrregular( + points=points_matrix.reshape(-1), + start_indices=np.array(range(0, total_n_points, n_points_per_curve)), + values=np.concatenate([ + func(func_points).reshape(-1) + for func, func_points in zip(funcs, points_matrix) + ]) + np.random.normal(scale=noise_stddev, size=total_n_points), + ) + + +def irregular_sample( + fdata: FDataGrid | FDataBasis, + n_points_per_curve: int, + noise_stddev: float = 0.0, +) -> FDataIrregular: + """Irregularly sample from a FDataGrid or FDataBasis object. 
+ + Only implemented for 1D domains and codomains. The points are selected at + random (uniformly) from the domain of the input object. + + Args: + fdata: Functional data object to sample from. + n_points_per_curve: Number of points to sample per curve. + noise_stddev: Standard deviation of the noise. + """ + if fdata.dim_domain != 1 or fdata.dim_codomain != 1: + raise NotImplementedError( + "Only implemented for 1D domains and codomains.", + ) + + points_matrix = _irregular_sample_points_matrix( + fdata, n_points_per_curve, + ) + return _irregular_sample_from_callable( + funcs=fdata, + points_matrix=points_matrix, + noise_stddev=noise_stddev, + ) + + +@singledispatch +def _irregular_sample_points_matrix( + fdata: FDataGrid | FDataBasis, + n_points_per_curve: int, +) -> NDArrayFloat: + raise NotImplementedError( + "Only implemented for FDataGrid and FDataBasis.", + ) + + +@_irregular_sample_points_matrix.register +def _irregular_sample_points_matrix_fdatagrid( + fdata: FDataGrid, + n_points_per_curve: int, +) -> NDArrayFloat: + return np.random.choice( + fdata.grid_points[0], # This only works for 1D domains + size=(fdata.n_samples, n_points_per_curve), + replace=True, + ) + + +@_irregular_sample_points_matrix.register +def _irregular_sample_points_matrix_fdatabasis( + fdata: FDataBasis, + n_points_per_curve: int, +) -> NDArrayFloat: + return np.random.uniform( + *fdata.domain_range[0], # This only works for 1D domains + size=(fdata.n_samples, n_points_per_curve), + ) From 2030dd34dd256b59a043ad2f3d9571b62cbe1eef Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 13 Apr 2024 17:04:17 +0200 Subject: [PATCH 14/48] Examples --- docs/refs.bib | 7 + examples/plot_fdatairregular_to_basis.py | 87 ++++++++++++ .../plot_irregular_to_basis_mixed_effects.py | 125 ++++++++++++++++++ .../conversion/_mixed_effects.py | 3 +- 4 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 examples/plot_fdatairregular_to_basis.py create mode 100644 
examples/plot_irregular_to_basis_mixed_effects.py diff --git a/docs/refs.bib b/docs/refs.bib index 005bc0112..7d82543a0 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -659,3 +659,10 @@ @inbook{wasserman_2006_nonparametric langid = {english} } +@article{james_2018_sparsenessfda, + title = {Sparseness and functional data analysis}, + author = {Gareth M. James}, + journal = {Oxford Handbooks Online}, + year = {2018}, + url = {https://api.semanticscholar.org/CorpusID:14265225} +} diff --git a/examples/plot_fdatairregular_to_basis.py b/examples/plot_fdatairregular_to_basis.py new file mode 100644 index 000000000..9c5582892 --- /dev/null +++ b/examples/plot_fdatairregular_to_basis.py @@ -0,0 +1,87 @@ +""" +Irregular data to basis representation +======================================================================= + +Convert irregular data to a basis representation using the ``to_basis`` +method of the :class:`skfda.representation.irregular.FDataIrregular` class. +""" + +# Author: Pablo Cuesta Sierra +# License: MIT + +# sphinx_gallery_thumbnail_number = -1 + +import matplotlib.pyplot as plt +import numpy as np + +from skfda.datasets import fetch_weather, irregular_sample +from skfda.representation.basis import FourierBasis +from skfda.misc.scoring import r2_score + +np.random.seed(439472) # set the seed for reproducibility + +# %% +# First, the Canadian Weather dataset is downloaded from the package 'fda' in +# CRAN. It contains a FDataGrid with daily temperatures and precipitations, +# that is, it has a 2-dimensional image. We are interested only in the daily +# average temperatures, so we will use the first coordinate. +# +# As we want to illustrate the conversion of irregular data to basis +# representation, we will take an irregular sample of the temperatures dataset +# containing only 8 points per curve.
+fd_temperatures = fetch_weather().data.coordinates[0] +irregular_temperatures = irregular_sample( + fdata=fd_temperatures, n_points_per_curve=8, +) + +# %% +# To get an idea of the irregular data we will be working with, 6 of the +# irregular curves are plotted, along with the original curves +# that they come from. +fig = plt.figure() +irregular_temperatures[-6:].scatter(fig=fig) +fd_temperatures[-6:].plot(fig=fig, alpha=0.1) +plt.show() + +# %% +# Now, we will convert the irregularly sampled temperature curves to basis +# representation. Due to the periodicity of the data, we will be using a +# Fourier basis. +basis = FourierBasis(n_basis=5, domain_range=fd_temperatures.domain_range) +irregular_temperatures_converted = irregular_temperatures.to_basis( + basis, conversion_type="mixed_effects", +) + +# %% +# To visualize the conversion, we will now plot 6 of the converted +# curves (smooth basis representation) along with the original temperatures +# (non-smooth) and the irregular points that we sampled. +fig = plt.figure(figsize=(10, 14)) +for k in range(6): + axes = plt.subplot(3, 2, k + 1) + fd_temperatures.plot(axes=axes, alpha=0.05, color="black") + fd_temperatures[k].plot(axes=axes, color=f"C{k}") + irregular_temperatures_converted[k].plot(axes=axes, color=f"C{k}") + irregular_temperatures[k].scatter(axes=axes, color=f"C{k}") +plt.show() + +# %% +# Finally, we will get a score of the quality of the conversion by comparing +# the obtained basis representation (``irregular_temperatures_converted``) +# with the original data (``fd_temperatures``) from the CRAN dataset. We will +# be using the :func:`skfda.misc.scoring.r2_score`. +# +# Note that, to compare the original data and the basis representation (which +# have different :class:`FData` types), we have to evaluate the latter at +# the grid points of the former. 
+r2 = r2_score( + fd_temperatures, + irregular_temperatures_converted.to_grid(fd_temperatures.grid_points), +) +print(f"R2 score: {r2:.2f}") + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py new file mode 100644 index 000000000..f1aa1131a --- /dev/null +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -0,0 +1,125 @@ +""" +Mixed effects model to convert irregular data to basis representation +======================================================================= + +Convert irregular data to a basis representation using the mixed effects models +implemented in :class:`skfda.representation.irregular.FDataIrregular` class. +""" +# Author: Pablo Cuesta Sierra +# License: MIT + +# sphinx_gallery_thumbnail_number = -1 + +import matplotlib.pyplot as plt +import numpy as np + +from skfda import FDataBasis +from skfda.datasets import irregular_sample +from skfda.representation.basis import BSplineBasis +from skfda.representation.conversion import EMMixedEffectsConverter +from skfda.misc.scoring import r2_score, mean_squared_error + +np.random.seed(4934755) # set the seed for reproducibility + + +# %% +# For this example, we are going to simulate the irregular sampling of a +# dataset following the mixed effects model, to later attempt to reconstruct +# the original data with an +# :class:`skfda.representation.conversion.MixedEffectsConverter`. +# +# First, we create the original :class:`skfda.representation.basis.FDataBasis` +# object, whose coefficients follow the mixed effects model for irregular data +# as presented in :cite:p:`james_2018_sparsenessfda`. This just means that +# the coefficients are generated from a Gaussian distribution. Our dataset +# will contain 40 curves. 
+n_basis = 4 +domain_range = (0, 10) +basis = BSplineBasis(n_basis=n_basis, domain_range=domain_range, order=4) +basis.plot() +plt.title("Basis functions") + +coeff_mean = np.array([-10, 20, -24, 4]) +coeff_cov_sqrt = np.random.rand(n_basis, n_basis) * 5 +coeff_cov = coeff_cov_sqrt @ coeff_cov_sqrt.T # ensure positive semidefinite +coefficients = np.random.multivariate_normal( + mean=coeff_mean, cov=coeff_cov, size=40, +) + +fdatabasis_original = FDataBasis(basis, coefficients) +# Plot the first 10 curves +fdatabasis_original[:10].plot() +plt.title("Original curves") +plt.show() + + +# %% +# Secondly, we will simulate the irregular sampling of the original data +# with random noise. For each curve, we will sample 4 points from the domain. +fd_irregular = irregular_sample( + fdatabasis_original, n_points_per_curve=4, noise_stddev=0.2, +) +fig = plt.figure() +fdatabasis_original[-6:].plot(fig=fig) +fd_irregular[-6:].scatter(fig=fig, alpha=0.1) +plt.show() + +# %% +# Moreover, we will split our irregular data into two groups, the train curves +# and the test curves. We will use the train curves to fit the mixed effects +# model and the test curves to evaluate the quality of the conversion. +test_original = fdatabasis_original[::2] +train_original = fdatabasis_original[1::2] +test_irregular = fd_irregular[::2] +train_irregular = fd_irregular[1::2] + +# %% +# Now, we create and train the mixed effects converter. +converter = EMMixedEffectsConverter(basis) +converter = converter.fit(train_irregular) + +# %% +# And convert the irregular data to basis representation. +train_converted = converter.transform(train_irregular) +test_converted = converter.transform(test_irregular) + +# %% +# Let's plot the first 8 original and converted curves of the test set. +# On the background, we plot the train set.
+fig = plt.figure(figsize=(10, 15)) +for k in range(8): + axes = plt.subplot(4, 2, k + 1) + + train_original.plot(axes=axes, color=(0, 0, 0, 0.05)) + train_irregular.scatter(axes=axes, color=(0, 0, 0, 0.05), marker=".") + + test_converted[k].plot( + axes=axes, color=f"C{k}", label="Converted", + ) + test_original[k].plot( + axes=axes, color=f"C{k}", linestyle="--", label="Original", + ) + test_irregular[k].scatter( + axes=axes, color=f"C{k}", label="Irregular" + ) + plt.legend() +plt.show() + +# %% +# Finally, we will use the :math:`R^2` score and the :math:`MSE` to compare +# the converted basis representations with the original data, both for the +# train and test sets. +train_r2_score = r2_score(train_original, train_converted) +test_r2_score = r2_score(test_original, test_converted) +train_mse = mean_squared_error(train_original, train_converted) +test_mse = mean_squared_error(test_original, test_converted) +print(f"Train R2 score: {train_r2_score:.2f}") +print(f"Test R2 score: {test_r2_score:.2f}") +print(f"Train Mean Squared Error: {train_mse:.2f}") +print(f"Test Mean Squared Error: {test_mse:.2f}") + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index 70de08ae7..ed55340ab 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -722,7 +722,8 @@ def fit( niter: maximum number of iterations. convergence_criterion: convergence criterion to use when fitting. - - "params" to use relative differences between parameters. + - "params" to use relative differences between parameters + (the default). - "squared-error" to userelative changes in the squared error of the estimated values with respect to the original data. - "loglikelihood" to use relative changes in the loglikelihood. 
From 36a80924a7a9e8103584aad6403c4e1d0cbb0e79 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 13 Apr 2024 17:43:15 +0200 Subject: [PATCH 15/48] Fix irregular datasets slicing and add fetch_bone_density to documentation --- docs/modules/datasets.rst | 1 + examples/plot_fdatairregular_to_basis.py | 1 - skfda/representation/irregular.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/modules/datasets.rst b/docs/modules/datasets.rst index f5ed9f8da..48e31d0dc 100644 --- a/docs/modules/datasets.rst +++ b/docs/modules/datasets.rst @@ -22,6 +22,7 @@ The following functions are used to retrieve specific functional datasets: skfda.datasets.fetch_phoneme skfda.datasets.fetch_tecator skfda.datasets.fetch_weather + skfda.datasets.fetch_bone_density Those functions return a dictionary with at least a "data" field containing the instance data, and a "target" field containing the class labels or regression values, diff --git a/examples/plot_fdatairregular_to_basis.py b/examples/plot_fdatairregular_to_basis.py index 9c5582892..adafe7a60 100644 --- a/examples/plot_fdatairregular_to_basis.py +++ b/examples/plot_fdatairregular_to_basis.py @@ -5,7 +5,6 @@ Convert irregular data to a basis representation using the ``to_basis`` method of the :class:`skfda.representation.irregular.FDataIrregular` class. 
""" - # Author: Pablo Cuesta Sierra # License: MIT diff --git a/skfda/representation/irregular.py b/skfda/representation/irregular.py index 9115fa922..8737f50f4 100644 --- a/skfda/representation/irregular.py +++ b/skfda/representation/irregular.py @@ -1466,7 +1466,7 @@ def __getitem__( start_indices=indices.astype(int), points=arguments, values=values, - sample_names=self.sample_names[key], + sample_names=list(np.array(self.sample_names)[key]), ) ##################################################################### # Numpy methods From 2fa07cd7f355e06d3687add312ab2aa64b79832c Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Wed, 17 Apr 2024 19:06:28 +0200 Subject: [PATCH 16/48] examples --- examples/plot_fdatairregular_to_basis.py | 5 +- .../plot_irregular_to_basis_mixed_effects.py | 85 +++++++++++-------- skfda/datasets/_sample_from_fdata.py | 28 +++--- 3 files changed, 66 insertions(+), 52 deletions(-) diff --git a/examples/plot_fdatairregular_to_basis.py b/examples/plot_fdatairregular_to_basis.py index adafe7a60..ee3f8eef0 100644 --- a/examples/plot_fdatairregular_to_basis.py +++ b/examples/plot_fdatairregular_to_basis.py @@ -17,8 +17,6 @@ from skfda.representation.basis import FourierBasis from skfda.misc.scoring import r2_score -np.random.seed(439472) # set the seed for reproducibility - # %% # First, the Canadian Weather dataset is downloaded from the package 'fda' in # CRAN. It contains a FDataGrid with daily temperatures and precipitations, @@ -29,8 +27,9 @@ # representation, we will take an irregular sample of the temperatures dataset # containing only 8 points per curve. 
fd_temperatures = fetch_weather().data.coordinates[0] +random_state = np.random.RandomState(seed=4934755) irregular_temperatures = irregular_sample( - fdata=fd_temperatures, n_points_per_curve=8, + fdata=fd_temperatures, n_points_per_curve=8, random_state=random_state, ) # %% diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py index f1aa1131a..7f104e6b2 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -2,8 +2,8 @@ Mixed effects model to convert irregular data to basis representation ======================================================================= -Convert irregular data to a basis representation using the mixed effects models -implemented in :class:`skfda.representation.irregular.FDataIrregular` class. +This example converts irregular data to a basis representation using a mixed +effects model. """ # Author: Pablo Cuesta Sierra # License: MIT @@ -12,6 +12,7 @@ import matplotlib.pyplot as plt import numpy as np +from sklearn.model_selection import train_test_split from skfda import FDataBasis from skfda.datasets import irregular_sample @@ -19,34 +20,38 @@ from skfda.representation.conversion import EMMixedEffectsConverter from skfda.misc.scoring import r2_score, mean_squared_error -np.random.seed(4934755) # set the seed for reproducibility - # %% # For this example, we are going to simulate the irregular sampling of a # dataset following the mixed effects model, to later attempt to reconstruct -# the original data with an -# :class:`skfda.representation.conversion.MixedEffectsConverter`. +# the original data. # -# First, we create the original :class:`skfda.representation.basis.FDataBasis` -# object, whose coefficients follow the mixed effects model for irregular data -# as presented in :cite:p:`james_2018_sparsenessfda`. This just means that -# the coefficients are generated from a Gaussian distribution. 
Our dataset -# will contain 40 curves. +# First, we generate the original basis representation of the data following +# the mixed effects model for irregular data as presented by +# :footcite:t:`james_2018_sparsenessfda`. This just means that +# the coefficients of the basis representation are generated from a Gaussian +# distribution. +n_curves = 50 n_basis = 4 domain_range = (0, 10) basis = BSplineBasis(n_basis=n_basis, domain_range=domain_range, order=4) + basis.plot() plt.title("Basis functions") coeff_mean = np.array([-10, 20, -24, 4]) -coeff_cov_sqrt = np.random.rand(n_basis, n_basis) * 5 -coeff_cov = coeff_cov_sqrt @ coeff_cov_sqrt.T # ensure positive semidefinite -coefficients = np.random.multivariate_normal( - mean=coeff_mean, cov=coeff_cov, size=40, +cov_sqrt = np.array([ + [3.2, 0.0, 0.0, 0.0], + [0.4, 6.0, 0.0, 0.0], + [0.3, 1.5, 2.0, 0.0], + [1.2, 0.3, 2.5, 1.8], +]) +random_state = np.random.RandomState(seed=4934755) +coefficients = ( + coeff_mean + random_state.normal(size=(n_curves, n_basis)) @ cov_sqrt ) - fdatabasis_original = FDataBasis(basis, coefficients) + # Plot the first 10 curves fdatabasis_original[:10].plot() plt.title("Original curves") @@ -54,38 +59,44 @@ # %% -# Sencondly, we will simulate the irregular sampling of the original data -# with random noise. For each curve, we will sample 4 points from the domain. +# Sencondly, we simulate the irregular sampling of the original data. 
fd_irregular = irregular_sample( - fdatabasis_original, n_points_per_curve=4, noise_stddev=0.2, + fdatabasis_original, + n_points_per_curve=3, # Number of points per curve in the irregular data + random_state=random_state, ) + +# Plot the last 6 curves of the newly created irregular data fig = plt.figure() -fdatabasis_original[-6:].plot(fig=fig) -fd_irregular[-6:].scatter(fig=fig, alpha=0.1) +fdatabasis_original[-6:].plot(fig=fig, alpha=0.3) +fd_irregular[-6:].scatter(fig=fig) plt.show() # %% -# Moreover, we will split our irregular data into two groups, the train curves -# and the test curves. We will use the train curves to fit the mixed effects -# model and the test curves to evaluate the quality of the conversion. -test_original = fdatabasis_original[::2] -train_original = fdatabasis_original[1::2] -test_irregular = fd_irregular[::2] -train_irregular = fd_irregular[1::2] +# We split our irregular data into two groups, the train curves +# and the test curves. +train_original, test_original, train_irregular, test_irregular = ( + train_test_split( + fdatabasis_original, + fd_irregular, + test_size=0.5, + random_state=random_state, + ) +) # %% -# Now, we create and train the mixed effects converter. +# Now, we create and train the mixed effects converter using the train curves, converter = EMMixedEffectsConverter(basis) converter = converter.fit(train_irregular) # %% -# And convert the irregular data to basis representation. +# and we convert the irregular data to basis representation. train_converted = converter.transform(train_irregular) test_converted = converter.transform(test_irregular) # %% -# Let's plot the first 8 original and converted curves of the test set. -# On the background, we plot the train set. +# To visualize the conversion results, we plot the first 8 original and +# converted curves of the test set. On the background, we plot the train set. 
fig = plt.figure(figsize=(10, 15)) for k in range(8): axes = plt.subplot(4, 2, k + 1) @@ -106,17 +117,17 @@ plt.show() # %% -# Finally, we will use the :math:`R^2` score and the :math:`MSE` to compare +# Finally, we make use of the :math:`R^2` score and the :math:`MSE` to compare # the converted basis representations with the original data, both for the # train and test sets. train_r2_score = r2_score(train_original, train_converted) test_r2_score = r2_score(test_original, test_converted) train_mse = mean_squared_error(train_original, train_converted) test_mse = mean_squared_error(test_original, test_converted) -print(f"Train R2 score: {train_r2_score:.2f}") -print(f"Test R2 score: {test_r2_score:.2f}") -print(f"Train Mean Squared Error: {train_mse:.2f}") -print(f"Test Mean Squared Error: {test_mse:.2f}") +print(f"R2 score (train): {train_r2_score:.2f}") +print(f"R2 score (test): {test_r2_score:.2f}") +print(f"Mean Squared Error (train): {train_mse:.2f}") +print(f"Mean Squared Error (test): {test_mse:.2f}") # %% # References diff --git a/skfda/datasets/_sample_from_fdata.py b/skfda/datasets/_sample_from_fdata.py index fbbe29686..a5e790488 100644 --- a/skfda/datasets/_sample_from_fdata.py +++ b/skfda/datasets/_sample_from_fdata.py @@ -2,18 +2,18 @@ from typing import Callable, Iterable from functools import singledispatch - import numpy as np + + +from ..misc.validation import validate_random_state from ..representation import FData, FDataGrid, FDataBasis, FDataIrregular -from ..representation.interpolation import SplineInterpolation -from ..typing._base import DomainRangeLike, GridPointsLike, RandomStateLike +from ..typing._base import RandomState, RandomStateLike from ..typing._numpy import NDArrayFloat def _irregular_sample_from_callable( funcs: Iterable[Callable[[NDArrayFloat], NDArrayFloat]], points_matrix: NDArrayFloat, - noise_stddev: float, ) -> FDataIrregular: """Sample from a list of functions at irregular points. 
@@ -21,7 +21,6 @@ def _irregular_sample_from_callable( funcs: List of functions to sample. points_matrix: of shape (n_funcs, n_points_per_function). Points where to measure each function sample. - noise_stddev: Standard deviation of the noise. """ assert points_matrix.ndim == 2 n_points_per_curve = points_matrix.shape[1] @@ -32,14 +31,14 @@ def _irregular_sample_from_callable( values=np.concatenate([ func(func_points).reshape(-1) for func, func_points in zip(funcs, points_matrix) - ]) + np.random.normal(scale=noise_stddev, size=total_n_points), + ]), ) def irregular_sample( fdata: FDataGrid | FDataBasis, n_points_per_curve: int, - noise_stddev: float = 0.0, + random_state: RandomStateLike = None, ) -> FDataIrregular: """Irregularly sample from a FDataGrid or FDataBasis object. @@ -49,20 +48,22 @@ def irregular_sample( Args: fdata: Functional data object to sample from. n_points_per_curve: Number of points to sample per curve. - noise_stddev: Standard deviation of the noise. """ if fdata.dim_domain != 1 or fdata.dim_codomain != 1: raise NotImplementedError( "Only implemented for 1D domains and codomains.", ) + random_state = validate_random_state(random_state) + points_matrix = _irregular_sample_points_matrix( - fdata, n_points_per_curve, + fdata, + n_points_per_curve=n_points_per_curve, + random_state=random_state, ) return _irregular_sample_from_callable( funcs=fdata, points_matrix=points_matrix, - noise_stddev=noise_stddev, ) @@ -70,6 +71,7 @@ def irregular_sample( def _irregular_sample_points_matrix( fdata: FDataGrid | FDataBasis, n_points_per_curve: int, + random_state: RandomState, ) -> NDArrayFloat: raise NotImplementedError( "Only implemented for FDataGrid and FDataBasis.", @@ -80,8 +82,9 @@ def _irregular_sample_points_matrix( def _irregular_sample_points_matrix_fdatagrid( fdata: FDataGrid, n_points_per_curve: int, + random_state: RandomState, ) -> NDArrayFloat: - return np.random.choice( + return random_state.choice( fdata.grid_points[0], # This only works 
for 1D domains size=(fdata.n_samples, n_points_per_curve), replace=True, @@ -92,8 +95,9 @@ def _irregular_sample_points_matrix_fdatagrid( def _irregular_sample_points_matrix_fdatabasis( fdata: FDataBasis, n_points_per_curve: int, + random_state: RandomState, ) -> NDArrayFloat: - return np.random.uniform( + return random_state.uniform( *fdata.domain_range[0], # This only works for 1D domains size=(fdata.n_samples, n_points_per_curve), ) From 10ed91fde84be3c22015cf4cd18a684ed9742889 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 20 Apr 2024 12:01:37 +0200 Subject: [PATCH 17/48] irregular_sample can receive an FDataIrregular and a list of n_points_per_curve --- skfda/datasets/_sample_from_fdata.py | 119 +++++++++++++++------------ 1 file changed, 67 insertions(+), 52 deletions(-) diff --git a/skfda/datasets/_sample_from_fdata.py b/skfda/datasets/_sample_from_fdata.py index a5e790488..3f15d3392 100644 --- a/skfda/datasets/_sample_from_fdata.py +++ b/skfda/datasets/_sample_from_fdata.py @@ -1,43 +1,19 @@ from __future__ import annotations -from typing import Callable, Iterable +from typing import List from functools import singledispatch import numpy as np from ..misc.validation import validate_random_state -from ..representation import FData, FDataGrid, FDataBasis, FDataIrregular +from ..representation import FDataBasis, FDataGrid, FDataIrregular from ..typing._base import RandomState, RandomStateLike -from ..typing._numpy import NDArrayFloat - - -def _irregular_sample_from_callable( - funcs: Iterable[Callable[[NDArrayFloat], NDArrayFloat]], - points_matrix: NDArrayFloat, -) -> FDataIrregular: - """Sample from a list of functions at irregular points. - - Args: - funcs: List of functions to sample. - points_matrix: of shape (n_funcs, n_points_per_function). Points where - to measure each function sample.
- """ - assert points_matrix.ndim == 2 - n_points_per_curve = points_matrix.shape[1] - total_n_points = points_matrix.shape[0] * n_points_per_curve - return FDataIrregular( - points=points_matrix.reshape(-1), - start_indices=np.array(range(0, total_n_points, n_points_per_curve)), - values=np.concatenate([ - func(func_points).reshape(-1) - for func, func_points in zip(funcs, points_matrix) - ]), - ) +from ..typing._numpy import NDArrayFloat, NDArrayInt def irregular_sample( - fdata: FDataGrid | FDataBasis, - n_points_per_curve: int, + fdata: FDataBasis | FDataGrid | FDataIrregular, + n_points_per_curve: int | NDArrayInt, random_state: RandomStateLike = None, ) -> FDataIrregular: """Irregularly sample from a FDataGrid or FDataBasis object. @@ -55,49 +31,88 @@ def irregular_sample( ) random_state = validate_random_state(random_state) + if isinstance(n_points_per_curve, int): + n_points_per_curve = np.full(fdata.n_samples, n_points_per_curve) - points_matrix = _irregular_sample_points_matrix( + points_list = _irregular_sample_points_list( fdata, n_points_per_curve=n_points_per_curve, random_state=random_state, ) - return _irregular_sample_from_callable( - funcs=fdata, - points_matrix=points_matrix, + return FDataIrregular( + points=np.concatenate(points_list), + start_indices=np.cumsum( + np.concatenate([ + np.zeros(1, dtype=int), + n_points_per_curve[:-1], + ]), + ), + values=np.concatenate([ + func(func_points).reshape(-1) + for func, func_points in zip(fdata, points_list) + ]), ) @singledispatch -def _irregular_sample_points_matrix( - fdata: FDataGrid | FDataBasis, - n_points_per_curve: int, +def _irregular_sample_points_list( + fdata: FDataBasis | FDataGrid | FDataIrregular, + n_points_per_curve: NDArrayInt, random_state: RandomState, -) -> NDArrayFloat: +) -> List[NDArrayFloat]: raise NotImplementedError( "Only implemented for FDataGrid and FDataBasis.", ) -@_irregular_sample_points_matrix.register +@_irregular_sample_points_list.register def 
_irregular_sample_points_matrix_fdatagrid( fdata: FDataGrid, - n_points_per_curve: int, + n_points_per_curve: NDArrayInt, random_state: RandomState, -) -> NDArrayFloat: - return random_state.choice( - fdata.grid_points[0], # This only works for 1D domains - size=(fdata.n_samples, n_points_per_curve), - replace=True, - ) +) -> List[NDArrayFloat]: + # This only works for 1D domains + return [ + random_state.choice( + fdata.grid_points[0], + size=(n_points), + replace=True, + ) + for n_points in n_points_per_curve + ] + +@_irregular_sample_points_list.register +def _irregular_sample_points_matrix_fdatairregular( + fdata: FDataIrregular, + n_points_per_curve: NDArrayInt, + random_state: RandomState, +) -> List[NDArrayFloat]: + # This only works for 1D domains + return [ + random_state.choice( + curve_points, + size=(n_points), + replace=True, + ) + for n_points, curve_points in zip( + n_points_per_curve, + np.split(fdata.points, fdata.start_indices[1:]), + ) + ] -@_irregular_sample_points_matrix.register + +@_irregular_sample_points_list.register def _irregular_sample_points_matrix_fdatabasis( fdata: FDataBasis, - n_points_per_curve: int, + n_points_per_curve: NDArrayInt, random_state: RandomState, -) -> NDArrayFloat: - return random_state.uniform( - *fdata.domain_range[0], # This only works for 1D domains - size=(fdata.n_samples, n_points_per_curve), - ) +) -> List[NDArrayFloat]: + # This only works for 1D domains + return [ + random_state.uniform( + *fdata.domain_range[0], + size=(n_points), + ) + for n_points in n_points_per_curve + ] From 7104d2a461a7b56e2b88a4715999368c0aac1534 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 20 Apr 2024 12:01:55 +0200 Subject: [PATCH 18/48] example removing points from fdatairregular --- .../plot_irregular_to_basis_mixed_effects.py | 126 ++++++++++++++++-- 1 file changed, 114 insertions(+), 12 deletions(-) diff --git a/examples/plot_irregular_to_basis_mixed_effects.py 
b/examples/plot_irregular_to_basis_mixed_effects.py index 7f104e6b2..a33907fd5 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -12,19 +12,22 @@ import matplotlib.pyplot as plt import numpy as np +import pandas as pd from sklearn.model_selection import train_test_split from skfda import FDataBasis -from skfda.datasets import irregular_sample -from skfda.representation.basis import BSplineBasis +from skfda.datasets import fetch_weather, irregular_sample +from skfda.representation.basis import BSplineBasis, FourierBasis from skfda.representation.conversion import EMMixedEffectsConverter from skfda.misc.scoring import r2_score, mean_squared_error # %% -# For this example, we are going to simulate the irregular sampling of a -# dataset following the mixed effects model, to later attempt to reconstruct -# the original data. +# Converting irregular data to basis representation +# ################################################# +# For the first part of this example, we are going to simulate the irregular +# sampling of a dataset following the mixed effects model, to later attempt to +# reconstruct the original data. # # First, we generate the original basis representation of the data following # the mixed effects model for irregular data as presented by @@ -33,7 +36,7 @@ # distribution. n_curves = 50 n_basis = 4 -domain_range = (0, 10) +domain_range = (0, 12) basis = BSplineBasis(n_basis=n_basis, domain_range=domain_range, order=4) basis.plot() @@ -94,6 +97,11 @@ train_converted = converter.transform(train_irregular) test_converted = converter.transform(test_irregular) +# For comparison, we also convert to basis representation using the separate +# basis representation for each curve. 
+train_separate_basis = train_irregular.to_basis(basis) +test_separate_basis = test_irregular.to_basis(basis) + # %% # To visualize the conversion results, we plot the first 8 original and # converted curves of the test set. On the background, we plot the train set. @@ -105,14 +113,18 @@ train_irregular.scatter(axes=axes, color=(0, 0, 0, 0.05), marker=".") test_converted[k].plot( - axes=axes, color=f"C{k}", label="Converted", + axes=axes, color=f"C{k}", linestyle="--", label="Converted", ) test_original[k].plot( - axes=axes, color=f"C{k}", linestyle="--", label="Original", + axes=axes, color=f"C{k}", linewidth=0.65, label="Original", ) test_irregular[k].scatter( axes=axes, color=f"C{k}", label="Irregular" ) + test_separate_basis[k].plot( + axes=axes, color=f"C{k}", linestyle=":", + label="Separate basis representation", + ) plt.legend() plt.show() @@ -124,10 +136,100 @@ test_r2_score = r2_score(test_original, test_converted) train_mse = mean_squared_error(train_original, train_converted) test_mse = mean_squared_error(test_original, test_converted) -print(f"R2 score (train): {train_r2_score:.2f}") -print(f"R2 score (test): {test_r2_score:.2f}") -print(f"Mean Squared Error (train): {train_mse:.2f}") -print(f"Mean Squared Error (test): {test_mse:.2f}") +train_r2_score_separate = r2_score(train_original, train_separate_basis) +test_r2_score_separate = r2_score(test_original, test_separate_basis) +train_mse_separate = mean_squared_error(train_original, train_separate_basis) +test_mse_separate = mean_squared_error(test_original, test_separate_basis) +print(f"R2 score (mixed effects - train): {train_r2_score:.2f}") +print(f"R2 score (mixed effects - test): {test_r2_score:.2f}") +print(f"R2 score (separate basis - train): {train_r2_score_separate:.2f}") +print(f"R2 score (separate basis - test): {test_r2_score_separate:.2f}") +print(f"Mean Squared Error (mixed effects - train): {train_mse:.2f}") +print(f"Mean Squared Error (mixed effects - test): {test_mse:.2f}") 
+print(f"Mean Squared Error (separate basis - train): {train_mse_separate:.2f}") +print(f"Mean Squared Error (separate basis - test): {test_mse_separate:.2f}") + +# %% +# Check robustness of the method by removing measurement points +# ############################################################# +# For the second part of the example, we are going to check the robustness of +# the method by removing some measurement points from the test and train sets +# and comparing the results. The temperatures from the Canadian weather +# dataset are used to generate the irregular data. +fd_temperatures = fetch_weather().data.coordinates[0] +fd_irregular = irregular_sample( + fdata=fd_temperatures, n_points_per_curve=40, random_state=random_state, +) +basis = FourierBasis(n_basis=5, domain_range=fd_temperatures.domain_range) + +# %% +# Split the data into train and test sets +train_original, test_original, train_irregular, test_irregular = ( + train_test_split( + fd_temperatures, + fd_irregular, + test_size=0.2, + random_state=random_state, + ) +) + +# %% +# Create the different datasets by removing some measurement points +train_irregular_list = [] +test_irregular_list = [] +n_points_list = [40, 10, 5, 4, 3] +for n_points in n_points_list: + train_irregular_list.append( + irregular_sample( + train_original, + n_points_per_curve=n_points, + random_state=random_state, + ) + ) + test_irregular_list.append( + irregular_sample( + test_original, + n_points_per_curve=n_points, + random_state=random_state, + ) + ) + +# %% +# We convert the irregular data to basis representation and compute the scores: +scores = { + "n_points_per_curve": n_points_list, + "Train R2 score": [], + "Test R2 score": [], + "Train MSE": [], + "Test MSE": [], +} +converter = EMMixedEffectsConverter(basis) +for train_irregular, test_irregular in zip( + train_irregular_list, + test_irregular_list, +): + converter = converter.fit(train_irregular) + train_converted = converter.transform(train_irregular) + 
test_converted = converter.transform(test_irregular) + + scores["Train R2 score"].append(r2_score( + train_original, train_converted.to_grid(train_original.grid_points), + )) + scores["Test R2 score"].append(r2_score( + test_original, test_converted.to_grid(test_original.grid_points), + )) + scores["Train MSE"].append(mean_squared_error( + train_original, train_converted.to_grid(train_original.grid_points), + )) + scores["Test MSE"].append(mean_squared_error( + test_original, test_converted.to_grid(test_original.grid_points), + )) + +# %% +# Finally, we can see the results in a table: +df = pd.DataFrame(scores) +df = df.set_index("n_points_per_curve") +print(df) # %% # References From 5f58016ebed100b091ec6dc9842e23b49c6dec15 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 20 Apr 2024 12:52:41 +0200 Subject: [PATCH 19/48] irregular to basis doctest example with plot --- docs/conf.py | 1 + skfda/representation/irregular.py | 35 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index e25650181..aea8c0197 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -80,6 +80,7 @@ # Sphinx extensions extensions = [ "jupyter_sphinx", + "matplotlib.sphinxext.plot_directive", "myst_parser", "sphinx.ext.autodoc", "sphinx.ext.autodoc.typehints", diff --git a/skfda/representation/irregular.py b/skfda/representation/irregular.py index 8737f50f4..79fdd1fea 100644 --- a/skfda/representation/irregular.py +++ b/skfda/representation/irregular.py @@ -1120,6 +1120,41 @@ def to_basis( Returns: FDataBasis: Basis representation of the funtional data object. + + .. plot:: + :format: python + :include-source: True + + >>> from skfda.datasets import fetch_weather, irregular_sample + >>> from skfda.representation.basis import FourierBasis + >>> import matplotlib.pyplot as plt + >>> fd_temperatures = fetch_weather().data.coordinates[0] + >>> temp_irregular = irregular_sample( + ... fdata=fd_temperatures, + ... 
n_points_per_curve=8, + ... random_state=4934755, + ... ) + >>> basis = FourierBasis( + ... n_basis=5, domain_range=fd_temperatures.domain_range, + ... ) + >>> temp_basis_repr = temp_irregular.to_basis( #doctest: +SKIP + ... basis, conversion_type="mixed_effects", + ... ) + >>> fig = plt.figure(figsize=(10, 10)) + >>> for k in range(4): #doctest: +SKIP + ... axes = plt.subplot(2, 2, k + 1) + ... fd_temperatures.plot(axes=axes, alpha=0.05, color="black") + ... fd_temperatures[k].plot( + ... axes=axes, color=f"C{k}", + ... label="Original data", linestyle="--", + ... ) + ... temp_basis_repr[k].plot( + ... axes=axes, color=f"C{k}", + ... label="Basis representation", + ... ) + ... temp_irregular[k].scatter(axes=axes, color=f"C{k}") + ... plt.legend() + >>> plt.show() #doctest: +SKIP """ if self.dim_domain != basis.dim_domain: raise ValueError( From 2eff88ef02d458fbe148980cb0e0d03b90830813 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Mon, 22 Apr 2024 19:25:57 +0200 Subject: [PATCH 20/48] update examples --- ...plot_irregular_mixed_effects_robustness.py | 185 +++++++++++++++ .../plot_irregular_to_basis_mixed_effects.py | 213 ++++++------------ skfda/datasets/_sample_from_fdata.py | 53 +++-- skfda/representation/irregular.py | 16 +- 4 files changed, 305 insertions(+), 162 deletions(-) create mode 100644 examples/plot_irregular_mixed_effects_robustness.py diff --git a/examples/plot_irregular_mixed_effects_robustness.py b/examples/plot_irregular_mixed_effects_robustness.py new file mode 100644 index 000000000..bb5f5d9e8 --- /dev/null +++ b/examples/plot_irregular_mixed_effects_robustness.py @@ -0,0 +1,185 @@ +""" +Mixed-effects model for irregular data when removing measurement points +======================================================================= + +This example converts irregular data to a basis representation using a mixed +effects model and checks the robustness of the method by fitting +the model with decreasing number of measurement points per 
curve. +""" +# Author: Pablo Cuesta Sierra +# License: MIT + +# sphinx_gallery_thumbnail_number = -1 + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split + +from skfda import FDataIrregular +from skfda.datasets import fetch_weather, irregular_sample +from skfda.representation.basis import FourierBasis +from skfda.representation.conversion import EMMixedEffectsConverter +from skfda.misc.scoring import r2_score, mean_squared_error + + +# %% +# For this example, we are going to check the robustness of +# the mixed effects method for converting irregular data to basis +# representation by removing some measurement points from the test and train +# sets and comparing the results. The temperatures from the Canadian weather +# dataset are used to generate the irregular data. +fd_temperatures = fetch_weather().data.coordinates[0] +basis = FourierBasis(n_basis=5, domain_range=fd_temperatures.domain_range) + +fd_temperatures.plot() +plt.show() +basis.plot() +plt.title("Basis functions") +plt.show() + +# %% +# We split the data into train and test sets: +random_state = np.random.RandomState(seed=4934792) +train_original, test_original = train_test_split( + fd_temperatures, + test_size=0.3, + random_state=random_state, +) + +# %% +# Then, we create datasets with decreasing number of measurement points per +# curve, by removing measurement points from the previous dataset iteratively. 
+train_irregular_list = [train_original] +test_irregular_list = [test_original] +n_points_list = [40, 10, 7, 5, 4, 3] +for n_points in n_points_list: + train_irregular_list.append( + irregular_sample( + train_irregular_list[-1], + n_points_per_curve=n_points, + random_state=random_state, + ), + ) + test_irregular_list.append( + irregular_sample( + test_irregular_list[-1], + n_points_per_curve=n_points, + random_state=random_state, + ), + ) + +# remove the original dataset from the lists +train_irregular_list = train_irregular_list[1:] +test_irregular_list = test_irregular_list[1:] + +# %% +# We convert the irregular data to basis representation and compute the scores. +# To do so, we fit the converter once per train set. After fitting the +# converter with a train set that has :math:`k` points per curve, we +# use it to transform that train set, the test set with :math:`k` points per +# curve and the original test set with 365 points per curve. +score_functions = { + "R^2": r2_score, + "MSE": mean_squared_error, +} +converted_data = { + "Train-sparse": {}, + "Test-sparse": {}, + "Test-original": {}, +} +scores = { + score_name: { + "n_points_per_curve": n_points_list, + **{data_name: [] for data_name in converted_data.keys()}, + } + for score_name in score_functions.keys() +} +converter = EMMixedEffectsConverter(basis) +for n_points, train_irregular, test_irregular in zip( + n_points_list, + train_irregular_list, + test_irregular_list, +): + converter = converter.fit(train_irregular) + train_sparse_converted = converter.transform(train_irregular) + test_sparse_converted = converter.transform(test_irregular) + test_original_converted = converter.transform( + FDataIrregular.from_fdatagrid(test_original), + ) + converted_data["Train-sparse"][n_points] = train_sparse_converted + converted_data["Test-sparse"][n_points] = test_sparse_converted + converted_data["Test-original"][n_points] = test_original_converted + + for score_name, score_fun in
score_functions.items(): + scores[score_name]["Train-sparse"].append(score_fun( + train_original, + train_sparse_converted.to_grid(train_original.grid_points), + )) + scores[score_name]["Test-sparse"].append(score_fun( + test_original, + test_sparse_converted.to_grid(test_original.grid_points), + )) + scores[score_name]["Test-original"].append(score_fun( + test_original, + test_original_converted.to_grid(test_original.grid_points), + )) + +# %% +# Finally, we have the scores for the train and test sets with decreasing +# number of measurement points per curve. +for score_name in scores.keys(): + print("-" * 62) + print(f"{score_name} scores:") + print("-" * 62) + print(( + pd.DataFrame(scores[score_name]) + .set_index("n_points_per_curve").sort_index() + ), end="\n\n\n") + +# %% +# The following plots show the original curves along with the converted +# test curves for the conversions with 5, 4 and 3 points per curve. +for ( + n_points_per_curve, + test_irregular, + test_converted, + test_original_converted, +) in zip( + n_points_list, + test_irregular_list, + converted_data["Test-sparse"].values(), + converted_data["Test-original"].values(), +): + if n_points_per_curve not in [5, 4, 3]: + continue + fig = plt.figure(figsize=(10, 23)) + for k in range(7): + axes = plt.subplot(7, 1, k + 1) + + test_irregular[k].scatter( + axes=axes, color=f"C{k}", + ) + test_original[k].plot( + axes=axes, color=f"C{k}", linewidth=0.65, + label="Original test curve", + ) + test_converted[k].plot( + axes=axes, color=f"C{k}", linestyle="--", + label=f"Test curve transformed from {n_points_per_curve} points", + ) + test_original_converted[k].plot( + axes=axes, color=f"C{k}", alpha=0.5, + label="Test curve transformed from original 365 points", + ) + axes.legend(bbox_to_anchor=(1., 1.)) + plt.tight_layout(rect=[0, 0, 1, 0.98]) + plt.suptitle(f"Fitted model with {n_points_per_curve=}") + + plt.show() + +# %% +# References +# ---------- +# +# .. 
footbibliography:: diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py index a33907fd5..f90a7d5c4 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -1,10 +1,11 @@ """ -Mixed effects model to convert irregular data to basis representation -======================================================================= +Mixed effects model for irregular data +============================================================================== This example converts irregular data to a basis representation using a mixed effects model. """ +# %% # Author: Pablo Cuesta Sierra # License: MIT @@ -15,19 +16,17 @@ import pandas as pd from sklearn.model_selection import train_test_split -from skfda import FDataBasis -from skfda.datasets import fetch_weather, irregular_sample -from skfda.representation.basis import BSplineBasis, FourierBasis +from skfda import FDataBasis, FDataIrregular +from skfda.datasets import irregular_sample +from skfda.representation.basis import BSplineBasis from skfda.representation.conversion import EMMixedEffectsConverter from skfda.misc.scoring import r2_score, mean_squared_error # %% -# Converting irregular data to basis representation -# ################################################# -# For the first part of this example, we are going to simulate the irregular +# For this example, we are going to simulate the irregular # sampling of a dataset following the mixed effects model, to later attempt to -# reconstruct the original data. +# reconstruct said original dataset. 
# # First, we generate the original basis representation of the data following # the mixed effects model for irregular data as presented by @@ -37,13 +36,14 @@ n_curves = 50 n_basis = 4 domain_range = (0, 12) -basis = BSplineBasis(n_basis=n_basis, domain_range=domain_range, order=4) +basis = BSplineBasis(n_basis=n_basis, domain_range=domain_range, order=3) +plt.figure(figsize=(10, 5)) basis.plot() plt.title("Basis functions") coeff_mean = np.array([-10, 20, -24, 4]) -cov_sqrt = np.array([ +coeff_cov_sqrt = np.array([ [3.2, 0.0, 0.0, 0.0], [0.4, 6.0, 0.0, 0.0], [0.3, 1.5, 2.0, 0.0], @@ -51,28 +51,41 @@ ]) random_state = np.random.RandomState(seed=4934755) coefficients = ( - coeff_mean + random_state.normal(size=(n_curves, n_basis)) @ cov_sqrt + coeff_mean + random_state.normal(size=(n_curves, n_basis)) @ coeff_cov_sqrt ) fdatabasis_original = FDataBasis(basis, coefficients) -# Plot the first 10 curves -fdatabasis_original[:10].plot() +# Plot the first 6 curves +plt.figure(figsize=(10, 5)) +fdatabasis_original[:6].plot() plt.title("Original curves") plt.show() # %% -# Sencondly, we simulate the irregular sampling of the original data. -fd_irregular = irregular_sample( +# Sencondly, we subsample of the original data by measuring a random number of +# points per curve generating an irregular dataset. +# Moreover, we add some noise to the data. 
+fd_irregular_without_noise = irregular_sample( fdatabasis_original, - n_points_per_curve=3, # Number of points per curve in the irregular data + n_points_per_curve=random_state.randint(3, 5, n_curves), random_state=random_state, ) +noise_std = 0.1 +fd_irregular = FDataIrregular( + points=fd_irregular_without_noise.points, + start_indices=fd_irregular_without_noise.start_indices, + values=fd_irregular_without_noise.values + random_state.normal( + 0, noise_std, fd_irregular_without_noise.values.shape, + ), +) -# Plot the last 6 curves of the newly created irregular data -fig = plt.figure() -fdatabasis_original[-6:].plot(fig=fig, alpha=0.3) -fd_irregular[-6:].scatter(fig=fig) +# Plot 9 curves of the newly created irregular data +fig = plt.figure(figsize=(10, 10)) +for k in range(9): + axes = plt.subplot(3, 3, k + 1) + fdatabasis_original[k].plot(axes=axes, alpha=0.3, color=f"C{k}") + fd_irregular[k].plot(axes=axes, marker=".", color=f"C{k}") plt.show() # %% @@ -82,154 +95,78 @@ train_test_split( fdatabasis_original, fd_irregular, - test_size=0.5, + test_size=0.3, random_state=random_state, ) ) # %% # Now, we create and train the mixed effects converter using the train curves, +# and we convert the irregular data to basis representation. +# For comparison, we also convert to basis representation using the default +# basis representation for each curve, which is done curve-wise instead of +# taking into account the whole dataset. converter = EMMixedEffectsConverter(basis) -converter = converter.fit(train_irregular) +converter.fit(train_irregular) -# %% -# and we convert the irregular data to basis representation. train_converted = converter.transform(train_irregular) test_converted = converter.transform(test_irregular) -# For comparison, we also convert to basis representation using the separate -# basis representation for each curve. 
-train_separate_basis = train_irregular.to_basis(basis) -test_separate_basis = test_irregular.to_basis(basis) +train_curvewise_to_basis = train_irregular.to_basis(basis) +test_curvewise_to_basis = test_irregular.to_basis(basis) # %% # To visualize the conversion results, we plot the first 8 original and # converted curves of the test set. On the background, we plot the train set. -fig = plt.figure(figsize=(10, 15)) +fig = plt.figure(figsize=(10, 25)) for k in range(8): - axes = plt.subplot(4, 2, k + 1) + axes = plt.subplot(8, 1, k + 1) - train_original.plot(axes=axes, color=(0, 0, 0, 0.05)) - train_irregular.scatter(axes=axes, color=(0, 0, 0, 0.05), marker=".") + # train_original.plot(axes=axes, color=(0, 0, 0, 0.05)) + # train_irregular.scatter(axes=axes, color=(0, 0, 0, 0.05), marker=".") - test_converted[k].plot( - axes=axes, color=f"C{k}", linestyle="--", label="Converted", - ) - test_original[k].plot( - axes=axes, color=f"C{k}", linewidth=0.65, label="Original", - ) test_irregular[k].scatter( axes=axes, color=f"C{k}", label="Irregular" ) - test_separate_basis[k].plot( + test_curvewise_to_basis[k].plot( axes=axes, color=f"C{k}", linestyle=":", - label="Separate basis representation", + label="Curve-wise conversion", ) - plt.legend() + test_converted[k].plot( + axes=axes, color=f"C{k}", linestyle="--", + label="Mixed-effects conversion", + ) + test_original[k].plot( + axes=axes, color=f"C{k}", alpha=0.5, + label="Original basis representation", + ) + axes.legend(bbox_to_anchor=(1., 1.)) + plt.tight_layout(rect=[0, 0, 1, 0.98]) plt.show() # %% # Finally, we make use of the :math:`R^2` score and the :math:`MSE` to compare # the converted basis representations with the original data, both for the # train and test sets. 
-train_r2_score = r2_score(train_original, train_converted) -test_r2_score = r2_score(test_original, test_converted) -train_mse = mean_squared_error(train_original, train_converted) -test_mse = mean_squared_error(test_original, test_converted) -train_r2_score_separate = r2_score(train_original, train_separate_basis) -test_r2_score_separate = r2_score(test_original, test_separate_basis) -train_mse_separate = mean_squared_error(train_original, train_separate_basis) -test_mse_separate = mean_squared_error(test_original, test_separate_basis) -print(f"R2 score (mixed effects - train): {train_r2_score:.2f}") -print(f"R2 score (mixed effects - test): {test_r2_score:.2f}") -print(f"R2 score (separate basis - train): {train_r2_score_separate:.2f}") -print(f"R2 score (separate basis - test): {test_r2_score_separate:.2f}") -print(f"Mean Squared Error (mixed effects - train): {train_mse:.2f}") -print(f"Mean Squared Error (mixed effects - test): {test_mse:.2f}") -print(f"Mean Squared Error (separate basis - train): {train_mse_separate:.2f}") -print(f"Mean Squared Error (separate basis - test): {test_mse_separate:.2f}") - -# %% -# Check robustness of the method by removing measurement points -# ############################################################# -# For the second part of the example, we are going to check the robustness of -# the method by removing some measurement points from the test and train sets -# and comparing the results. The temperatures from the Canadian weather -# dataset are used to generate the irregular data. 
-fd_temperatures = fetch_weather().data.coordinates[0] -fd_irregular = irregular_sample( - fdata=fd_temperatures, n_points_per_curve=40, random_state=random_state, -) -basis = FourierBasis(n_basis=5, domain_range=fd_temperatures.domain_range) - -# %% -# Split the data into train and test sets -train_original, test_original, train_irregular, test_irregular = ( - train_test_split( - fd_temperatures, - fd_irregular, - test_size=0.2, - random_state=random_state, - ) -) - -# %% -# Create the different datasets by removing some measurement points -train_irregular_list = [] -test_irregular_list = [] -n_points_list = [40, 10, 5, 4, 3] -for n_points in n_points_list: - train_irregular_list.append( - irregular_sample( - train_original, - n_points_per_curve=n_points, - random_state=random_state, - ) - ) - test_irregular_list.append( - irregular_sample( - test_original, - n_points_per_curve=n_points, - random_state=random_state, - ) - ) - -# %% -# We convert the irregular data to basis representation and compute the scores: +score_functions = {"R^2": r2_score, "MSE": mean_squared_error} scores = { - "n_points_per_curve": n_points_list, - "Train R2 score": [], - "Test R2 score": [], - "Train MSE": [], - "Test MSE": [], + score_name: pd.DataFrame({ + "Mixed-effects": { + "Train": score_fun(train_original, train_converted), + "Test": score_fun(test_original, test_converted), + }, + "Curve-wise": { + "Train": score_fun(train_original, train_curvewise_to_basis), + "Test": score_fun(test_original, test_curvewise_to_basis), + }, + }) + for score_name, score_fun in score_functions.items() } -converter = EMMixedEffectsConverter(basis) -for train_irregular, test_irregular in zip( - train_irregular_list, - test_irregular_list, -): - converter = converter.fit(train_irregular) - train_converted = converter.transform(train_irregular) - test_converted = converter.transform(test_irregular) - - scores["Train R2 score"].append(r2_score( - train_original, 
train_converted.to_grid(train_original.grid_points), - )) - scores["Test R2 score"].append(r2_score( - test_original, test_converted.to_grid(test_original.grid_points), - )) - scores["Train MSE"].append(mean_squared_error( - train_original, train_converted.to_grid(train_original.grid_points), - )) - scores["Test MSE"].append(mean_squared_error( - test_original, test_converted.to_grid(test_original.grid_points), - )) - -# %% -# Finally, we can see the results in a table: -df = pd.DataFrame(scores) -df = df.set_index("n_points_per_curve") -print(df) +for score_name, score_df in scores.items(): + print("-" * 35) + print(f"{score_name} scores:") + print("-" * 35) + print(score_df, end=f"\n\n\n") # %% # References diff --git a/skfda/datasets/_sample_from_fdata.py b/skfda/datasets/_sample_from_fdata.py index 3f15d3392..c685640d6 100644 --- a/skfda/datasets/_sample_from_fdata.py +++ b/skfda/datasets/_sample_from_fdata.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import List +from typing import List, Tuple from functools import singledispatch import numpy as np @@ -23,7 +23,11 @@ def irregular_sample( Args: fdata: Functional data object to sample from. - n_points_per_curve: Number of points to sample per curve. + n_points_per_curve: Number of points to sample per curve. If fdata is + an FDataGrid or an FDataIrregular and a sample has less points than + specified in n_points_per_curve, the sample will have the same number + of points as before. 
+ """ if fdata.dim_domain != 1 or fdata.dim_codomain != 1: raise NotImplementedError( @@ -34,11 +38,17 @@ def irregular_sample( if isinstance(n_points_per_curve, int): n_points_per_curve = np.full(fdata.n_samples, n_points_per_curve) - points_list = _irregular_sample_points_list( - fdata, - n_points_per_curve=n_points_per_curve, - random_state=random_state, + points_list, n_points_per_curve = ( + _irregular_sample_points_list( + fdata, + n_points_per_curve=n_points_per_curve, + random_state=random_state, + ) ) + values = np.concatenate([ + func(func_points).reshape(-1) + for func, func_points in zip(fdata, points_list) + ]) return FDataIrregular( points=np.concatenate(points_list), start_indices=np.cumsum( @@ -59,7 +69,7 @@ def _irregular_sample_points_list( fdata: FDataBasis | FDataGrid | FDataIrregular, n_points_per_curve: NDArrayInt, random_state: RandomState, -) -> List[NDArrayFloat]: +) -> Tuple[List[NDArrayFloat], NDArrayInt]: raise NotImplementedError( "Only implemented for FDataGrid and FDataBasis.", ) @@ -72,14 +82,18 @@ def _irregular_sample_points_matrix_fdatagrid( random_state: RandomState, ) -> List[NDArrayFloat]: # This only works for 1D domains + n_points_per_curve = np.minimum( + n_points_per_curve, + len(fdata.grid_points[0]), + ) return [ random_state.choice( - fdata.grid_points[0], - size=(n_points), - replace=True, + fdata.grid_points[0].reshape(-1), + size=n_points, + replace=False, ) for n_points in n_points_per_curve - ] + ], n_points_per_curve @_irregular_sample_points_list.register @@ -89,17 +103,24 @@ def _irregular_sample_points_matrix_fdatairregular( random_state: RandomState, ) -> List[NDArrayFloat]: # This only works for 1D domains + original_n_points_per_curve = np.diff( + np.concatenate([fdata.start_indices, [len(fdata.points)]]), + ) + n_points_per_curve = np.minimum( + n_points_per_curve, + original_n_points_per_curve, + ) return [ random_state.choice( - curve_points, - size=(n_points), - replace=True, + curve_points.reshape(-1), + 
size=min(n_points, len(curve_points)), + replace=False, ) for n_points, curve_points in zip( n_points_per_curve, np.split(fdata.points, fdata.start_indices[1:]), ) - ] + ], n_points_per_curve @_irregular_sample_points_list.register @@ -115,4 +136,4 @@ def _irregular_sample_points_matrix_fdatabasis( size=(n_points), ) for n_points in n_points_per_curve - ] + ], n_points_per_curve diff --git a/skfda/representation/irregular.py b/skfda/representation/irregular.py index 79fdd1fea..d5f864015 100644 --- a/skfda/representation/irregular.py +++ b/skfda/representation/irregular.py @@ -43,7 +43,7 @@ T = TypeVar("T", bound='FDataIrregular') IrregularToBasisConversionType = Literal[ - "separately", "mixed_effects", "mixed_effects_minimize", + "function-wise", "mixed-effects", "mixed-effects-minimize", ] ###################### @@ -1084,7 +1084,7 @@ def to_basis( self, basis: Basis, *, - conversion_type: IrregularToBasisConversionType = "separately", + conversion_type: IrregularToBasisConversionType = "function-wise", **kwargs: Any, ) -> FDataBasis: """Return the basis representation of the object. @@ -1094,16 +1094,16 @@ def to_basis( going to be represented. conversion_type: method to use for the conversion: - - "separately": (default) each curve is converted independently + - "function-wise": (default) each curve is converted independently (meaning that only the information of each curve is used for its conversion) with :class:`~skfda.preprocessing.smoothing.BasisSmoother`. - - "mixed_effects": all curves are converted jointly (this means + - "mixed-effects": all curves are converted jointly (this means that the information of all curves is used to convert each one) using the EM algorithm to fit the mixed effects model: :class:`~skfda.representation.conversion.EMMixedEffectsConverter`. 
- - "mixed_effects_minimize": all curves are converted jointly + - "mixed-effects-minimize": all curves are converted jointly using the scipy.optimize.minimize to fit the mixed effects model: :class:`~skfda.representation.conversion.MinimizeMixedEffectsConverter`. @@ -1138,7 +1138,7 @@ def to_basis( ... n_basis=5, domain_range=fd_temperatures.domain_range, ... ) >>> temp_basis_repr = temp_irregular.to_basis( #doctest: +SKIP - ... basis, conversion_type="mixed_effects", + ... basis, conversion_type="mixed-effects", ... ) >>> fig = plt.figure(figsize=(10, 10)) >>> for k in range(4): #doctest: +SKIP @@ -1175,13 +1175,13 @@ def to_basis( if not basis.is_domain_range_fixed(): basis = basis.copy(domain_range=self.domain_range) - if conversion_type != "separately": + if conversion_type != "function-wise": from ..representation.conversion import ( EMMixedEffectsConverter, MinimizeMixedEffectsConverter, ) converter_class = ( - EMMixedEffectsConverter if conversion_type == "mixed_effects" + EMMixedEffectsConverter if conversion_type == "mixed-effects" else MinimizeMixedEffectsConverter ) converter = converter_class(basis) From 1fac99e2a2681ae971751425181580fe52f97a65 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Mon, 22 Apr 2024 19:49:55 +0200 Subject: [PATCH 21/48] jupyter plot for to_basis --- docs/conf.py | 1 - ...plot_irregular_mixed_effects_robustness.py | 55 +++++++++------- skfda/representation/irregular.py | 66 +++++++++---------- 3 files changed, 64 insertions(+), 58 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index aea8c0197..e25650181 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -80,7 +80,6 @@ # Sphinx extensions extensions = [ "jupyter_sphinx", - "matplotlib.sphinxext.plot_directive", "myst_parser", "sphinx.ext.autodoc", "sphinx.ext.autodoc.typehints", diff --git a/examples/plot_irregular_mixed_effects_robustness.py b/examples/plot_irregular_mixed_effects_robustness.py index bb5f5d9e8..4e3797912 100644 --- 
a/examples/plot_irregular_mixed_effects_robustness.py +++ b/examples/plot_irregular_mixed_effects_robustness.py @@ -69,9 +69,18 @@ ), ) -# remove the original dataset from the lists -train_irregular_list = train_irregular_list[1:] -test_irregular_list = test_irregular_list[1:] +train_irregular_datasets = { + n_points: train_irregular + for n_points, train_irregular in zip( + n_points_list, train_irregular_list[1:], + ) +} +test_irregular_datasets = { + n_points: test_irregular + for n_points, test_irregular in zip( + n_points_list, test_irregular_list[1:], + ) +} # %% # We convert the irregular data to basis representation and compute the scores. @@ -98,8 +107,8 @@ converter = EMMixedEffectsConverter(basis) for n_points, train_irregular, test_irregular in zip( n_points_list, - train_irregular_list, - test_irregular_list, + train_irregular_datasets.values(), + test_irregular_datasets.values(), ): converter = converter.fit(train_irregular) train_sparse_converted = converter.transform(train_irregular) @@ -134,41 +143,31 @@ print("-" * 62) print(( pd.DataFrame(scores[score_name]) - .set_index("n_points_per_curve").sort_index() + .set_index("n_points_per_curve").sort_index().to_string() ), end="\n\n\n") # %% # The following plots show the original curves along with the converted # test curves for the conversions with 5, 4 and 3 points per curve. 
-for ( - n_points_per_curve, - test_irregular, - test_converted, - test_original_converted, -) in zip( - n_points_list, - test_irregular_list, - converted_data["Test-sparse"].values(), - converted_data["Test-original"].values(), -): - if n_points_per_curve not in [5, 4, 3]: - continue - fig = plt.figure(figsize=(10, 23)) + + +def plot_converted_test_curves(n_points_per_curve): + plt.figure(figsize=(10, 23)) for k in range(7): axes = plt.subplot(7, 1, k + 1) - test_irregular[k].scatter( + test_irregular_datasets[n_points_per_curve][k].scatter( axes=axes, color=f"C{k}", ) test_original[k].plot( axes=axes, color=f"C{k}", linewidth=0.65, label="Original test curve", ) - test_converted[k].plot( + converted_data["Test-sparse"][n_points_per_curve][k].plot( axes=axes, color=f"C{k}", linestyle="--", label=f"Test curve transformed from {n_points_per_curve} points", ) - test_original_converted[k].plot( + converted_data["Test-original"][n_points_per_curve][k].plot( axes=axes, color=f"C{k}", alpha=0.5, label="Test curve transformed from original 365 points", ) @@ -178,6 +177,16 @@ plt.show() + +# %% +plot_converted_test_curves(n_points_per_curve=5) + +# %% +plot_converted_test_curves(n_points_per_curve=4) + +# %% +plot_converted_test_curves(n_points_per_curve=3) + # %% # References # ---------- diff --git a/skfda/representation/irregular.py b/skfda/representation/irregular.py index d5f864015..8cba1e6bf 100644 --- a/skfda/representation/irregular.py +++ b/skfda/representation/irregular.py @@ -1121,40 +1121,38 @@ def to_basis( FDataBasis: Basis representation of the funtional data object. - .. plot:: - :format: python - :include-source: True - - >>> from skfda.datasets import fetch_weather, irregular_sample - >>> from skfda.representation.basis import FourierBasis - >>> import matplotlib.pyplot as plt - >>> fd_temperatures = fetch_weather().data.coordinates[0] - >>> temp_irregular = irregular_sample( - ... fdata=fd_temperatures, - ... n_points_per_curve=8, - ... 
random_state=4934755, - ... ) - >>> basis = FourierBasis( - ... n_basis=5, domain_range=fd_temperatures.domain_range, - ... ) - >>> temp_basis_repr = temp_irregular.to_basis( #doctest: +SKIP - ... basis, conversion_type="mixed-effects", - ... ) - >>> fig = plt.figure(figsize=(10, 10)) - >>> for k in range(4): #doctest: +SKIP - ... axes = plt.subplot(2, 2, k + 1) - ... fd_temperatures.plot(axes=axes, alpha=0.05, color="black") - ... fd_temperatures[k].plot( - ... axes=axes, color=f"C{k}", - ... label="Original data", linestyle="--", - ... ) - ... temp_basis_repr[k].plot( - ... axes=axes, color=f"C{k}", - ... label="Basis representation", - ... ) - ... temp_irregular[k].scatter(axes=axes, color=f"C{k}") - ... plt.legend() - >>> plt.show() #doctest: +SKIP + .. jupyter-execute:: + + from skfda.datasets import fetch_weather, irregular_sample + from skfda.representation.basis import FourierBasis + import matplotlib.pyplot as plt + fd_temperatures = fetch_weather().data.coordinates[0] + temp_irregular = irregular_sample( + fdata=fd_temperatures, + n_points_per_curve=8, + random_state=4934755, + ) + basis = FourierBasis( + n_basis=5, domain_range=fd_temperatures.domain_range, + ) + temp_basis_repr = temp_irregular.to_basis( + basis, conversion_type="mixed-effects", + ) + fig = plt.figure(figsize=(10, 10)) + for k in range(4): + axes = plt.subplot(2, 2, k + 1) + fd_temperatures.plot(axes=axes, alpha=0.05, color="black") + fd_temperatures[k].plot( + axes=axes, color=f"C{k}", + label="Original data", linestyle="--", + ) + temp_basis_repr[k].plot( + axes=axes, color=f"C{k}", + label="Basis representation", + ) + temp_irregular[k].scatter(axes=axes, color=f"C{k}") + plt.legend() + plt.show() """ if self.dim_domain != basis.dim_domain: raise ValueError( From c23064db17661ac6f7723d4e87feb4fe14743c48 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Wed, 24 Apr 2024 19:25:29 +0200 Subject: [PATCH 22/48] update converters --- .../conversion/_mixed_effects.py | 133 
++++++++++-------- skfda/tests/test_mixed_effects_converter.py | 18 ++- 2 files changed, 85 insertions(+), 66 deletions(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index ed55340ab..b44c8a392 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -12,17 +12,15 @@ from abc import ABC from dataclasses import dataclass from typing import ( - Any, Callable, - Dict, List, Literal, - Optional, Protocol, ) import numpy as np import scipy +from sklearn.utils import Bunch from typing_extensions import Self from ...representation import FDataBasis, FDataIrregular @@ -124,7 +122,7 @@ def _linalg_solve( def _sum_mahalanobis( r_list: List[NDArrayFloat], cov_mat_list: List[NDArrayFloat], - r_list2: Optional[List[NDArrayFloat]] = None, + r_list2: List[NDArrayFloat] | None = None, ) -> NDArrayFloat: """sum_k ( r_list[k]^T @ cov_mat_list[k]^{-1} @ r_list2[k] ) @@ -369,19 +367,29 @@ class MixedEffectsConverter(_ToBasisConverter[FDataIrregular], ABC): """Abstract class for mixed effects to-basis-converters. TODO: explain the model in detail. + Args: + basis: Basis to convert to. + + Parameters: + result: Bunch containing the result of the fitting of the model. + Contains the parameters: + + - fitted_model: Fitted mixed effects model. + - fitted_params: Fitted parameters of the mixed effects model. + - minimize_result: Result of the scipy.optimize.minimize call, + if this function was used. + - success: Whether the fitting was successful. + - message: Message of the fitting. + - nit: Number of iterations of the fitting. 
""" # after fitting: - fitted_model: Optional[_MixedEffectsModel] - fitted_params: Optional[_MixedEffectsParams] - result: Optional[Dict[str, Any] | scipy.optimize.OptimizeResult] + result: Bunch | None def __init__( self, basis: Basis, ) -> None: - self.fitted_model = None - self.fitted_params = None self.result = None super().__init__(basis) @@ -390,12 +398,14 @@ def transform( X: FDataIrregular, ) -> FDataBasis: """Transform to FDataBasis using the fitted converter.""" - if self.fitted_params is None: # or self.model is None: + if self.result is None: raise ValueError("The converter has not been fitted.") model = _MixedEffectsModel(X, self.basis) - mean = self.fitted_params.mean - gamma_estimates = model.random_effects_estimate(self.fitted_params) + mean = self.result.fitted_params.mean + gamma_estimates = model.random_effects_estimate( + self.result.fitted_params, + ) coefficients = mean[np.newaxis, :] + gamma_estimates @@ -432,14 +442,14 @@ class Params: """ sqrt_cov_div_sigmasq: NDArrayFloat - _mean: Optional[NDArrayFloat] - _model: Optional[_MixedEffectsModel] + _mean: NDArrayFloat | None + _model: _MixedEffectsModel | None def __init__( self, sqrt_cov_div_sigmasq: NDArrayFloat, - mean: Optional[NDArrayFloat], - model: Optional[_MixedEffectsModel] = None, + mean: NDArrayFloat | None, + model: _MixedEffectsModel | None = None, ) -> None: if mean is None: assert model is not None, "model is required if mean is None" @@ -499,7 +509,7 @@ def from_vec( cls, vec: NDArrayFloat, dim_effects: int, - model: Optional[_MixedEffectsModel] = None, + model: _MixedEffectsModel | None = None, has_mean: bool = True, ) -> Self: """Create Params from vectorized parameters.""" @@ -529,10 +539,10 @@ def fit( X: FDataIrregular, y: object = None, *, - initial_params: Optional[ - MinimizeMixedEffectsConverter.Params | NDArrayFloat - ] = None, - minimization_method: Optional[str] = None, + initial_params: ( + MinimizeMixedEffectsConverter.Params | NDArrayFloat | None + ) = None, + 
minimization_method: str | None = None, has_mean: bool = True, ) -> Self: """Fit the model. @@ -550,7 +560,7 @@ def fit( self after fit """ dim_effects = self.basis.n_basis - model = _MixedEffectsModel(X, self.basis) + fitted_model = _MixedEffectsModel(X, self.basis) n_samples = X.n_samples if isinstance(initial_params, MinimizeMixedEffectsConverter.Params): initial_params_vec = initial_params.to_vec() @@ -563,36 +573,43 @@ def fit( initial_params_generic.covariance, ), mean=initial_params_generic.mean if has_mean else None, - model=model, + model=fitted_model, ).to_vec() if minimization_method is None: minimization_method = _SCIPY_MINIMIZATION_METHODS[0] def objective_function(params_vec: NDArrayFloat) -> float: - return - model.profile_loglikelihood( + return - fitted_model.profile_loglikelihood( params=MinimizeMixedEffectsConverter.Params.from_vec( params_vec, dim_effects, model=self, has_mean=has_mean, ) ) / n_samples - self.result = _minimize( + minimize_result = _minimize( fun=objective_function, x0=initial_params_vec, minimization_methods=minimization_method, ) - self.fitted_model = model params = MinimizeMixedEffectsConverter.Params.from_vec( - self.result.x, + minimize_result.x, dim_effects=dim_effects, - model=model, + model=fitted_model, has_mean=has_mean, ) - self.fitted_params = _MixedEffectsParamsResult( + fitted_params = _MixedEffectsParamsResult( mean=params.mean, covariance=params.covariance, sigmasq=params.sigmasq, ) + self.result = Bunch( + fitted_model=fitted_model, + fitted_params=fitted_params, + minimize_result=minimize_result, + success=minimize_result.success, + message=minimize_result.message, + nit=minimize_result.nit, + ) return self @@ -704,13 +721,13 @@ def fit( X: FDataIrregular, y: object = None, *, - initial_params: Optional[ - EMMixedEffectsConverter.Params | NDArrayFloat - ] = None, + initial_params: ( + EMMixedEffectsConverter.Params | NDArrayFloat | None + ) = None, maxiter: int = 700, - convergence_criterion: Optional[ - 
Literal["params", "squared-error", "loglikelihood"] - ] = None, + convergence_criterion: ( + Literal["params", "squared-error", "loglikelihood"] | None + ) = None, rtol: float = 1e-3, ) -> Self: """Fit the model using the EM algorithm. @@ -727,15 +744,12 @@ def fit( - "squared-error" to userelative changes in the squared error of the estimated values with respect to the original data. - "loglikelihood" to use relative changes in the loglikelihood. - # - "prop-offset" to use the criteria proposed by Bates & - # Watts 1981 (A Relative Offset Convergence Criterion for - # Nonlinear Least Squares). rtol: relative tolerance for convergence. Returns: The converter after fitting. """ - model = _MixedEffectsModel(X, self.basis) + fitted_model = _MixedEffectsModel(X, self.basis) if initial_params is None: initial_params_generic = _initial_params(self.basis.n_basis) @@ -762,23 +776,23 @@ def fit( use_error = convergence_criterion in ("squared-error",) if use_error: - big_values = np.concatenate(model.values) + big_values = np.concatenate(fitted_model.values) converged = False - convergence_val: Optional[NDArrayFloat | float] = None - prev_convergence_val: Optional[NDArrayFloat | float] = None + convergence_val: NDArrayFloat | float | None = None + prev_convergence_val: NDArrayFloat | float | None = None for iter_number in range(maxiter): curr_params = next_params - values_cov = model.values_covariances( + values_cov = fitted_model.values_covariances( curr_params.sigmasq, curr_params.covariance, ) - mean = self._mean(model, values_cov) - partial_residuals = model.partial_residuals(mean) - random_effects = model._random_effects_estimate( + mean = self._mean(fitted_model, values_cov) + partial_residuals = fitted_model.partial_residuals(mean) + random_effects = fitted_model._random_effects_estimate( curr_params.covariance, values_cov, partial_residuals, ) next_params = self._next_params( - model=model, + model=fitted_model, curr_params=curr_params, 
partial_residuals=partial_residuals, values_cov=values_cov, @@ -791,13 +805,13 @@ def fit( estimates = np.concatenate([ # estimated values basis_eval @ (mean + random_effect) for basis_eval, random_effect in zip( - model.basis_evaluations, random_effects, + fitted_model.basis_evaluations, random_effects, ) ]) error = big_values - estimates convergence_val = np.inner(error, error) # sum of squares elif convergence_criterion == "loglikelihood": - convergence_val = model.profile_loglikelihood( + convergence_val = fitted_model.profile_loglikelihood( _MixedEffectsParamsResult( mean=mean, covariance=next_params.covariance, @@ -822,21 +836,22 @@ def fit( f"{iter_number}/{maxiter} iterations." ) - self.result = { - "success": converged, - "message": message, - "nit": iter_number, - } - self.fitted_model = model - final_params = next_params - values_cov = model.values_covariances( + values_cov = fitted_model.values_covariances( curr_params.sigmasq, curr_params.covariance, ) - final_mean = self._mean(model, values_cov) - self.fitted_params = _MixedEffectsParamsResult( + final_mean = self._mean(fitted_model, values_cov) + fitted_params = _MixedEffectsParamsResult( mean=final_mean, covariance=final_params.covariance, sigmasq=final_params.sigmasq, ) + + self.result = Bunch( + fitted_model=fitted_model, + fitted_params=fitted_params, + success=converged, + message=message, + nit=iter_number, + ) return self diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py index 343306232..391c8af74 100644 --- a/skfda/tests/test_mixed_effects_converter.py +++ b/skfda/tests/test_mixed_effects_converter.py @@ -304,13 +304,7 @@ def _cmp_estimation_with_original( fdatairregular, **fit_kwargs, ) - if ( - isinstance(estimator.result, dict) - and "success" in estimator.result - and not estimator.result["success"] - ): - raise Exception(f"Optimization failed: {estimator.result}") - + assert estimator.result.success, "Optimization failed" assert 
r2_score(fdatabasis_estimated, fdatabasis_original) > 0.9 @@ -368,6 +362,16 @@ def fun(i: int): # ) +def test_compare_minimize_with_original() -> None: + """Compare the EM conversion with the original data.""" + _test_compare_with_original( + estimator_cls=MinimizeMixedEffectsConverter, + fit_kwargs={ + "minimization_method": "Powell", + } + ) + + def test_compare_em_with_original() -> None: """Compare the EM conversion with the original data.""" _test_compare_with_original( From f3663d85760723f71d0ac4d3464196356f80b9a6 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Wed, 24 Apr 2024 19:26:19 +0200 Subject: [PATCH 23/48] remove commented code from tests --- skfda/tests/test_mixed_effects_converter.py | 81 --------------------- 1 file changed, 81 deletions(-) diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py index 391c8af74..dfaa97fd2 100644 --- a/skfda/tests/test_mixed_effects_converter.py +++ b/skfda/tests/test_mixed_effects_converter.py @@ -11,7 +11,6 @@ Tuple, Type, ) -import matplotlib.pyplot as plt from skfda import FDataBasis from skfda.misc.scoring import r2_score @@ -207,80 +206,6 @@ def _get_points( ) -# def __test_simple_conversion() -> None: -# """Visual test.""" -# _max_val = 10 -# _domain_range = (0, 10) -# n_points = 6 -# n_basis = 5 -# n_samples = 50 -# points = _get_points(_domain_range, n_points, n_samples, 9) - -# basis = FourierBasis(n_basis=n_basis, domain_range=_domain_range) -# # BSplineBasis( -# # n_basis=n_basis, domain_range=_domain_range, order=n_basis - 1, -# # ) - -# sigma = 0.3 -# Gamma_sqrt = np.zeros((n_basis, n_basis)) -# Gamma_sqrt[np.tril_indices(n_basis)] = np.random.rand( -# n_basis * (n_basis + 1) // 2, -# ) * _max_val -# Gamma = Gamma_sqrt @ Gamma_sqrt.T -# beta = np.random.rand(n_basis) * _max_val -# fdatabasis_original = FDataBasis( -# basis=basis, -# coefficients=np.random.multivariate_normal( -# mean=beta, cov=Gamma, size=n_samples, -# ), -# ) - -# def fun(i: 
int) -> Callable[[NDArrayFloat], NDArrayFloat]: -# def fi(x: NDArrayFloat) -> NDArrayFloat: -# return fdatabasis_original[i](x).reshape(x.shape) -# return fi - -# funcs = [fun(i) for i in range(n_samples)] - -# fdatairregular = _create_irregular_samples( -# funcs=funcs, -# n_points=n_points, -# points=points, -# noise_generate_std=sigma, -# ) -# converter = MinimizeMixedEffectsConverter(basis) -# fdatabasis_estimated = converter.fit_transform(fdatairregular) -# fdatabasis_basic = fdatairregular.to_basis(basis) -# if True: -# _ = plt.figure(figsize=(15, 6)) - -# axes = plt.subplot(2, 2, 1) -# plt.title("Original data") -# fdatairregular[:5].plot(axes=axes) -# left, right = plt.ylim() -# plt.ylim((min(0, left), max(1.4, right))) - -# axes = plt.subplot(2, 2, 2) -# plt.title("Estimated basis representation.\n") -# fdatairregular.scatter(axes=axes) -# fdatabasis_estimated[:5].plot(axes=axes) -# left, right = plt.ylim() -# plt.ylim((min(0, left), max(1.4, right))) - -# axes = plt.subplot(2, 2, 4) -# plt.title("Original basis representation") -# fdatairregular.scatter(axes=axes) -# fdatabasis_original[:5].plot(axes=axes) -# left, right = plt.ylim() -# plt.ylim((min(0, left), max(1.4, right))) - -# axes = plt.subplot(2, 2, 3) -# plt.title(f"{basis}") -# basis.plot(axes=axes) - -# plt.show() - - def _cmp_estimation_with_original( n_points: int, sigma: float, # to generate the noise @@ -356,12 +281,6 @@ def fun(i: int): ) -# def test_compare_with_statsmodels_minimize() -> None: -# _test_general_compare_with_original( -# MinimizeMixedEffectsConverter, -# ) - - def test_compare_minimize_with_original() -> None: """Compare the EM conversion with the original data.""" _test_compare_with_original( From e90eac70ac2d0ebe5635f24d4ebf7da3d8b44f8e Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Fri, 26 Apr 2024 10:47:41 +0200 Subject: [PATCH 24/48] design of mixed effects converters and bib references --- docs/refs.bib | 27 ++++++++++ .../conversion/_mixed_effects.py | 52 
++++++++++--------- skfda/representation/irregular.py | 7 ++- 3 files changed, 60 insertions(+), 26 deletions(-) diff --git a/docs/refs.bib b/docs/refs.bib index 7d82543a0..4cce3aa0a 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -666,3 +666,30 @@ @article{james_2018_sparsenessfda year = {2018}, url = {https://api.semanticscholar.org/CorpusID:14265225} } + +@article{Lindstrom_1988, + doi = {10.1080/01621459.1988.10478693}, + title = {{N}ewton—{R}aphson and {EM} {A}lgorithms for {L}inear {M}ixed-{E}ffects {M}odels for {R}epeated-{M}easures {D}ata}, + author = {Lindstrom, Mary J. and Bates, Douglas M.}, + journal = {Journal of the American Statistical Association 1988-dec vol. 83 iss. 404}, + year = {1988}, + month = {dec}, + volume = {83}, + issue = {404}, + page = {1014--1022}, + url = {libgen.li/file.php?md5=432e6fa80db6feb0cb39b8d0215e5d3a} +} + +@article{laird+lange+stram_1987_emmixedeffects, + author = {Nan Laird, Nicholas Lange and Daniel Stram}, + title = {Maximum Likelihood Computations with Repeated Measures: Application of the EM Algorithm}, + journal = {Journal of the American Statistical Association}, + volume = {82}, + number = {397}, + pages = {97--105}, + year = {1987}, + publisher = {Taylor \& Francis}, + doi = {10.1080/01621459.1987.10478395}, + url = {https://www.tandfonline.com/doi/abs/10.1080/01621459.1987.10478395}, + eprint = {https://www.tandfonline.com/doi/pdf/10.1080/01621459.1987.10478395} +} \ No newline at end of file diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index b44c8a392..e228abc87 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -366,7 +366,6 @@ def n_samples(self) -> int: class MixedEffectsConverter(_ToBasisConverter[FDataIrregular], ABC): """Abstract class for mixed effects to-basis-converters. - TODO: explain the model in detail. Args: basis: Basis to convert to. 
@@ -374,7 +373,7 @@ class MixedEffectsConverter(_ToBasisConverter[FDataIrregular], ABC): result: Bunch containing the result of the fitting of the model. Contains the parameters: - - fitted_model: Fitted mixed effects model. + - model: Fitted mixed effects model. - fitted_params: Fitted parameters of the mixed effects model. - minimize_result: Result of the scipy.optimize.minimize call, if this function was used. @@ -424,7 +423,7 @@ class MinimizeMixedEffectsConverter(MixedEffectsConverter): """Mixed effects to-basis-converter using scipy.optimize. Minimizes the profile loglikelihood of the mixed effects model as proposed - by Mary J. Lindstrom & Douglas M. Bates (1988). + by :footcite:t:`Lindstrom_1988`. """ @dataclass(frozen=True) @@ -551,7 +550,7 @@ def fit( X: irregular data to fit. y: ignored. initial_params: initial params of the model. - minimization_methods: scipy.optimize.minimize method to be used for + minimization_method: scipy.optimize.minimize method to be used for the minimization of the loglikelihood of the model. has_mean: Whether the mean is a fixed parameter to be optimized or estimated with ML estimator from the covariance parameters. 
@@ -560,7 +559,7 @@ def fit( self after fit """ dim_effects = self.basis.n_basis - fitted_model = _MixedEffectsModel(X, self.basis) + model = _MixedEffectsModel(X, self.basis) n_samples = X.n_samples if isinstance(initial_params, MinimizeMixedEffectsConverter.Params): initial_params_vec = initial_params.to_vec() @@ -573,14 +572,14 @@ def fit( initial_params_generic.covariance, ), mean=initial_params_generic.mean if has_mean else None, - model=fitted_model, + model=model, ).to_vec() if minimization_method is None: minimization_method = _SCIPY_MINIMIZATION_METHODS[0] def objective_function(params_vec: NDArrayFloat) -> float: - return - fitted_model.profile_loglikelihood( + return - model.profile_loglikelihood( params=MinimizeMixedEffectsConverter.Params.from_vec( params_vec, dim_effects, model=self, has_mean=has_mean, ) @@ -594,7 +593,7 @@ def objective_function(params_vec: NDArrayFloat) -> float: params = MinimizeMixedEffectsConverter.Params.from_vec( minimize_result.x, dim_effects=dim_effects, - model=fitted_model, + model=model, has_mean=has_mean, ) fitted_params = _MixedEffectsParamsResult( @@ -603,7 +602,7 @@ def objective_function(params_vec: NDArrayFloat) -> float: sigmasq=params.sigmasq, ) self.result = Bunch( - fitted_model=fitted_model, + model=model, fitted_params=fitted_params, minimize_result=minimize_result, success=minimize_result.success, @@ -615,7 +614,12 @@ def objective_function(params_vec: NDArrayFloat) -> float: class EMMixedEffectsConverter(MixedEffectsConverter): - """Mixed effects to-basis-converter using the EM algorithm.""" + """Mixed effects to-basis-converter using the EM algorithm. + + Minimizes the profile loglikelihood of the mixed effects model with the EM + algorithm as proposed by + :footcite:t:`laird+lange+stram_1987_emmixedeffects`. + """ @dataclass(frozen=True) class Params: """Mixed effects parameters for the EM algorithm.""" @@ -736,12 +740,12 @@ def fit( X: irregular data to fit. y: ignored. 
initial_params: initial params of the model. - niter: maximum number of iterations. + maxiter: maximum number of iterations. convergence_criterion: convergence criterion to use when fitting. - "params" to use relative differences between parameters (the default). - - "squared-error" to userelative changes in the squared error + - "squared-error" to use relative changes in the squared error of the estimated values with respect to the original data. - "loglikelihood" to use relative changes in the loglikelihood. rtol: relative tolerance for convergence. @@ -749,7 +753,7 @@ def fit( Returns: The converter after fitting. """ - fitted_model = _MixedEffectsModel(X, self.basis) + model = _MixedEffectsModel(X, self.basis) if initial_params is None: initial_params_generic = _initial_params(self.basis.n_basis) @@ -776,23 +780,23 @@ def fit( use_error = convergence_criterion in ("squared-error",) if use_error: - big_values = np.concatenate(fitted_model.values) + big_values = np.concatenate(model.values) converged = False convergence_val: NDArrayFloat | float | None = None prev_convergence_val: NDArrayFloat | float | None = None for iter_number in range(maxiter): curr_params = next_params - values_cov = fitted_model.values_covariances( + values_cov = model.values_covariances( curr_params.sigmasq, curr_params.covariance, ) - mean = self._mean(fitted_model, values_cov) - partial_residuals = fitted_model.partial_residuals(mean) - random_effects = fitted_model._random_effects_estimate( + mean = self._mean(model, values_cov) + partial_residuals = model.partial_residuals(mean) + random_effects = model._random_effects_estimate( curr_params.covariance, values_cov, partial_residuals, ) next_params = self._next_params( - model=fitted_model, + model=model, curr_params=curr_params, partial_residuals=partial_residuals, values_cov=values_cov, @@ -805,13 +809,13 @@ def fit( estimates = np.concatenate([ # estimated values basis_eval @ (mean + random_effect) for basis_eval, random_effect in 
zip( - fitted_model.basis_evaluations, random_effects, + model.basis_evaluations, random_effects, ) ]) error = big_values - estimates convergence_val = np.inner(error, error) # sum of squares elif convergence_criterion == "loglikelihood": - convergence_val = fitted_model.profile_loglikelihood( + convergence_val = model.profile_loglikelihood( _MixedEffectsParamsResult( mean=mean, covariance=next_params.covariance, @@ -837,10 +841,10 @@ def fit( ) final_params = next_params - values_cov = fitted_model.values_covariances( + values_cov = model.values_covariances( curr_params.sigmasq, curr_params.covariance, ) - final_mean = self._mean(fitted_model, values_cov) + final_mean = self._mean(model, values_cov) fitted_params = _MixedEffectsParamsResult( mean=final_mean, covariance=final_params.covariance, @@ -848,7 +852,7 @@ def fit( ) self.result = Bunch( - fitted_model=fitted_model, + model=model, fitted_params=fitted_params, success=converged, message=message, diff --git a/skfda/representation/irregular.py b/skfda/representation/irregular.py index 8cba1e6bf..b1e65346e 100644 --- a/skfda/representation/irregular.py +++ b/skfda/representation/irregular.py @@ -1108,7 +1108,7 @@ def to_basis( model: :class:`~skfda.representation.conversion.MinimizeMixedEffectsConverter`. kwargs: keyword arguments to be passed to FDataBasis.from_data() - in the case of conversion_type="separately. If conversion_type + in the case of conversion_type="separately". If conversion_type has another value, the keyword arguments are passed to the fit method of the :class:`~skfda.representation.conversion.MixedEffectsConverter`. 
@@ -1173,7 +1173,7 @@ def to_basis( if not basis.is_domain_range_fixed(): basis = basis.copy(domain_range=self.domain_range) - if conversion_type != "function-wise": + if conversion_type in ("mixed-effects", "mixed-effects-minimize"): from ..representation.conversion import ( EMMixedEffectsConverter, MinimizeMixedEffectsConverter, @@ -1185,6 +1185,9 @@ def to_basis( converter = converter_class(basis) return converter.fit_transform(self, **kwargs) + if conversion_type != "function-wise": + raise ValueError(f"Invalid conversion type: {conversion_type}") + from ..preprocessing.smoothing import BasisSmoother smoother = BasisSmoother( basis=basis, From c5bd7a5fd6a5e225e84a699d2e701f920d5af38b Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Fri, 26 Apr 2024 23:32:35 +0200 Subject: [PATCH 25/48] comments and remove unnecessary test --- .../conversion/_mixed_effects.py | 30 +++++++++++++++ skfda/representation/irregular.py | 2 +- skfda/tests/test_mixed_effects_converter.py | 38 ------------------- 3 files changed, 31 insertions(+), 39 deletions(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index e228abc87..7abb2c2c3 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -56,6 +56,23 @@ def _get_values_list( fdatairregular: FDataIrregular, ) -> List[NDArrayFloat]: + """Get the values vectors for the mixed-effects model. + + Args: + fdatairregular: Irregular data. + + Returns: + List of values vectors (one vector per functional datum). + + Examples: + >>> fdata = FDataIrregular( + ... start_indices=[0, 1, 5], + ... values=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), + ... points=list(range(9)), + ... 
) + >>> _get_values_list(fdata) + [array([1]), array([2, 3, 4, 5]), array([6, 7, 8, 9])] + """ assert fdatairregular.dim_domain == 1 assert fdatairregular.dim_codomain == 1 return np.split( @@ -68,6 +85,19 @@ def _get_basis_evaluations_list( fdatairregular: FDataIrregular, basis: Basis, ) -> List[NDArrayFloat]: + """Get the matrix of basis evaluations for the mixed-effects model. + + Args: + fdatairregular: Irregular data. + basis: Basis to evaluate. + + Returns: + A list of matrices (one matrix per functional datum), each matrix is + of shape (n_points, n_basis), where n_points is the number of points + of the functional datum and n_basis is the number of basis functions. + The i-th row of the matrix is the evaluation of the basis functions at + the i-th point of the functional datum. + """ assert fdatairregular.dim_domain == 1 assert fdatairregular.dim_codomain == 1 return np.split( diff --git a/skfda/representation/irregular.py b/skfda/representation/irregular.py index b1e65346e..afd94d724 100644 --- a/skfda/representation/irregular.py +++ b/skfda/representation/irregular.py @@ -599,7 +599,7 @@ def derivative( def integrate( self: T, *, - domain: Optional[DomainRange] = None, + domain: DomainRange | None = None, ) -> NDArrayFloat: """Integrate the FDataIrregular object. 
diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py index dfaa97fd2..91b1afda0 100644 --- a/skfda/tests/test_mixed_effects_converter.py +++ b/skfda/tests/test_mixed_effects_converter.py @@ -32,12 +32,6 @@ _MixedEffectsModel, ) -_fdatairregular = FDataIrregular( - start_indices=[0, 1, 5], - values=np.array([1, 2, 3, 4, 5, 6, 7, 8, 9]), - points=list(range(9)), -) - def test_loglikelihood() -> None: """Test loglikelihood function comparing it with Statsmodels' MixedLM.""" @@ -108,38 +102,6 @@ def test_loglikelihood() -> None: assert np.allclose(mixedlm_loglikelihood, model_loglikelihood) -def test_values_list() -> None: - """Test conversion from FDataIrregular to ME model: values.""" - fdatairregular = _fdatairregular - x_list = _get_values_list(fdatairregular) - expected_x_list = [ - np.array([1]), - np.array([2, 3, 4, 5]), - np.array([6, 7, 8, 9]), - ] - for x, expected_x in zip(x_list, expected_x_list): - assert np.all(x == expected_x) - - -def test_basis_evaluations_list() -> None: - """Test conversion from FDataIrregular to ME model: basis evaluations.""" - fdatairregular = _fdatairregular - basis = FourierBasis(n_basis=3, domain_range=(0, 10)) - phi_list = _get_basis_evaluations_list(fdatairregular, basis) - - def eval_basis(x: float) -> npt.NDArray[np.float_]: - return basis(x).reshape(-1) - - expected_phi = [ - np.array([eval_basis(0)]), - np.array([eval_basis(j) for j in [1, 2, 3, 4]]), - np.array([eval_basis(j) for j in [5, 6, 7, 8]]), - ] - - for phi, expected_phi in zip(phi_list, expected_phi): - np.testing.assert_allclose(phi, expected_phi) - - def _create_irregular_samples( funcs: Iterable[ Callable[[npt.NDArray[np.float_]], npt.NDArray[np.float_]] From a9c2ee40312ff20f39a08b16c86fdc8b911b7d32 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 27 Apr 2024 11:02:51 +0200 Subject: [PATCH 26/48] example --- .../plot_irregular_to_basis_mixed_effects.py | 46 +++++++++---------- 1 file changed, 22 
insertions(+), 24 deletions(-) diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py index f90a7d5c4..4124bf769 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -33,23 +33,23 @@ # :footcite:t:`james_2018_sparsenessfda`. This just means that # the coefficients of the basis representation are generated from a Gaussian # distribution. -n_curves = 50 +n_curves = 70 n_basis = 4 -domain_range = (0, 12) +domain_range = (0, 10) basis = BSplineBasis(n_basis=n_basis, domain_range=domain_range, order=3) plt.figure(figsize=(10, 5)) basis.plot() plt.title("Basis functions") -coeff_mean = np.array([-10, 20, -24, 4]) +coeff_mean = np.array([-15, 20, -4, 6]) coeff_cov_sqrt = np.array([ - [3.2, 0.0, 0.0, 0.0], - [0.4, 6.0, 0.0, 0.0], - [0.3, 1.5, 2.0, 0.0], - [1.2, 0.3, 2.5, 1.8], + [4.0, 0.0, 0.0, 0.0], + [1.2, 2.6, 0.0, 0.0], + [4.7, 2.9, 2.0, 0.0], + [4.9, 0.3, 0.1, 3.6], ]) -random_state = np.random.RandomState(seed=4934755) +random_state = np.random.RandomState(seed=34285676) coefficients = ( coeff_mean + random_state.normal(size=(n_curves, n_basis)) @ coeff_cov_sqrt ) @@ -68,10 +68,10 @@ # Moreover, we add some noise to the data. fd_irregular_without_noise = irregular_sample( fdatabasis_original, - n_points_per_curve=random_state.randint(3, 5, n_curves), + n_points_per_curve=random_state.randint(4, 8, n_curves), random_state=random_state, ) -noise_std = 0.1 +noise_std = 1 fd_irregular = FDataIrregular( points=fd_irregular_without_noise.points, start_indices=fd_irregular_without_noise.start_indices, @@ -118,30 +118,28 @@ # %% # To visualize the conversion results, we plot the first 8 original and # converted curves of the test set. On the background, we plot the train set. 
-fig = plt.figure(figsize=(10, 25)) -for k in range(8): - axes = plt.subplot(8, 1, k + 1) - - # train_original.plot(axes=axes, color=(0, 0, 0, 0.05)) - # train_irregular.scatter(axes=axes, color=(0, 0, 0, 0.05), marker=".") +fig = plt.figure(figsize=(11, 16)) +plt.suptitle("Comparison of the original and converted data (test set)") +for k in range(10): + axes = plt.subplot(5, 2, k + 1) test_irregular[k].scatter( axes=axes, color=f"C{k}", label="Irregular" ) test_curvewise_to_basis[k].plot( - axes=axes, color=f"C{k}", linestyle=":", - label="Curve-wise conversion", + axes=axes, color=f"C{k}", linestyle=":", label="Curve-wise", ) test_converted[k].plot( - axes=axes, color=f"C{k}", linestyle="--", - label="Mixed-effects conversion", + axes=axes, color=f"C{k}", linestyle="--", label="Mixed-effects", ) test_original[k].plot( - axes=axes, color=f"C{k}", alpha=0.5, - label="Original basis representation", + axes=axes, color=f"C{k}", alpha=0.5, label="Original", ) - axes.legend(bbox_to_anchor=(1., 1.)) + # axes.legend(bbox_to_anchor=(1., 0.3)) + axes.legend() plt.tight_layout(rect=[0, 0, 1, 0.98]) + # Same scale for all plots: + plt.ylim((-17, 15)) plt.show() # %% @@ -166,7 +164,7 @@ print("-" * 35) print(f"{score_name} scores:") print("-" * 35) - print(score_df, end=f"\n\n\n") + print(score_df, end="\n\n\n") # %% # References From de2cf1eebc28c102fd0482e97f17d5e50dcb65e9 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 27 Apr 2024 19:44:40 +0200 Subject: [PATCH 27/48] . 
--- .../plot_irregular_mixed_effects_robustness.py | 16 +++++++++++----- .../plot_irregular_to_basis_mixed_effects.py | 10 +++++----- examples/table.tex | 12 ++++++++++++ 3 files changed, 28 insertions(+), 10 deletions(-) create mode 100644 examples/table.tex diff --git a/examples/plot_irregular_mixed_effects_robustness.py b/examples/plot_irregular_mixed_effects_robustness.py index 4e3797912..b76b89e0f 100644 --- a/examples/plot_irregular_mixed_effects_robustness.py +++ b/examples/plot_irregular_mixed_effects_robustness.py @@ -6,6 +6,7 @@ effects model and checks the robustness of the method by fitting the model with decreasing number of measurement points per curve. """ +# %% # Author: Pablo Cuesta Sierra # License: MIT @@ -40,7 +41,8 @@ # %% # We split the data into train and test sets: -random_state = np.random.RandomState(seed=4934792) +seed = 13627798 +random_state = np.random.RandomState(seed=seed) train_original, test_original = train_test_split( fd_temperatures, test_size=0.3, @@ -143,10 +145,14 @@ print("-" * 62) print(( pd.DataFrame(scores[score_name]) - .set_index("n_points_per_curve").sort_index().to_string() - ), end="\n\n\n") - -# %% + .sort_values("n_points_per_curve") + # .style.format(precision=4) + .round(4).astype(str) + .to_latex() + )) +print(f"{seed=}") + + # %% # The following plots show the original curves along with the converted # test curves for the conversions with 5, 4 and 3 points per curve. 
diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py index 4124bf769..570cd8c8c 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -71,7 +71,7 @@ n_points_per_curve=random_state.randint(4, 8, n_curves), random_state=random_state, ) -noise_std = 1 +noise_std = .5 fd_irregular = FDataIrregular( points=fd_irregular_without_noise.points, start_indices=fd_irregular_without_noise.start_indices, @@ -123,16 +123,16 @@ for k in range(10): axes = plt.subplot(5, 2, k + 1) - test_irregular[k].scatter( + test_irregular[k+1].scatter( axes=axes, color=f"C{k}", label="Irregular" ) - test_curvewise_to_basis[k].plot( + test_curvewise_to_basis[k+1].plot( axes=axes, color=f"C{k}", linestyle=":", label="Curve-wise", ) - test_converted[k].plot( + test_converted[k+1].plot( axes=axes, color=f"C{k}", linestyle="--", label="Mixed-effects", ) - test_original[k].plot( + test_original[k+1].plot( axes=axes, color=f"C{k}", alpha=0.5, label="Original", ) # axes.legend(bbox_to_anchor=(1., 0.3)) diff --git a/examples/table.tex b/examples/table.tex new file mode 100644 index 000000000..9e42d06f1 --- /dev/null +++ b/examples/table.tex @@ -0,0 +1,12 @@ +\begin{tabular}{lrrrr} +\toprule + & n_points_per_curve & Train-sparse & Test-sparse & Test-original \\ +\midrule +5 & 3 & 47.800565 & 54.602773 & 43.214559 \\ +4 & 4 & 9.875101 & 9.232975 & 3.388808 \\ +3 & 5 & 6.568285 & 6.256520 & 1.323652 \\ +2 & 7 & 2.124880 & 2.336709 & 1.221686 \\ +1 & 10 & 1.565116 & 2.250119 & 1.258129 \\ +0 & 40 & 0.953337 & 1.316078 & 1.103929 \\ +\bottomrule +\end{tabular} From abb53aa0d11d25ae66543e53c80e25e2ed54cf3a Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 27 Apr 2024 19:49:31 +0200 Subject: [PATCH 28/48] fix previous commit which was a mistake --- .../plot_irregular_mixed_effects_robustness.py | 18 ++++++------------ examples/table.tex | 12 ------------ 2 files changed, 
6 insertions(+), 24 deletions(-) delete mode 100644 examples/table.tex diff --git a/examples/plot_irregular_mixed_effects_robustness.py b/examples/plot_irregular_mixed_effects_robustness.py index b76b89e0f..3f0535d00 100644 --- a/examples/plot_irregular_mixed_effects_robustness.py +++ b/examples/plot_irregular_mixed_effects_robustness.py @@ -4,9 +4,8 @@ This example converts irregular data to a basis representation using a mixed effects model and checks the robustness of the method by fitting -the model with decreasing number of measurement points per curve. +the model with decreasing number of measurement points per curve. """ -# %% # Author: Pablo Cuesta Sierra # License: MIT @@ -41,8 +40,7 @@ # %% # We split the data into train and test sets: -seed = 13627798 -random_state = np.random.RandomState(seed=seed) +random_state = np.random.RandomState(seed=13627798) train_original, test_original = train_test_split( fd_temperatures, test_size=0.3, @@ -145,14 +143,10 @@ print("-" * 62) print(( pd.DataFrame(scores[score_name]) - .sort_values("n_points_per_curve") - # .style.format(precision=4) - .round(4).astype(str) - .to_latex() - )) -print(f"{seed=}") - - # %% + .set_index("n_points_per_curve").sort_index().to_string() + ), end="\n\n\n") + +# %% # The following plots show the original curves along with the converted # test curves for the conversions with 5, 4 and 3 points per curve. 
diff --git a/examples/table.tex b/examples/table.tex deleted file mode 100644 index 9e42d06f1..000000000 --- a/examples/table.tex +++ /dev/null @@ -1,12 +0,0 @@ -\begin{tabular}{lrrrr} -\toprule - & n_points_per_curve & Train-sparse & Test-sparse & Test-original \\ -\midrule -5 & 3 & 47.800565 & 54.602773 & 43.214559 \\ -4 & 4 & 9.875101 & 9.232975 & 3.388808 \\ -3 & 5 & 6.568285 & 6.256520 & 1.323652 \\ -2 & 7 & 2.124880 & 2.336709 & 1.221686 \\ -1 & 10 & 1.565116 & 2.250119 & 1.258129 \\ -0 & 40 & 0.953337 & 1.316078 & 1.103929 \\ -\bottomrule -\end{tabular} From 702d47a621009a1009216c7cad3d0f9de8b8c182 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 8 Jun 2024 16:58:58 +0200 Subject: [PATCH 29/48] simple conversion with real data too in example/tutorial --- .../plot_irregular_to_basis_mixed_effects.py | 213 ++++++++++++++---- 1 file changed, 173 insertions(+), 40 deletions(-) diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py index 570cd8c8c..6248bbd78 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -17,18 +17,20 @@ from sklearn.model_selection import train_test_split from skfda import FDataBasis, FDataIrregular -from skfda.datasets import irregular_sample -from skfda.representation.basis import BSplineBasis +from skfda.datasets import fetch_weather, irregular_sample +from skfda.representation.basis import BSplineBasis, FourierBasis from skfda.representation.conversion import EMMixedEffectsConverter from skfda.misc.scoring import r2_score, mean_squared_error # %% +# Synthetic data +# -------------- # For this example, we are going to simulate the irregular # sampling of a dataset following the mixed effects model, to later attempt to # reconstruct said original dataset. 
# -# First, we generate the original basis representation of the data following +# We generate the original basis representation of the data following # the mixed effects model for irregular data as presented by # :footcite:t:`james_2018_sparsenessfda`. This just means that # the coefficients of the basis representation are generated from a Gaussian @@ -38,16 +40,12 @@ domain_range = (0, 10) basis = BSplineBasis(n_basis=n_basis, domain_range=domain_range, order=3) -plt.figure(figsize=(10, 5)) -basis.plot() -plt.title("Basis functions") - coeff_mean = np.array([-15, 20, -4, 6]) coeff_cov_sqrt = np.array([ [4.0, 0.0, 0.0, 0.0], - [1.2, 2.6, 0.0, 0.0], + [-3.2, -2.6, 0.0, 0.0], [4.7, 2.9, 2.0, 0.0], - [4.9, 0.3, 0.1, 3.6], + [-1.9, 6.3, 4.6, -3.6], ]) random_state = np.random.RandomState(seed=34285676) coefficients = ( @@ -55,23 +53,28 @@ ) fdatabasis_original = FDataBasis(basis, coefficients) -# Plot the first 6 curves +# %% +# Plot the basis functions used to generate the data +basis.plot() +plt.title("Basis functions") + +# %% +# Plot some of the generated curves plt.figure(figsize=(10, 5)) -fdatabasis_original[:6].plot() +fdatabasis_original[:10].plot() plt.title("Original curves") plt.show() - # %% -# Sencondly, we subsample of the original data by measuring a random number of +# We subsample the original data by measuring a random number of # points per curve generating an irregular dataset. -# Moreover, we add some noise to the data. +# Moreover, we add some gaussian noise to the data. 
fd_irregular_without_noise = irregular_sample( - fdatabasis_original, - n_points_per_curve=random_state.randint(4, 8, n_curves), + fdata=fdatabasis_original, + n_points_per_curve=random_state.randint(2, 6, n_curves), random_state=random_state, ) -noise_std = .5 +noise_std = .3 fd_irregular = FDataIrregular( points=fd_irregular_without_noise.points, start_indices=fd_irregular_without_noise.start_indices, @@ -80,12 +83,14 @@ ), ) -# Plot 9 curves of the newly created irregular data -fig = plt.figure(figsize=(10, 10)) -for k in range(9): - axes = plt.subplot(3, 3, k + 1) +# %% +# Plot 3 curves of the newly created irregular data along with the original +fig = plt.figure(figsize=(10, 3)) +for k in range(3): + axes = plt.subplot(1, 3, k + 1) fdatabasis_original[k].plot(axes=axes, alpha=0.3, color=f"C{k}") fd_irregular[k].plot(axes=axes, marker=".", color=f"C{k}") + plt.ylim((-27, 27)) plt.show() # %% @@ -112,37 +117,49 @@ train_converted = converter.transform(train_irregular) test_converted = converter.transform(test_irregular) -train_curvewise_to_basis = train_irregular.to_basis(basis) -test_curvewise_to_basis = test_irregular.to_basis(basis) +train_functionwise_to_basis = train_irregular.to_basis( + basis, + conversion_type="function-wise", +) +test_functionwise_to_basis = test_irregular.to_basis( + basis, + conversion_type="function-wise", +) # %% -# To visualize the conversion results, we plot the first 8 original and -# converted curves of the test set. On the background, we plot the train set. +# To visualize the conversion results, we plot the first original and +# converted curves of the test set. 
fig = plt.figure(figsize=(11, 16)) plt.suptitle("Comparison of the original and converted data (test set)") for k in range(10): axes = plt.subplot(5, 2, k + 1) - test_irregular[k+1].scatter( - axes=axes, color=f"C{k}", label="Irregular" + test_irregular[k].scatter(axes=axes, color=f"C{k}", label="Irregular") + test_original[k].plot( + axes=axes, color=f"C{k}", alpha=0.5, label="Original", ) - test_curvewise_to_basis[k+1].plot( - axes=axes, color=f"C{k}", linestyle=":", label="Curve-wise", + test_functionwise_to_basis[k].plot( + axes=axes, color=f"C{k}", linestyle=":", label="Function-wise", ) - test_converted[k+1].plot( + test_converted[k].plot( axes=axes, color=f"C{k}", linestyle="--", label="Mixed-effects", ) - test_original[k+1].plot( - axes=axes, color=f"C{k}", alpha=0.5, label="Original", - ) - # axes.legend(bbox_to_anchor=(1., 0.3)) axes.legend() - plt.tight_layout(rect=[0, 0, 1, 0.98]) - # Same scale for all plots: - plt.ylim((-17, 15)) + plt.ylim((-27, 27)) # Same scale for all plots + +plt.tight_layout(rect=[0, 0, 1, 0.98]) plt.show() # %% +# As can be seen in the previous plot, when measurements are distributed +# across the domain, both the mixed effects model and the function-wise +# conversion are able to provide a good approximation of the original data. +# However, when the measurements are concentrated in a small region of +# the domain, we can see that the mixed effects model is able to provide a more +# accurate approximation. Moreover, the mixed effects model is able to remove +# the noise from the measurements, which is not the case for the function-wise +# conversion. +# # Finally, we make use of the :math:`R^2` score and the :math:`MSE` to compare # the converted basis representations with the original data, both for the # train and test sets. 
@@ -154,18 +171,134 @@ "Test": score_fun(test_original, test_converted), }, "Curve-wise": { - "Train": score_fun(train_original, train_curvewise_to_basis), - "Test": score_fun(test_original, test_curvewise_to_basis), + "Train": score_fun(train_original, train_functionwise_to_basis), + "Test": score_fun(test_original, test_functionwise_to_basis), }, }) for score_name, score_fun in score_functions.items() } for score_name, score_df in scores.items(): - print("-" * 35) print(f"{score_name} scores:") print("-" * 35) print(score_df, end="\n\n\n") + +# %% +# Real-world data +# --------------- +# The Canadian Weather dataset is downloaded from the package 'fda' in +# CRAN. It contains an FDataGrid with daily temperatures and precipitations, +# that is, it has a 2-dimensional image. We are interested only in the daily +# average temperatures, so we will use the first coordinate. +# +# As we want to illustrate the conversion of irregular data to basis +# representation, we will take an irregular sample of the temperatures dataset +# containing only 8 points per curve. +weather = fetch_weather() +fd_temperatures = weather.data.coordinates[0] + +random_state = np.random.RandomState(seed=439472) # for reproducibility +irregular_temperatures = irregular_sample( + fdata=fd_temperatures, n_points_per_curve=8, random_state=random_state, +) +# %% +# The dataset contains information about the region of each station, +# which have different types of climate. We save the indices of the stations +# in each region to later plot some of them. +print(weather.categories["region"]) +arctic = np.where(weather.target == 0)[0] +atlantic = np.where(weather.target == 1)[0] +continental = np.where(weather.target == 2)[0] +pacific = np.where(weather.target == 3)[0] + + +# %% +# Here we plot the original data alongside one of the original curves +# and its irregularly sampled version.
+fig = plt.figure(figsize=(10, 4)) + +axes = plt.subplot(1, 2, 1) +fd_temperatures.plot(axes=axes) +ylim = axes.get_ylim() +plt.title("All temperature curves") + +axes = plt.subplot(1, 2, 2) +k = 13 # index of the station +fd_temperatures[k].plot(axes=axes, color="black", alpha=0.4) +irregular_temperatures[k].scatter(axes=axes, color="black", marker="o") +plt.ylim(ylim) +plt.title(f"{fd_temperatures.sample_names[k]} station's temperature curve") + +plt.show() + +# %% +# Now, we convert the irregularly sampled temperature curves to basis +# representation. Due to the periodicity of the data, a Fourier basis is used. +basis = FourierBasis(n_basis=5, domain_range=fd_temperatures.domain_range) +irregular_temperatures_converted = irregular_temperatures.to_basis( + basis, conversion_type="mixed-effects", +) +curvewise_temperatures_converted = irregular_temperatures.to_basis( + basis, conversion_type="function-wise", +) + +# %% +# To visualize the conversion, we now plot 4 of the converted +# curves (one from each region) along with the original temperatures +# and the irregular points that we sampled. 
+idxes = [arctic[2], atlantic[4], continental[11], pacific[1]] +fig = plt.figure(figsize=(10, 10)) +for k in range(4): + axes = plt.subplot(2, 2, k + 1) + plt.tight_layout() + idx = idxes[k] + fd_temperatures[idx].plot( + axes=axes, color=f"C{k}", alpha=0.5, label="Original", + ) + curvewise_temperatures_converted[idx].plot( + axes=axes, color=f"C{k}", linestyle=":", label="Function-wise", + ) + irregular_temperatures_converted[idx].plot( + axes=axes, color=f"C{k}", linestyle="--", label="Mixed-effects", + ) + irregular_temperatures[idx].scatter( + axes=axes, color=f"C{k}", alpha=0.5, label="Irregular", + ) + plt.title( + f"{fd_temperatures.sample_names[idx]} station " + f"({weather.categories['region'][weather.target[idx]]})" + ) + plt.ylim(ylim) + axes.legend() + +plt.show() + +# %% +# Finally, we get a score of the quality of the conversion by comparing +# the obtained basis representation with the original data from the CRAN +# dataset. The :math:`R^2` score is used. +# +# Note that, to compare the original data and the basis representation (which +# have different :class:`FData` types), we have to evaluate the latter at +# the grid points of the former. +r2_me = r2_score( + fd_temperatures, + irregular_temperatures_converted.to_grid(fd_temperatures.grid_points), +) +r2_curvewise = r2_score( + fd_temperatures, + curvewise_temperatures_converted.to_grid(fd_temperatures.grid_points), +) +print(f"R2 score (function-wise): {r2_curvewise:f}") +print(f"R2 score (mixed-effects): {r2_me:f}") + +# %% +# As in the synthetic case, both conversion types are similar for the curves +# where the measurements are distributed across the domain. Otherwise, the +# mixed-effects model provides a more accurate approximation in the regions +# where the measurements of one curve are missing by using the information +# from the whole dataset. 
+ # %% # References # ---------- From e2a4466b8ec2fb7eff57f6404dd3eb72167c1977 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 8 Jun 2024 17:01:22 +0200 Subject: [PATCH 30/48] remove extra comment line --- examples/plot_irregular_to_basis_mixed_effects.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py index 6248bbd78..c1d46177b 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -5,7 +5,6 @@ This example converts irregular data to a basis representation using a mixed effects model. """ -# %% # Author: Pablo Cuesta Sierra # License: MIT From e6f00da2a6efce0bcffe762b4e6076c7bbf22626 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 8 Jun 2024 17:02:57 +0200 Subject: [PATCH 31/48] isort --- examples/plot_irregular_to_basis_mixed_effects.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py index c1d46177b..fb0bd8b52 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -17,10 +17,9 @@ from skfda import FDataBasis, FDataIrregular from skfda.datasets import fetch_weather, irregular_sample +from skfda.misc.scoring import mean_squared_error, r2_score from skfda.representation.basis import BSplineBasis, FourierBasis from skfda.representation.conversion import EMMixedEffectsConverter -from skfda.misc.scoring import r2_score, mean_squared_error - # %% # Sythetic data From ba34522aba32959f34d4c8ee8bd31e0d66136125 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sat, 8 Jun 2024 23:43:44 +0200 Subject: [PATCH 32/48] wip example of decimation --- ...plot_irregular_mixed_effects_robustness.py | 145 ++++++++++++++---- 1 file changed, 113 insertions(+), 32 deletions(-) diff --git 
a/examples/plot_irregular_mixed_effects_robustness.py b/examples/plot_irregular_mixed_effects_robustness.py index 3f0535d00..8c765ec66 100644 --- a/examples/plot_irregular_mixed_effects_robustness.py +++ b/examples/plot_irregular_mixed_effects_robustness.py @@ -1,11 +1,13 @@ """ -Mixed-effects model for irregular data when removing measurement points +Mixed effects model for irregular data: robustness of the conversion by +decimation ======================================================================= This example converts irregular data to a basis representation using a mixed effects model and checks the robustness of the method by fitting the model with decreasing number of measurement points per curve. """ +# %% # Author: Pablo Cuesta Sierra # License: MIT @@ -18,24 +20,38 @@ from skfda import FDataIrregular from skfda.datasets import fetch_weather, irregular_sample +from skfda.misc.scoring import mean_squared_error, r2_score from skfda.representation.basis import FourierBasis from skfda.representation.conversion import EMMixedEffectsConverter -from skfda.misc.scoring import r2_score, mean_squared_error - # %% # For this example, we are going to check the robustness of # the mixed effects method for converting irregular data to basis # representation by removing some measurement points from the test and train -# sets and comparing the results. The temperatures from the Canadian weather -# dataset are used to generate the irregular data. +# sets and comparing the resulting conversions. +# +# The temperatures from the Canadian weather dataset are used to generate +# the irregular data. +# We use a Fourier basis due to the periodic nature of the data. fd_temperatures = fetch_weather().data.coordinates[0] basis = FourierBasis(n_basis=5, domain_range=fd_temperatures.domain_range) -fd_temperatures.plot() -plt.show() -basis.plot() +# %% +# We plot the original data and the basis functions. 
+fig = plt.figure(figsize=(10, 4)) + +axes = plt.subplot(1, 2, 1) +fd_temperatures.plot(axes=axes) +ylim = axes.get_ylim() +xlabel = axes.get_xlabel() +plt.title(fd_temperatures.dataset_name) + +axes = plt.subplot(1, 2, 2) +basis.plot(axes=axes) +axes.set_xlabel(xlabel) plt.title("Basis functions") + +plt.suptitle("") plt.show() # %% @@ -52,7 +68,7 @@ # curve, by removing measurement points from the previous dataset iteratively. train_irregular_list = [train_original] test_irregular_list = [test_original] -n_points_list = [40, 10, 7, 5, 4, 3] +n_points_list = [365, 40, 10, 7, 5, 4, 3] for n_points in n_points_list: train_irregular_list.append( irregular_sample( @@ -138,7 +154,6 @@ # Finally, we have the scores for the train and test sets with decreasing # number of measurement points per curve. for score_name in scores.keys(): - print("-" * 62) print(f"{score_name} scores:") print("-" * 62) print(( @@ -146,46 +161,112 @@ .set_index("n_points_per_curve").sort_index().to_string() ), end="\n\n\n") + # %% -# The following plots show the original curves along with the converted -# test curves for the conversions with 5, 4 and 3 points per curve. 
+# Plot the scores: +for score_name in scores.keys(): + df = ( + pd.DataFrame(scores[score_name]) + .sort_values("n_points_per_curve") + .set_index("n_points_per_curve") + ) + fig = plt.figure() + k = len(df) + plt.plot( + df.index[:k], + df["Train-sparse"][:k], + # fig=fig, + label=r"Fit $\mathcal{D}_{train}^{\ j}$; transform $\mathcal{D}_{train}^{\ j}$", + marker=".", + ) + plt.plot( + df.index[:k], + df["Test-sparse"][:k], + # fig=fig, + label=r"Fit $\mathcal{D}_{train}^{\ j}$; transform $\mathcal{D}_{test}^{\ j}$", + marker=".", + ) + plt.plot( + df.index[:k], + df["Test-original"][:k], + # fig=fig, + label=r"Fit $\mathcal{D}_{train}^{\ j}$; transform $\mathcal{D}_{test}^{\ 0}$", + marker=".", + ) + if score_name == "MSE": + plt.yscale("log") + plt.ylabel(f"${score_name}$ score (logscale)") + eps_name = "05-plot-mse.eps" + else: + plt.ylabel(f"${score_name}$ score") + eps_name = "05-plot-r2.eps" + plt.xscale("log") + plt.xlabel(r"Measurements per function (logscale)") + plt.legend() + # fig.savefig( + # f"plots/{eps_name}", + # format="eps", + # bbox_inches="tight", + # ) + plt.plot() -def plot_converted_test_curves(n_points_per_curve): - plt.figure(figsize=(10, 23)) - for k in range(7): - axes = plt.subplot(7, 1, k + 1) +# %% +# Show the original curves along with the converted +# test curves for the conversions with 7, 5, 4 and 3 points per curve. 
+def plot_curve(k): + plt.figure(figsize=(8, 8)) + i = 0 + for n_points_per_curve in n_points_list[3:]: + axes = plt.subplot(2, 2, i + 1) + i += 1 test_irregular_datasets[n_points_per_curve][k].scatter( - axes=axes, color=f"C{k}", + axes=axes, color="C0", + ) + fd_temperatures.mean().plot( + axes=axes, color=[0.4] * 3, label="Original dataset mean", + ) + fd_temperatures.plot( + axes=axes, color=[0.7] * 3, linewidth=0.2, ) test_original[k].plot( - axes=axes, color=f"C{k}", linewidth=0.65, - label="Original test curve", + axes=axes, color="C0", linewidth=0.65, label="Original test curve", ) converted_data["Test-sparse"][n_points_per_curve][k].plot( - axes=axes, color=f"C{k}", linestyle="--", - label=f"Test curve transformed from {n_points_per_curve} points", - ) - converted_data["Test-original"][n_points_per_curve][k].plot( - axes=axes, color=f"C{k}", alpha=0.5, - label="Test curve transformed from original 365 points", + axes=axes, + color="C0", + linestyle="--", + label=f"Test curve transformed", ) - axes.legend(bbox_to_anchor=(1., 1.)) plt.tight_layout(rect=[0, 0, 1, 0.98]) - plt.suptitle(f"Fitted model with {n_points_per_curve=}") + plt.title(f"Transform of test curves with {n_points_per_curve} points") + plt.ylim(ylim) + + plt.suptitle( + "Evolution of the conversion of a curve with decreasing measurements " + f"({test_original.sample_names[k]} station)" + ) + + # # Add legend: + handles, labels = plt.gca().get_legend_handles_labels() + plt.legend( + handles=handles, + loc="lower center", + ncols=3, + bbox_to_anchor=(-.1, -0.3), + ) + plt.tight_layout(pad=10) plt.show() -# %% -plot_converted_test_curves(n_points_per_curve=5) +# Plot two of the curves: +plot_curve(7) # %% -plot_converted_test_curves(n_points_per_curve=4) +plot_curve(8) -# %% -plot_converted_test_curves(n_points_per_curve=3) # %% # References From 31d69f23cfad15f8e0cc93f6562c98fae578820f Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sun, 9 Jun 2024 13:08:07 +0200 Subject: [PATCH 33/48] 
decimation example --- ...plot_irregular_mixed_effects_robustness.py | 81 +++++++++---------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/examples/plot_irregular_mixed_effects_robustness.py b/examples/plot_irregular_mixed_effects_robustness.py index 8c765ec66..64170f91d 100644 --- a/examples/plot_irregular_mixed_effects_robustness.py +++ b/examples/plot_irregular_mixed_effects_robustness.py @@ -1,6 +1,5 @@ """ -Mixed effects model for irregular data: robustness of the conversion by -decimation +Mixed effects model for irregular data: robustness of the conversion by decimation ======================================================================= This example converts irregular data to a basis representation using a mixed @@ -158,70 +157,61 @@ print("-" * 62) print(( pd.DataFrame(scores[score_name]) - .set_index("n_points_per_curve").sort_index().to_string() + .round(3).set_index("n_points_per_curve").sort_index() ), end="\n\n\n") # %% -# Plot the scores: -for score_name in scores.keys(): +# Plot the scores. 
+plt.figure(figsize=(12, 5)) +for i, (score_name, values) in enumerate(scores.items()): df = ( - pd.DataFrame(scores[score_name]) - .sort_values("n_points_per_curve") - .set_index("n_points_per_curve") + pd.DataFrame(values) + .sort_values("n_points_per_curve").set_index("n_points_per_curve") ) - fig = plt.figure() - k = len(df) + plt.subplot(1, 2, i + 1) + label_start = r"Fit $\mathcal{D}_{train}^{\ j}$; " plt.plot( - df.index[:k], - df["Train-sparse"][:k], - # fig=fig, - label=r"Fit $\mathcal{D}_{train}^{\ j}$; transform $\mathcal{D}_{train}^{\ j}$", + df.index, + df["Train-sparse"], + label=label_start + r"ransform $\mathcal{D}_{train}^{\ j}$", marker=".", ) plt.plot( - df.index[:k], - df["Test-sparse"][:k], - # fig=fig, - label=r"Fit $\mathcal{D}_{train}^{\ j}$; transform $\mathcal{D}_{test}^{\ j}$", + df.index, + df["Test-sparse"], + label=label_start + r"transform $\mathcal{D}_{test}^{\ j}$", marker=".", ) plt.plot( - df.index[:k], - df["Test-original"][:k], - # fig=fig, - label=r"Fit $\mathcal{D}_{train}^{\ j}$; transform $\mathcal{D}_{test}^{\ 0}$", + df.index, + df["Test-original"], + label=label_start + r"transform $\mathcal{D}_{test}^{\ 0}$", marker=".", ) if score_name == "MSE": plt.yscale("log") plt.ylabel(f"${score_name}$ score (logscale)") - eps_name = "05-plot-mse.eps" else: plt.ylabel(f"${score_name}$ score") - eps_name = "05-plot-r2.eps" + plt.xscale("log") plt.xlabel(r"Measurements per function (logscale)") plt.legend() - # fig.savefig( - # f"plots/{eps_name}", - # format="eps", - # bbox_inches="tight", - # ) plt.plot() # %% # Show the original curves along with the converted # test curves for the conversions with 7, 5, 4 and 3 points per curve. 
-def plot_curve(k): - plt.figure(figsize=(8, 8)) +def plot_conversion_evolution(index: int): + plt.figure(figsize=(8, 8.5)) i = 0 for n_points_per_curve in n_points_list[3:]: axes = plt.subplot(2, 2, i + 1) i += 1 - test_irregular_datasets[n_points_per_curve][k].scatter( + test_irregular_datasets[n_points_per_curve][index].scatter( axes=axes, color="C0", ) fd_temperatures.mean().plot( @@ -230,42 +220,51 @@ def plot_curve(k): fd_temperatures.plot( axes=axes, color=[0.7] * 3, linewidth=0.2, ) - test_original[k].plot( + test_original[index].plot( axes=axes, color="C0", linewidth=0.65, label="Original test curve", ) - converted_data["Test-sparse"][n_points_per_curve][k].plot( + converted_data["Test-sparse"][n_points_per_curve][index].plot( axes=axes, color="C0", linestyle="--", label=f"Test curve transformed", ) - plt.tight_layout(rect=[0, 0, 1, 0.98]) plt.title(f"Transform of test curves with {n_points_per_curve} points") plt.ylim(ylim) plt.suptitle( "Evolution of the conversion of a curve with decreasing measurements " - f"({test_original.sample_names[k]} station)" + f"({test_original.sample_names[index]} station)" ) - # # Add legend: + # Add common legend at the bottom: handles, labels = plt.gca().get_legend_handles_labels() + plt.tight_layout(h_pad=0, rect=[0, 0.1, 1, 1]) plt.legend( handles=handles, loc="lower center", ncols=3, bbox_to_anchor=(-.1, -0.3), ) - plt.tight_layout(pad=10) plt.show() -# Plot two of the curves: -plot_curve(7) +# %% +# Toronto station's temperature curve conversion evolution. +plot_conversion_evolution(7) + +# %% +# Iqaluit station's temperature curve conversion evolution. +plot_conversion_evolution(8) # %% -plot_curve(8) +# As can be seen in the figures, the fewer the measurements, the closer +# the converted curve is to the mean of the original dataset. 
+# This leads us to believe that when the amount of measurements is too low, +# the mixed-effects model is able to capture the general trend of the data, +# but it is not able to properly capture the individual variation of each +# curve. # %% From 6c5e62254b5b88ee3cee7b564ac809407da499bb Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sun, 9 Jun 2024 13:10:25 +0200 Subject: [PATCH 34/48] decimation example --- examples/plot_irregular_mixed_effects_robustness.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/plot_irregular_mixed_effects_robustness.py b/examples/plot_irregular_mixed_effects_robustness.py index 64170f91d..3eecbf3ec 100644 --- a/examples/plot_irregular_mixed_effects_robustness.py +++ b/examples/plot_irregular_mixed_effects_robustness.py @@ -6,7 +6,6 @@ effects model and checks the robustness of the method by fitting the model with decreasing number of measurement points per curve. """ -# %% # Author: Pablo Cuesta Sierra # License: MIT @@ -252,11 +251,11 @@ def plot_conversion_evolution(index: int): # %% # Toronto station's temperature curve conversion evolution. -plot_conversion_evolution(7) +plot_conversion_evolution(index=7) # %% # Iqaluit station's temperature curve conversion evolution. 
-plot_conversion_evolution(8) +plot_conversion_evolution(index=8) # %% # As can be seen in the figures, the fewer the measurements, the closer From 16c513af5a6927b91fa0015c39a9555d9917258d Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sun, 9 Jun 2024 20:14:10 +0200 Subject: [PATCH 35/48] decimation example --- ...plot_irregular_mixed_effects_robustness.py | 93 ++++++------------- 1 file changed, 30 insertions(+), 63 deletions(-) diff --git a/examples/plot_irregular_mixed_effects_robustness.py b/examples/plot_irregular_mixed_effects_robustness.py index 3eecbf3ec..1c2defaf7 100644 --- a/examples/plot_irregular_mixed_effects_robustness.py +++ b/examples/plot_irregular_mixed_effects_robustness.py @@ -64,37 +64,16 @@ # %% # Then, we create datasets with decreasing number of measurement points per # curve, by removing measurement points from the previous dataset iteratively. -train_irregular_list = [train_original] -test_irregular_list = [test_original] n_points_list = [365, 40, 10, 7, 5, 4, 3] +train_irregular_datasets = {} +test_irregular_datasets = {} +current_train = train_original +current_test = test_original for n_points in n_points_list: - train_irregular_list.append( - irregular_sample( - train_irregular_list[-1], - n_points_per_curve=n_points, - random_state=random_state, - ), - ) - test_irregular_list.append( - irregular_sample( - test_irregular_list[-1], - n_points_per_curve=n_points, - random_state=random_state, - ), - ) - -train_irregular_datasets = { - n_points: train_irregular - for n_points, train_irregular in zip( - n_points_list, train_irregular_list[1:], - ) -} -test_irregular_datasets = { - n_points: test_irregular - for n_points, test_irregular in zip( - n_points_list, test_irregular_list[1:], - ) -} + current_train = irregular_sample(current_train, n_points, random_state) + current_test = irregular_sample(current_test, n_points, random_state) + train_irregular_datasets[n_points] = current_train + test_irregular_datasets[n_points] = 
current_test # %% # We convert the irregular data to basis representation and compute the scores. @@ -102,21 +81,14 @@ # the converter with a train set that has :math:`k` points per curve, we # use it to transform that train set, the test set with :math:`k` points per # curve and the original test set with 365 points per curve. -score_functions = { - "R^2": r2_score, - "MSE": mean_squared_error, -} -converted_data = { - "Train-sparse": {}, - "Test-sparse": {}, - "Test-original": {}, -} +score_functions = {"R^2": r2_score, "MSE": mean_squared_error} +converted_data = {"Train-sparse": {}, "Test-sparse": {}, "Test-original": {}} scores = { score_name: { "n_points_per_curve": n_points_list, - **{data_name: [] for data_name in converted_data.keys()}, + **{data_name: [] for data_name in converted_data}, } - for score_name in score_functions.keys() + for score_name in score_functions } converter = EMMixedEffectsConverter(basis) for n_points, train_irregular, test_irregular in zip( @@ -125,28 +97,23 @@ test_irregular_datasets.values(), ): converter = converter.fit(train_irregular) - train_sparse_converted = converter.transform(train_irregular) - test_sparse_converted = converter.transform(test_irregular) - test_original_converted = converter.transform( - FDataIrregular.from_fdatagrid(test_original), - ) - converted_data["Train-sparse"][n_points] = train_sparse_converted - converted_data["Test-sparse"][n_points] = test_sparse_converted - converted_data["Test-original"][n_points] = test_original_converted - + transformed = { + "Train-sparse": converter.transform(train_irregular), + "Test-sparse": converter.transform(test_irregular), + "Test-original": converter.transform( + FDataIrregular.from_fdatagrid(test_original), + ), + } + # Store the converted data + for key, data in transformed.items(): + converted_data[key][n_points] = data + # Calculate and store the scores for score_name, score_fun in score_functions.items(): - scores[score_name]["Train-sparse"].append(score_fun( 
- train_original, - train_sparse_converted.to_grid(train_original.grid_points), - )) - scores[score_name]["Test-sparse"].append(score_fun( - test_original, - test_sparse_converted.to_grid(test_original.grid_points), - )) - scores[score_name]["Test-original"].append(score_fun( - test_original, - test_original_converted.to_grid(test_original.grid_points), - )) + for key in converted_data: + scores[score_name][key].append(score_fun( + test_original if "Test" in key else train_original, + transformed[key].to_grid(test_original.grid_points), + )) # %% # Finally, we have the scores for the train and test sets with decreasing @@ -250,11 +217,11 @@ def plot_conversion_evolution(index: int): # %% -# Toronto station's temperature curve conversion evolution. +# Toronto station's temperature curve conversion evolution: plot_conversion_evolution(index=7) # %% -# Iqaluit station's temperature curve conversion evolution. +# Iqaluit station's temperature curve conversion evolution: plot_conversion_evolution(index=8) # %% From 9e1a56677c31c4e8a0d90f47f485b98fa0f23922 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Tue, 11 Jun 2024 22:18:50 +0200 Subject: [PATCH 36/48] remove extra example --- examples/plot_fdatairregular_to_basis.py | 85 ------------------- .../plot_irregular_to_basis_mixed_effects.py | 1 + 2 files changed, 1 insertion(+), 85 deletions(-) delete mode 100644 examples/plot_fdatairregular_to_basis.py diff --git a/examples/plot_fdatairregular_to_basis.py b/examples/plot_fdatairregular_to_basis.py deleted file mode 100644 index ee3f8eef0..000000000 --- a/examples/plot_fdatairregular_to_basis.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Irregular data to basis representation -======================================================================= - -Convert irregular data to a basis representation using the ``to_basis`` -method of the :class:`skfda.representation.irregular.FDataIrregular` class. 
-""" -# Author: Pablo Cuesta Sierra -# License: MIT - -# sphinx_gallery_thumbnail_number = -1 - -import matplotlib.pyplot as plt -import numpy as np - -from skfda.datasets import fetch_weather, irregular_sample -from skfda.representation.basis import FourierBasis -from skfda.misc.scoring import r2_score - -# %% -# First, the Canadian Weather dataset is downloaded from the package 'fda' in -# CRAN. It contains a FDataGrid with daily temperatures and precipitations, -# that is, it has a 2-dimensional image. We are interested only in the daily -# average temperatures, so we will use the first coordinate. -# -# As we want to ilustrate the conversion of irregular data to basis, -# representation, we will take an irregular sample of the temperatures dataset -# containing only 8 points per curve. -fd_temperatures = fetch_weather().data.coordinates[0] -random_state = np.random.RandomState(seed=4934755) -irregular_temperatures = irregular_sample( - fdata=fd_temperatures, n_points_per_curve=8, random_state=random_state, -) - -# %% -# To get an idea of the irregular data we will be working with, 6 of the -# irregular curves are plotted, along with the original curves -# that they come from. -fig = plt.figure() -irregular_temperatures[-6:].scatter(fig=fig) -fd_temperatures[-6:].plot(fig=fig, alpha=0.1) -plt.show() - -# %% -# Now, we will convert the irregularly sampled temperature curves to basis -# representation. Due to the periodicity of the data, we will be using a -# Fourier basis. -basis = FourierBasis(n_basis=5, domain_range=fd_temperatures.domain_range) -irregular_temperatures_converted = irregular_temperatures.to_basis( - basis, conversion_type="mixed_effects", -) - -# %% -# To visualize the conversion, we will now plot 6 of the converted -# curves (smooth basis representation) along with the original temperatures -# (non-smooth) and the irregular points that we sampled. 
-fig = plt.figure(figsize=(10, 14)) -for k in range(6): - axes = plt.subplot(3, 2, k + 1) - fd_temperatures.plot(axes=axes, alpha=0.05, color="black") - fd_temperatures[k].plot(axes=axes, color=f"C{k}") - irregular_temperatures_converted[k].plot(axes=axes, color=f"C{k}") - irregular_temperatures[k].scatter(axes=axes, color=f"C{k}") -plt.show() - -# %% -# Finally, we will get a score of the quality of the conversion by comparing -# the obtained basis representation (``irregular_temperatures_converted``) -# with the original data (``fd_temperatures``) from the CRAN dataset. We will -# be using the :func:`skfda.misc.scoring.r2_score`. -# -# Note that, to compare the original data and the basis representation (which -# have different :class:`FData` types), we have to evaluate the latter at -# the grid points of the former. -r2 = r2_score( - fd_temperatures, - irregular_temperatures_converted.to_grid(fd_temperatures.grid_points), -) -print(f"R2 score: {r2:.2f}") - -# %% -# References -# ---------- -# -# .. 
footbibliography:: diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py index fb0bd8b52..74389bee6 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -55,6 +55,7 @@ # Plot the basis functions used to generate the data basis.plot() plt.title("Basis functions") +plt.show() # %% # Plot some of the generated curves From 509acd4882ba3d2040c9a4ba8718ebbdf91fa1b1 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Tue, 11 Jun 2024 23:42:36 +0200 Subject: [PATCH 37/48] adapting mixed effects for higher dimensions --- .../conversion/_mixed_effects.py | 59 ++++++++++++++++--- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index 7abb2c2c3..0cb0c06af 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -49,7 +49,7 @@ _EM_MINIMIZATION_METHODS = [ "params", "squared-error", - "loglikelihood" + "loglikelihood", ] @@ -62,7 +62,9 @@ def _get_values_list( fdatairregular: Irregular data. Returns: - List of values vectors (one vector per functional datum). + List of values vectors (one vector per functional datum). If the + codomain is multidimensional, the vectors are flattened so that each + measurement's values are contiguous. Examples: >>> fdata = FDataIrregular( @@ -72,12 +74,17 @@ def _get_values_list( ... ) >>> _get_values_list(fdata) [array([1]), array([2, 3, 4, 5]), array([6, 7, 8, 9])] + >>> fdata_multidim = FDataIrregular( + ... start_indices=[0, 1, 3], + ... values=np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]), + ... points=list(zip(range(5), range(5))), + ... 
) + >>> _get_values_list(fdata_multidim) + [array([1, 2]), array([3, 4, 5, 6]), array([ 7, 8, 9, 10])] """ - assert fdatairregular.dim_domain == 1 - assert fdatairregular.dim_codomain == 1 return np.split( fdatairregular.values.reshape(-1), - fdatairregular.start_indices[1:], + fdatairregular.start_indices[1:] * fdatairregular.dim_codomain, ) @@ -97,12 +104,46 @@ def _get_basis_evaluations_list( of the functional datum and n_basis is the number of basis functions. The i-th row of the matrix is the evaluation of the basis functions at the i-th point of the functional datum. + + Examples: + >>> from skfda.representation.basis import ( + ... MonomialBasis, VectorValuedBasis, + ... ) + >>> basis = MonomialBasis(n_basis=2) + >>> fdata = FDataIrregular( + ... start_indices=[0, 1, 5], + ... values=list(range(7)), + ... points=list(range(7)), + ... ) + >>> _get_basis_evaluations_list(fdata, basis) + [array([[1, 0]]), array([[1, 1], + [1, 2], + [1, 3], + [1, 4]]), array([[1, 5], + [1, 6]])] + >>> monomial_2 = MonomialBasis(n_basis=2, domain_range=(0, 10)) + >>> monomial_3 = MonomialBasis(n_basis=3, domain_range=(0, 10)) + >>> vector_basis = VectorValuedBasis([monomial_2, monomial_3]) + >>> fdata = FDataIrregular( + ... start_indices=[0, 1, 4], + ... values=list(zip(range(6), range(6))), + ... points=list(range(6)), + ... 
) + >>> _get_basis_evaluations_list(fdata, vector_basis) + [array([[ 1., 0., 0., 0., 0.], + [ 0., 0., 1., 0., 0.]]), array([[ 1., 1., 0., 0., 0.], + [ 0., 0., 1., 1., 1.], + [ 1., 2., 0., 0., 0.], + [ 0., 0., 1., 2., 4.], + [ 1., 3., 0., 0., 0.], + [ 0., 0., 1., 3., 9.]]), array([[ 1., 4., 0., 0., 0.], + [ 0., 0., 1., 4., 16.], + [ 1., 5., 0., 0., 0.], + [ 0., 0., 1., 5., 25.]])] """ - assert fdatairregular.dim_domain == 1 - assert fdatairregular.dim_codomain == 1 return np.split( - basis(fdatairregular.points)[:, :, 0].T, - fdatairregular.start_indices[1:], + basis(fdatairregular.points).reshape(basis.n_basis, -1).T, + fdatairregular.start_indices[1:] * fdatairregular.dim_codomain, ) From 9e4a612174ec01a28ea3a161765fb8f28b51dc5b Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Wed, 12 Jun 2024 12:12:18 +0200 Subject: [PATCH 38/48] isort --- skfda/representation/conversion/_mixed_effects.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index 0cb0c06af..e0dd86e16 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -11,12 +11,7 @@ from abc import ABC from dataclasses import dataclass -from typing import ( - Callable, - List, - Literal, - Protocol, -) +from typing import Callable, List, Literal, Protocol import numpy as np import scipy @@ -28,7 +23,6 @@ from ...typing._numpy import NDArrayFloat from ._to_basis import _ToBasisConverter - _SCIPY_MINIMIZATION_METHODS = [ "BFGS", # no hessian "Powell", # no jacobian From f442a58733e7657cb4bad8af525228d9dd60e0cc Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Wed, 12 Jun 2024 13:11:47 +0200 Subject: [PATCH 39/48] test sample_irregular --- skfda/datasets/_sample_from_fdata.py | 2 +- skfda/tests/_test_vector_mixed_effects.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) create mode 
100644 skfda/tests/_test_vector_mixed_effects.py diff --git a/skfda/datasets/_sample_from_fdata.py b/skfda/datasets/_sample_from_fdata.py index c685640d6..3597a0cc0 100644 --- a/skfda/datasets/_sample_from_fdata.py +++ b/skfda/datasets/_sample_from_fdata.py @@ -27,7 +27,7 @@ def irregular_sample( an FDataGrid or an FDataIrregular and a sample has less points than specified in n_points_per_curve, the sample will have the same number of points as before. - + random_state: Random state to control the random number generation. """ if fdata.dim_domain != 1 or fdata.dim_codomain != 1: raise NotImplementedError( diff --git a/skfda/tests/_test_vector_mixed_effects.py b/skfda/tests/_test_vector_mixed_effects.py new file mode 100644 index 000000000..a4f4a6a2e --- /dev/null +++ b/skfda/tests/_test_vector_mixed_effects.py @@ -0,0 +1,19 @@ +# %% +import numpy as np +from skfda.representation.basis import MonomialBasis +from skfda.representation.basis import VectorValuedBasis +import matplotlib.pyplot as plt +# %% +m2 = MonomialBasis(n_basis=2, domain_range=(0, 10)) +m3 = MonomialBasis(n_basis=3, domain_range=(0, 10)) +m3.plot() +plt.show() + +# %% +# m2.plot() +vbasis = VectorValuedBasis([m2, m3]) +vbasis.plot() +plt.show() + +# %% +vbasis(2) \ No newline at end of file From 84e92d0f5be8d2391a3680842c012106f4b2adf4 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Wed, 12 Jun 2024 14:50:32 +0200 Subject: [PATCH 40/48] irregular_sample for multidimensional datasets (domain and codomain) --- skfda/datasets/_sample_from_fdata.py | 104 +++++++++------- skfda/tests/test_sample_from_fdata.py | 168 ++++++++++++++++++++++++++ 2 files changed, 226 insertions(+), 46 deletions(-) create mode 100644 skfda/tests/test_sample_from_fdata.py diff --git a/skfda/datasets/_sample_from_fdata.py b/skfda/datasets/_sample_from_fdata.py index 3597a0cc0..e7ef587fa 100644 --- a/skfda/datasets/_sample_from_fdata.py +++ b/skfda/datasets/_sample_from_fdata.py @@ -1,10 +1,11 @@ from __future__ import 
annotations -from typing import List, Tuple from functools import singledispatch -import numpy as np +from typing import List, Tuple +import numpy as np +from .._utils import _cartesian_product, _to_grid_points from ..misc.validation import validate_random_state from ..representation import FDataBasis, FDataGrid, FDataIrregular from ..typing._base import RandomState, RandomStateLike @@ -18,8 +19,12 @@ def irregular_sample( ) -> FDataIrregular: """Irregularly sample from a FDataGrid or FDataBasis object. - Only implemented for 1D domains and codomains. The points are selected at - random (uniformly) from the domain of the input object. + The points are selected at random (uniformly) from the domain of the input + object. + If the input is an FDataGrid or an FDataIrregular, the points are selected + uniformly from the finite grid points of the object. If the input is an + FDataBasis, the points are selected from the rectangular domain of the + with a uniform (continuous) distribution. Args: fdata: Functional data object to sample from. @@ -29,49 +34,53 @@ def irregular_sample( of points as before. random_state: Random state to control the random number generation. 
""" - if fdata.dim_domain != 1 or fdata.dim_codomain != 1: - raise NotImplementedError( - "Only implemented for 1D domains and codomains.", - ) - random_state = validate_random_state(random_state) if isinstance(n_points_per_curve, int): n_points_per_curve = np.full(fdata.n_samples, n_points_per_curve) - points_list, n_points_per_curve = ( + points_list, start_indices = ( _irregular_sample_points_list( fdata, n_points_per_curve=n_points_per_curve, random_state=random_state, ) ) - values = np.concatenate([ - func(func_points).reshape(-1) - for func, func_points in zip(fdata, points_list) - ]) + return FDataIrregular( points=np.concatenate(points_list), - start_indices=np.cumsum( - np.concatenate([ - np.zeros(1, dtype=int), - n_points_per_curve[:-1], - ]), - ), + start_indices=start_indices, values=np.concatenate([ - func(func_points).reshape(-1) + func(func_points)[0, :, :] for func, func_points in zip(fdata, points_list) ]), ) +def _start_indices(n_points_per_curve: NDArrayInt) -> NDArrayInt: + return np.cumsum( + np.concatenate([ + np.zeros(1, dtype=int), + n_points_per_curve[:-1], + ]), + ) + + @singledispatch def _irregular_sample_points_list( fdata: FDataBasis | FDataGrid | FDataIrregular, n_points_per_curve: NDArrayInt, random_state: RandomState, ) -> Tuple[List[NDArrayFloat], NDArrayInt]: + """Return a list of points and the start indices for each curve. + + The points are selected at random (uniformly) from the domain of the input. + + Returns: + points_list: List of points for each curve. + start_indices: Start indices for each curve. 
+ """ raise NotImplementedError( - "Only implemented for FDataGrid and FDataBasis.", + "Only implemented for FDataBasis, FDataGrid and FDataIrregular.", ) @@ -80,20 +89,21 @@ def _irregular_sample_points_matrix_fdatagrid( fdata: FDataGrid, n_points_per_curve: NDArrayInt, random_state: RandomState, -) -> List[NDArrayFloat]: - # This only works for 1D domains +) -> Tuple[List[NDArrayFloat], NDArrayInt]: + all_points_single_function = _cartesian_product( + _to_grid_points(fdata.grid_points), + ) + flat_points = np.tile( + all_points_single_function, (fdata.n_samples, 1), + ) n_points_per_curve = np.minimum( n_points_per_curve, - len(fdata.grid_points[0]), + len(flat_points), ) return [ - random_state.choice( - fdata.grid_points[0].reshape(-1), - size=n_points, - replace=False, - ) + random_state.permutation(flat_points)[:n_points] for n_points in n_points_per_curve - ], n_points_per_curve + ], _start_indices(n_points_per_curve) @_irregular_sample_points_list.register @@ -101,8 +111,7 @@ def _irregular_sample_points_matrix_fdatairregular( fdata: FDataIrregular, n_points_per_curve: NDArrayInt, random_state: RandomState, -) -> List[NDArrayFloat]: - # This only works for 1D domains +) -> Tuple[List[NDArrayFloat], NDArrayInt]: original_n_points_per_curve = np.diff( np.concatenate([fdata.start_indices, [len(fdata.points)]]), ) @@ -111,16 +120,14 @@ def _irregular_sample_points_matrix_fdatairregular( original_n_points_per_curve, ) return [ - random_state.choice( - curve_points.reshape(-1), - size=min(n_points, len(curve_points)), - replace=False, - ) + random_state.permutation(curve_points)[ + :min(n_points, len(curve_points)), + ] for n_points, curve_points in zip( n_points_per_curve, np.split(fdata.points, fdata.start_indices[1:]), ) - ], n_points_per_curve + ], _start_indices(n_points_per_curve) @_irregular_sample_points_list.register @@ -128,12 +135,17 @@ def _irregular_sample_points_matrix_fdatabasis( fdata: FDataBasis, n_points_per_curve: NDArrayInt, random_state: 
RandomState, -) -> List[NDArrayFloat]: - # This only works for 1D domains - return [ +) -> Tuple[List[NDArrayFloat], NDArrayInt]: + len_points = np.sum(n_points_per_curve) + separate_coordinate_points = [ random_state.uniform( - *fdata.domain_range[0], - size=(n_points), + *domain_range_coordinate, + size=(len_points), ) - for n_points in n_points_per_curve - ], n_points_per_curve + for domain_range_coordinate in fdata.domain_range + ] + start_indices = _start_indices(n_points_per_curve) + points = np.stack( + separate_coordinate_points, axis=1, + ) + return np.split(points, start_indices[1:]), start_indices diff --git a/skfda/tests/test_sample_from_fdata.py b/skfda/tests/test_sample_from_fdata.py new file mode 100644 index 000000000..5e351d299 --- /dev/null +++ b/skfda/tests/test_sample_from_fdata.py @@ -0,0 +1,168 @@ +from typing import Any + +import numpy as np +import pytest + +from skfda import FData, FDataBasis, FDataGrid, FDataIrregular +from skfda.datasets import irregular_sample +from skfda.representation.basis import ( + FourierBasis, + MonomialBasis, + TensorBasis, + VectorValuedBasis, +) + +random_state = np.random.RandomState(23486974) + + +def _assert_equivalent(fdata: FData, fdatairregular: FDataIrregular) -> None: + points = np.split(fdatairregular.points, fdatairregular.start_indices[1:]) + assert len(points) == len(fdatairregular) == len(fdata) + # for fun_points, original, irregular in zip(points, fdata, fdatairregular): + for i, (fun_points, original, irregular) in enumerate(zip(points, fdata, fdatairregular)): + try: + np.testing.assert_allclose( + irregular.values, original(fun_points)[0], + # irregular(fun_points), original(fun_points), + ) + except AssertionError: + print(f"{i=}") + print(f"{fun_points=}") + print(f"{original=}") + print(f"{irregular=}") + print() + print(f"{original(fun_points)=}") + print(f"{irregular.values=}") + raise AssertionError() + + +@pytest.fixture +def fdatabasis_1dimensional() -> FDataBasis: + basis = 
MonomialBasis(n_basis=4, domain_range=(0, 1)) + return FDataBasis( + basis=basis, + coefficients=random_state.randn(15, basis.n_basis), + ) + + +@pytest.fixture +def fdatabasis_multidimensional() -> FDataBasis: + """3-dimensional domain and 2-dimensional codomain""" + basis_momonial1 = MonomialBasis(n_basis=3, domain_range=(-3, 3)) + basis_fourier1 = FourierBasis(n_basis=3, domain_range=(-3, 3)) + basis_monomial2 = MonomialBasis(n_basis=2, domain_range=(0, 1)) + basis_fourier2 = FourierBasis(n_basis=5, domain_range=(0, 1)) + + tensor_basis1 = TensorBasis([basis_momonial1, basis_monomial2]) + tensor_basis2 = TensorBasis([basis_fourier1, basis_fourier2]) + + basis = VectorValuedBasis([tensor_basis1, tensor_basis2, tensor_basis1]) + return FDataBasis( + basis=basis, + coefficients=random_state.randn(15, basis.n_basis), + ) + + +@pytest.fixture +def fdatabasis_2dimensional_domain() -> FDataBasis: + basis_fourier = FourierBasis(n_basis=5, domain_range=(-3, 3)) + basis_monomial = MonomialBasis(n_basis=4, domain_range=(0, 1)) + + basis = TensorBasis([basis_fourier, basis_monomial]) + # import matplotlib.pyplot as plt + # basis.plot() + # plt.show() + return FDataBasis( + basis=basis, + coefficients=random_state.randn(15, basis.n_basis), + ) + + +@pytest.fixture +def fdatagrid_1dimensional() -> FDataGrid: + return FDataGrid( + data_matrix=random_state.randn(14, 50), + grid_points=np.linspace(0, 100, 50), + ) + + +@pytest.fixture +def fdatagrid_multidimensional() -> FDataGrid: + return FDataGrid( + data_matrix=random_state.randn(14, 10, 5, 7, 5), + grid_points=[ + np.linspace(0, 100, 10), + np.linspace(-20, 20, 5), + np.linspace(-20, 20, 7), + ], + ) + + +@pytest.fixture +def fdatairregular_1dimensional() -> FDataIrregular: + start_indices = np.concatenate([ + [0], np.cumsum(random_state.randint(2, 5, 17)), + ]) + return FDataIrregular( + points=random_state.randn(100), + values=random_state.randn(100), + start_indices=start_indices, + ) + + +@pytest.fixture +def 
fdatairregular_multidimensional() -> FDataIrregular: + start_indices = np.concatenate([ + [0], np.cumsum(random_state.randint(2, 5, 17)), + ]) + return FDataIrregular( + points=random_state.randn(100, 1), # TODO: change to multidimensional + # domain when evaluation of FDataIrregular is working for + # multidimensional domain + values=random_state.randn(100, 5), + start_indices=start_indices, + ) + + +@pytest.mark.parametrize( + "fdata_fixture", + [ + "fdatabasis_1dimensional", + "fdatagrid_1dimensional", + "fdatairregular_1dimensional", + "fdatabasis_2dimensional_domain", + "fdatabasis_multidimensional", + "fdatagrid_multidimensional", + "fdatairregular_multidimensional", + ], +) +def test_irregular_sample( + fdata_fixture: str, request: Any +) -> None: + fdata: FDataBasis | FDataGrid | FDataIrregular = ( + request.getfixturevalue(fdata_fixture) + ) + n_points_per_curve = random_state.randint(1, 15, fdata.n_samples) + fdatairregular = irregular_sample( + fdata, + n_points_per_curve=n_points_per_curve, + random_state=random_state, + ) + + got_points_per_curve = np.diff( + np.append(fdatairregular.start_indices, [len(fdatairregular.points)]), + ) + if isinstance(fdata, FDataBasis): + assert all(got_points_per_curve == n_points_per_curve) + else: + assert all(got_points_per_curve <= n_points_per_curve) + + assert fdatairregular.values.shape == ( + sum(got_points_per_curve), fdata.dim_codomain, + ) + + # The values of the irregular sample should not contain NaNs + # because the original datasets do not contain NaNs in their values + assert np.sum(np.isnan(fdatairregular.values)) == 0 + + _assert_equivalent(fdata, fdatairregular) From 704d254f3676de8b436ef45bceee4780db7978c7 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Wed, 12 Jun 2024 19:02:09 +0200 Subject: [PATCH 41/48] comment and clean the tests for irregular_sample --- skfda/tests/test_sample_from_fdata.py | 32 +++++++++------------------ 1 file changed, 10 insertions(+), 22 deletions(-) diff --git 
a/skfda/tests/test_sample_from_fdata.py b/skfda/tests/test_sample_from_fdata.py index 5e351d299..8bb6dde1c 100644 --- a/skfda/tests/test_sample_from_fdata.py +++ b/skfda/tests/test_sample_from_fdata.py @@ -18,22 +18,13 @@ def _assert_equivalent(fdata: FData, fdatairregular: FDataIrregular) -> None: points = np.split(fdatairregular.points, fdatairregular.start_indices[1:]) assert len(points) == len(fdatairregular) == len(fdata) - # for fun_points, original, irregular in zip(points, fdata, fdatairregular): - for i, (fun_points, original, irregular) in enumerate(zip(points, fdata, fdatairregular)): - try: - np.testing.assert_allclose( - irregular.values, original(fun_points)[0], - # irregular(fun_points), original(fun_points), - ) - except AssertionError: - print(f"{i=}") - print(f"{fun_points=}") - print(f"{original=}") - print(f"{irregular=}") - print() - print(f"{original(fun_points)=}") - print(f"{irregular.values=}") - raise AssertionError() + for fun_points, original, irregular in zip(points, fdata, fdatairregular): + np.testing.assert_allclose( + irregular.values, original(fun_points)[0], + # irregular(fun_points), original(fun_points), + ) + # The commented line above should be used but evaluation of + # FDataIrregular is not working for multidimensional domain @pytest.fixture @@ -67,11 +58,7 @@ def fdatabasis_multidimensional() -> FDataBasis: def fdatabasis_2dimensional_domain() -> FDataBasis: basis_fourier = FourierBasis(n_basis=5, domain_range=(-3, 3)) basis_monomial = MonomialBasis(n_basis=4, domain_range=(0, 1)) - basis = TensorBasis([basis_fourier, basis_monomial]) - # import matplotlib.pyplot as plt - # basis.plot() - # plt.show() return FDataBasis( basis=basis, coefficients=random_state.randn(15, basis.n_basis), @@ -88,6 +75,7 @@ def fdatagrid_1dimensional() -> FDataGrid: @pytest.fixture def fdatagrid_multidimensional() -> FDataGrid: + """3-dimensional domain and 5-dimensional codomain""" return FDataGrid( data_matrix=random_state.randn(14, 10, 5, 7, 
5), grid_points=[ @@ -116,9 +104,9 @@ def fdatairregular_multidimensional() -> FDataIrregular: [0], np.cumsum(random_state.randint(2, 5, 17)), ]) return FDataIrregular( - points=random_state.randn(100, 1), # TODO: change to multidimensional + points=random_state.randn(100, 1), # TODO: Change to multidimensional # domain when evaluation of FDataIrregular is working for - # multidimensional domain + # multidimensional domains. values=random_state.randn(100, 5), start_indices=start_indices, ) From c4151e995b3a6c51a78297fe0b913e2d90f35ede Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Wed, 12 Jun 2024 19:51:47 +0200 Subject: [PATCH 42/48] Tests for the multidimensional case (_mixed_effects) --- .../conversion/_mixed_effects.py | 6 +- skfda/tests/test_mixed_effects_converter.py | 282 +++++++++--------- 2 files changed, 152 insertions(+), 136 deletions(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index e0dd86e16..8101ac54d 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -179,7 +179,7 @@ def _linalg_solve( try: return scipy.linalg.solve(a=a, b=b, assume_a=assume_a) # type: ignore except scipy.linalg.LinAlgError: - # TODO: is the best way to handle this ? + # TODO: is the best way to handle this? 
# print("Warning: scipy.linalg.solve failed, using scipy.linalg.lstsq") return scipy.linalg.lstsq(a=a, b=b)[0] # type: ignore @@ -220,7 +220,7 @@ def sigmasq(self) -> float: @property def covariance_div_sigmasq(self) -> NDArrayFloat: - """Covariance of the mixed effects.""" + """Covariance of the mixed effects divided by sigmasq.""" @property def mean(self) -> NDArrayFloat: @@ -921,6 +921,6 @@ def fit( fitted_params=fitted_params, success=converged, message=message, - nit=iter_number, + nit=iter_number + 1, ) return self diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py index 91b1afda0..b21e2c20d 100644 --- a/skfda/tests/test_mixed_effects_converter.py +++ b/skfda/tests/test_mixed_effects_converter.py @@ -1,34 +1,25 @@ """Tests for the mixed effects to-basis-converter.""" -import pytest +from typing import Any, Literal, Optional, Tuple, Type + import numpy as np -import numpy.typing as npt -from typing import ( - Any, - Callable, - Iterable, - List, - Optional, - Tuple, - Type, -) +import pytest from skfda import FDataBasis +from skfda.datasets import irregular_sample from skfda.misc.scoring import r2_score -from skfda.representation import ( - FDataBasis, - FDataIrregular, -) -from skfda.typing._numpy import (NDArrayFloat, NDArrayInt) +from skfda.representation import FDataBasis, FDataIrregular from skfda.representation.basis import ( + Basis, BSplineBasis, FourierBasis, + MonomialBasis, + TensorBasis, + VectorValuedBasis, ) from skfda.representation.conversion._mixed_effects import ( + EMMixedEffectsConverter, MinimizeMixedEffectsConverter, MixedEffectsConverter, - EMMixedEffectsConverter, - _get_values_list, - _get_basis_evaluations_list, _MixedEffectsModel, ) @@ -48,6 +39,8 @@ def test_loglikelihood() -> None: basis = FourierBasis(n_basis=5, domain_range=(0, 10)) model = _MixedEffectsModel(fdatairregular, basis) + # These values have been obtained with Statsmodels' MixedLM + # for the same model 
params_loglike_list = [ (np.array([ 217.36197672, 111.34775404, 169.8070363, 337.91045293, @@ -102,164 +95,187 @@ def test_loglikelihood() -> None: assert np.allclose(mixedlm_loglikelihood, model_loglikelihood) -def _create_irregular_samples( - funcs: Iterable[ - Callable[[npt.NDArray[np.float_]], npt.NDArray[np.float_]] - ], - points: npt.NDArray[np.float_], - noise_generate_std: float, +def _create_irregular_samples_with_noise( + fdatabasis_original: FDataBasis, *, - start_indices: NDArrayInt | None = None, - n_points: int | None = None, + noise_generate_std: float, + n_points_range: Tuple[int], + random_state: np.random.RandomState, ) -> FDataIrregular: - """Generate samples of functions at points with gaussian noise. + """Generate samples of functions at random points with gaussian noise. Args: - funcs: Functions to sample. - points: Points where to sample. + fdatabasis_original: Functions to sample. noise_generate_std: Standard deviation of the gaussian noise. - start_indices: Start indices of each sample. - n_points: Number of points of each sample. If not None, start_indices - is ignored. + n_points_range: Range of the number of points of each sample. 
""" - if n_points is not None: - start_indices = np.arange(0, len(points), n_points) - elif start_indices is None: - raise ValueError("Either n_points or start_indices must be provided") - fun_points = np.split(points, start_indices[1:]) - fun_values = np.concatenate([ - func(point) for func, point in zip(funcs, fun_points) - ]).reshape((-1, 1)) + n_points_per_sample = random_state.randint( + *n_points_range, fdatabasis_original.n_samples, + ) + fdatairregular_no_noise = irregular_sample( + fdatabasis_original, + n_points_per_curve=n_points_per_sample, + random_state=random_state, + ) noise_values = np.random.normal( - 0, noise_generate_std, len(fun_values), - ).reshape((-1, 1)) - return FDataIrregular( - start_indices=start_indices, - points=points, - values=fun_values + noise_values, + 0, noise_generate_std, fdatairregular_no_noise.values.shape, ) - - -def _get_points( - domain_range: Tuple[float, float], - n_points: int, - n_samples: int, - type_gen_points: int, -) -> npt.NDArray[np.float_]: - n = type_gen_points - tot_n_points = n_points * n_samples - domain_split = np.linspace(*domain_range, n + 1) - domains = list(zip(domain_split[:-1], domain_split[1:])) - points_list = [ - np.random.uniform( - domain[0] - 0.6 * (domain[1] - domain[0]), - domain[1] + 0.6 * (domain[1] - domain[0]), - size=tot_n_points // n) - for domain in domains - ] - ret_value = np.concatenate(points_list).reshape((-1, 1))[:tot_n_points] - - return ( - ret_value - * (ret_value >= domain_range[0]) - * (ret_value <= domain_range[1]) - + domain_range[0] * (ret_value < domain_range[0]) - + domain_range[1] * (ret_value > domain_range[1]) + return FDataIrregular( + start_indices=fdatairregular_no_noise.start_indices, + points=fdatairregular_no_noise.points, + values=fdatairregular_no_noise.values + noise_values, ) def _cmp_estimation_with_original( - n_points: int, - sigma: float, # to generate the noise - domain_range: Tuple[float, float], - funcs: List[Callable[[NDArrayFloat], NDArrayFloat]], 
- type_gen_points: int, - estimator: MixedEffectsConverter, - fit_kwargs: dict[str, Any], fdatabasis_original: FDataBasis, + noise_generate_std: float, # to generate the noise + converter: MixedEffectsConverter, + fit_kwargs: dict[str, Any], + check: Literal["r2_score", "coefficients", "both"], + random_state: np.random.RandomState, ) -> None: - n_samples = len(funcs) - points = _get_points(domain_range, n_points, n_samples, type_gen_points) - fdatairregular = _create_irregular_samples( - funcs=funcs, - points=points, - noise_generate_std=sigma, - n_points=n_points, + fdatairregular = _create_irregular_samples_with_noise( + fdatabasis_original=fdatabasis_original, + noise_generate_std=noise_generate_std, + n_points_range=(5, 9), + random_state=random_state, ) - - fdatabasis_estimated = estimator.fit_transform( + fdatabasis_estimated = converter.fit_transform( fdatairregular, **fit_kwargs, ) - assert estimator.result.success, "Optimization failed" - assert r2_score(fdatabasis_estimated, fdatabasis_original) > 0.9 + assert converter.result.success, "Optimization failed" + if check in ("r2_score", "both"): + assert r2_score(fdatabasis_estimated, fdatabasis_original) > 0.9 + if check in ("r2_score", "both"): + np.allclose( + fdatabasis_estimated.coefficients, + fdatabasis_original.coefficients, + ) -def _test_compare_with_original( - estimator_cls: Type[MixedEffectsConverter], - fit_kwargs: Optional[dict[str, Any]] = None, -) -> None: - np.random.seed(34285676) - if fit_kwargs is None: - fit_kwargs = {} - - domain_range = (0, 100) - _max_val = 5 - n_points = 7 - n_basis = 3 - n_samples = 40 +def _get_fdatabasis_original( + basis: Basis, + n_samples: int, + random_state: np.random.RandomState, +) -> FDataBasis: + # These scales are arbitrary + _scale_cov = 5 + _scale_mean = 10 - basis = BSplineBasis( - n_basis=n_basis, domain_range=domain_range, order=2, - ) - sigma = 0.1 + n_basis = basis.n_basis fe_cov_sqrt = np.zeros((n_basis, n_basis)) - 
fe_cov_sqrt[np.tril_indices(n_basis)] = np.random.rand( + fe_cov_sqrt[np.tril_indices(n_basis)] = random_state.randn( n_basis * (n_basis + 1) // 2, - ) * _max_val + ) * _scale_cov fe_cov = fe_cov_sqrt @ fe_cov_sqrt.T - mean = np.array([-15, 20, 6]) - fdatabasis_original = FDataBasis( + mean = random_state.randn(n_basis) * _scale_mean + return FDataBasis( basis=basis, - coefficients=np.random.multivariate_normal( + coefficients=random_state.multivariate_normal( mean=mean, cov=fe_cov, size=n_samples, ), ) - def fun(i: int): - return lambda x: fdatabasis_original[i](x).reshape(x.shape) - funcs = [fun(i) for i in range(n_samples)] +def _test_cmp_with_original_bsplines( + converter_cls: Type[MixedEffectsConverter], + fit_kwargs: Optional[dict[str, Any]] = None, +) -> None: + random_state = np.random.RandomState(238953274) + if fit_kwargs is None: + fit_kwargs = {} + + fdatabasis_original = _get_fdatabasis_original( + basis=BSplineBasis( + n_basis=3, domain_range=(0, 100), order=2, + ), + n_samples=40, + random_state=random_state, + ) _cmp_estimation_with_original( - n_points=n_points, - sigma=sigma, - funcs=funcs, - type_gen_points=5, - estimator=estimator_cls(basis=basis), - domain_range=domain_range, - fit_kwargs=fit_kwargs, fdatabasis_original=fdatabasis_original, + noise_generate_std=0.1, + converter=converter_cls(basis=fdatabasis_original.basis), + fit_kwargs=fit_kwargs, + check="both", + random_state=random_state, ) -def test_compare_minimize_with_original() -> None: - """Compare the EM conversion with the original data.""" - _test_compare_with_original( - estimator_cls=MinimizeMixedEffectsConverter, +def test_cmp_minimize_with_original() -> None: + """Compare the MinimizeMixedEffects conversion with the original data.""" + _test_cmp_with_original_bsplines( + converter_cls=MinimizeMixedEffectsConverter, fit_kwargs={ "minimization_method": "Powell", } ) -def test_compare_em_with_original() -> None: - """Compare the EM conversion with the original data.""" - 
_test_compare_with_original( - estimator_cls=EMMixedEffectsConverter, +# This test for EM with simple splines as we have the multidimensional one, +# so as to reduce execution time. +# def test_compare_em_with_original_bsplines() -> None: +# """Compare the EM conversion with the original data.""" +# _test_cmp_with_original_bsplines( +# converter_cls=EMMixedEffectsConverter, +# fit_kwargs={ +# "maxiter": 500, +# "convergence_criterion": "params", +# "rtol": 1e-3, +# } +# ) + + +def _test_cmp_with_original_multidimensional_data( + converter_cls: Type[MixedEffectsConverter], + fit_kwargs: Optional[dict[str, Any]] = None, +) -> None: + """Compare the conversion with the original data. + + The dimension of the domain and the dimension of the codomain are both 2. + """ + random_state = np.random.RandomState(238953274) + if fit_kwargs is None: + fit_kwargs = {} + + basis_momonial1 = MonomialBasis(n_basis=3, domain_range=(-3, 3)) + basis_fourier1 = FourierBasis(n_basis=1, domain_range=(-3, 3)) + basis_monomial2 = MonomialBasis(n_basis=1, domain_range=(0, 1)) + basis_fourier2 = FourierBasis(n_basis=3, domain_range=(0, 1)) + + tensor_basis1 = TensorBasis([basis_momonial1, basis_monomial2]) + tensor_basis2 = TensorBasis([basis_fourier1, basis_fourier2]) + + basis = VectorValuedBasis([tensor_basis1, tensor_basis2, tensor_basis1]) + fdatabasis_original = _get_fdatabasis_original( + basis=basis, + n_samples=40, + random_state=random_state, + ) + + _cmp_estimation_with_original( + fdatabasis_original=fdatabasis_original, + noise_generate_std=0.1, + converter=converter_cls(basis=fdatabasis_original.basis), + fit_kwargs=fit_kwargs, + check="coefficients", + random_state=random_state, + ) + + +def test_compare_em_with_original_multidimensional_data() -> None: + """Compare the EM conversion with the original data. + + The dimension of the domain and the dimension of the codomain are both 2. 
+ """ + _test_cmp_with_original_multidimensional_data( + converter_cls=EMMixedEffectsConverter, fit_kwargs={ "maxiter": 500, "convergence_criterion": "params", - "rtol": 1e-3, + "rtol": 1e-1, } ) From 4073501c9fdec072b1ce13951932bb4682cf96a2 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Thu, 13 Jun 2024 17:04:12 +0200 Subject: [PATCH 43/48] documentation and review --- docs/modules/representation/conversion.rst | 13 +++- skfda/datasets/_sample_from_fdata.py | 49 ++++++------- .../conversion/_mixed_effects.py | 72 +++++++++++-------- 3 files changed, 79 insertions(+), 55 deletions(-) diff --git a/docs/modules/representation/conversion.rst b/docs/modules/representation/conversion.rst index d1e1c0694..2cadd1b0d 100644 --- a/docs/modules/representation/conversion.rst +++ b/docs/modules/representation/conversion.rst @@ -8,8 +8,17 @@ and :class:`FDataBasis` has been implemented via converters. :class:`FDataIrregular` to :class:`FDataBasis` ---------------------------------------------- -The following classes are used for converting irregular functional data to -basis representation using the mixed effects model. +The following module contains a detailed description of one of the types +of converters. + +.. autosummary:: + :toctree: autosummary + + skfda.representation.conversion._mixed_effects + + +The following classes are used for converting irregular functional +data to basis representation using the mixed effects model. .. autosummary:: :toctree: autosummary diff --git a/skfda/datasets/_sample_from_fdata.py b/skfda/datasets/_sample_from_fdata.py index e7ef587fa..5cd8d9eb0 100644 --- a/skfda/datasets/_sample_from_fdata.py +++ b/skfda/datasets/_sample_from_fdata.py @@ -29,9 +29,9 @@ def irregular_sample( Args: fdata: Functional data object to sample from. n_points_per_curve: Number of points to sample per curve. 
If fdata is - an FDataGrid or an FDataIrregular and a sample has less points than - specified in n_points_per_curve, the sample will have the same number - of points as before. + an FDataGrid or an FDataIrregular and a sample has less points than + specified in n_points_per_curve, the sample will have the same + number of points as before. random_state: Random state to control the random number generation. """ random_state = validate_random_state(random_state) @@ -100,10 +100,13 @@ def _irregular_sample_points_matrix_fdatagrid( n_points_per_curve, len(flat_points), ) - return [ - random_state.permutation(flat_points)[:n_points] - for n_points in n_points_per_curve - ], _start_indices(n_points_per_curve) + return ( + [ + random_state.permutation(flat_points)[:n_points] + for n_points in n_points_per_curve + ], + _start_indices(n_points_per_curve), + ) @_irregular_sample_points_list.register @@ -119,15 +122,18 @@ def _irregular_sample_points_matrix_fdatairregular( n_points_per_curve, original_n_points_per_curve, ) - return [ - random_state.permutation(curve_points)[ - :min(n_points, len(curve_points)), - ] - for n_points, curve_points in zip( - n_points_per_curve, - np.split(fdata.points, fdata.start_indices[1:]), - ) - ], _start_indices(n_points_per_curve) + return ( + [ + random_state.permutation(curve_points)[ + :min(n_points, len(curve_points)), + ] + for n_points, curve_points in zip( + n_points_per_curve, + np.split(fdata.points, fdata.start_indices[1:]), + ) + ], + _start_indices(n_points_per_curve), + ) @_irregular_sample_points_list.register @@ -138,14 +144,9 @@ def _irregular_sample_points_matrix_fdatabasis( ) -> Tuple[List[NDArrayFloat], NDArrayInt]: len_points = np.sum(n_points_per_curve) separate_coordinate_points = [ - random_state.uniform( - *domain_range_coordinate, - size=(len_points), - ) + random_state.uniform(*domain_range_coordinate, size=(len_points)) for domain_range_coordinate in fdata.domain_range ] start_indices = 
_start_indices(n_points_per_curve) - points = np.stack( - separate_coordinate_points, axis=1, - ) - return np.split(points, start_indices[1:]), start_indices + points = np.stack(separate_coordinate_points, axis=1) + return (np.split(points, start_indices[1:]), start_indices) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index 8101ac54d..78ab1055e 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -4,7 +4,16 @@ This module contains the class for converting irregular data to basis representation using the mixed effects model. -#TODO: Add references ? (laird & ware) +The use of the mixed effects model for conversion of irregularly sampled +functional data to a basis representation is detailed in +:footcite:t:`james_2018_sparsenessfda`. In the following, we provide a brief +overview of the model. + +Let :math:`(x_i(t))_{i=1}^N` be a functional dataset where each :math:`x_i` +is a function from :math:`[a, b]` to :math:`\mathbb{R}` and we have the +measurements of :math:`x_i(t)` at :math:`M_i` points of the domain +:math:`\{t_{i1}, t_{i2}, \dots, t_{iM_i}\}`. Let :math:`\{\phi_b\}_{b=1}^B` +be the basis that we want to express the data in. """ from __future__ import annotations @@ -93,12 +102,21 @@ def _get_basis_evaluations_list( basis: Basis to evaluate. Returns: - A list of matrices (one matrix per functional datum), each matrix is + A list of matrices (one matrix per functional datum). + + In the case of 1-dimensional codomain, each matrix is of shape (n_points, n_basis), where n_points is the number of points of the functional datum and n_basis is the number of basis functions. The i-th row of the matrix is the evaluation of the basis functions at the i-th point of the functional datum. 
+ In the case of p-dimensional codomain, each matrix is + of shape (n_points * dim_codomain, n_basis) (where n_points is the + number of points of the functional datum). + The (i*dim_codomain + j)-th row of the matrix is the j-th coordinate of + the evaluation of the basis functions at the i-th point of the + functional datum. + Examples: >>> from skfda.representation.basis import ( ... MonomialBasis, VectorValuedBasis, @@ -144,38 +162,35 @@ def _get_basis_evaluations_list( def _minimize( fun: Callable[[NDArrayFloat], float], x0: NDArrayFloat, - minimization_methods: str | List[str] | None = None, + minimization_method: str | None = None, ) -> scipy.optimize.OptimizeResult: - """Minimize a scalar function of one or more variables.""" - if isinstance(minimization_methods, str): - minimization_methods = [minimization_methods] - - if minimization_methods is None: - minimization_methods = _SCIPY_MINIMIZATION_METHODS - else: - for method in minimization_methods: - if method not in _SCIPY_MINIMIZATION_METHODS: - raise ValueError(f"Invalid minimize method: \"{method}\".") - - for method in minimization_methods: - result = scipy.optimize.minimize( - fun=fun, - x0=x0, - method=method, - options={ - # "disp": True, - # "maxiter": 1000, - }, + """Minimize a scalar function of one or more variables. + + Uses scipy.optimize.minimize. 
+ """ + if minimization_method is None: + minimization_method = _SCIPY_MINIMIZATION_METHODS[0] + elif minimization_method not in _SCIPY_MINIMIZATION_METHODS: + raise ValueError( + f"Invalid minimize method: \"{minimization_method}\".", ) - if result.success is True: - break + + result = scipy.optimize.minimize( + fun=fun, + x0=x0, + method=minimization_method, + options={ + # "disp": True, + # "maxiter": 1000, + }, + ) return result # even if it failed def _linalg_solve( a: NDArrayFloat, b: NDArrayFloat, *, assume_a: str = 'gen' ) -> NDArrayFloat: - """Solve a linear system of equations: a @ x = b""" + """Solve a linear system of equations: a @ x = b (returns x).""" try: return scipy.linalg.solve(a=a, b=b, assume_a=assume_a) # type: ignore except scipy.linalg.LinAlgError: @@ -447,8 +462,7 @@ class MixedEffectsConverter(_ToBasisConverter[FDataIrregular], ABC): - nit: Number of iterations of the fitting. """ - # after fitting: - result: Bunch | None + result: Bunch | None # not None after fitting def __init__( self, @@ -653,7 +667,7 @@ def objective_function(params_vec: NDArrayFloat) -> float: minimize_result = _minimize( fun=objective_function, x0=initial_params_vec, - minimization_methods=minimization_method, + minimization_method=minimization_method, ) params = MinimizeMixedEffectsConverter.Params.from_vec( minimize_result.x, From 0b9d37fe25fadad818225b457e905eee21c61d87 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Fri, 14 Jun 2024 12:01:50 +0200 Subject: [PATCH 44/48] simplify test for mixed_effets so that it runs faster --- skfda/tests/test_mixed_effects_converter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py index b21e2c20d..054ca5970 100644 --- a/skfda/tests/test_mixed_effects_converter.py +++ b/skfda/tests/test_mixed_effects_converter.py @@ -191,7 +191,7 @@ def _test_cmp_with_original_bsplines( basis=BSplineBasis( n_basis=3, 
domain_range=(0, 100), order=2, ), - n_samples=40, + n_samples=20, random_state=random_state, ) @@ -266,7 +266,7 @@ def _test_cmp_with_original_multidimensional_data( ) -def test_compare_em_with_original_multidimensional_data() -> None: +def test_cmp_em_with_original_multidimensional_data() -> None: """Compare the EM conversion with the original data. The dimension of the domain and the dimension of the codomain are both 2. From 523a9ea77105f138b6c360c674f15f0b08d59a39 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Fri, 14 Jun 2024 15:06:18 +0200 Subject: [PATCH 45/48] documentation and small changes --- docs/modules/representation/conversion.rst | 21 +-- .../conversion/mixed_effects.rst | 16 ++ docs/refs.bib | 3 - ...plot_irregular_mixed_effects_robustness.py | 7 - .../plot_irregular_to_basis_mixed_effects.py | 1 - .../conversion/_mixed_effects.py | 164 ++++++++++++++---- skfda/representation/conversion/_to_basis.py | 7 +- skfda/tests/test_mixed_effects_converter.py | 2 +- 8 files changed, 159 insertions(+), 62 deletions(-) create mode 100644 docs/modules/representation/conversion/mixed_effects.rst diff --git a/docs/modules/representation/conversion.rst b/docs/modules/representation/conversion.rst index 2cadd1b0d..c519f1c29 100644 --- a/docs/modules/representation/conversion.rst +++ b/docs/modules/representation/conversion.rst @@ -8,22 +8,11 @@ and :class:`FDataBasis` has been implemented via converters. :class:`FDataIrregular` to :class:`FDataBasis` ---------------------------------------------- -The following module contains a detailed description of one of the types -of converters. +These are the submodules that contain the converters for the conversion between +:class:`FDataIrregular` and :class:`FDataBasis`: -.. autosummary:: - :toctree: autosummary +.. 
toctree:: + :maxdepth: 2 - skfda.representation.conversion._mixed_effects - - -The following classes are used for converting irregular functional -data to basis representation using the mixed effects model. - -.. autosummary:: - :toctree: autosummary - - skfda.representation.conversion.EMMixedEffectsConverter - skfda.representation.conversion.MinimizeMixedEffectsConverter - skfda.representation.conversion.MixedEffectsConverter + conversion/mixed_effects diff --git a/docs/modules/representation/conversion/mixed_effects.rst b/docs/modules/representation/conversion/mixed_effects.rst new file mode 100644 index 000000000..53334422b --- /dev/null +++ b/docs/modules/representation/conversion/mixed_effects.rst @@ -0,0 +1,16 @@ +Mixed effects converters +######################## + +The following classes can be used for converting irregular functional +data to basis representation using the mixed effects model. + +.. autosummary:: + :toctree: autosummary + + skfda.representation.conversion.MinimizeMixedEffectsConverter + skfda.representation.conversion.EMMixedEffectsConverter + + +.. 
automodule:: skfda.representation.conversion._mixed_effects + :no-members: + diff --git a/docs/refs.bib b/docs/refs.bib index 4cce3aa0a..9dc39710a 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -677,7 +677,6 @@ @article{Lindstrom_1988 volume = {83}, issue = {404}, page = {1014--1022}, - url = {libgen.li/file.php?md5=432e6fa80db6feb0cb39b8d0215e5d3a} } @article{laird+lange+stram_1987_emmixedeffects, @@ -690,6 +689,4 @@ @article{laird+lange+stram_1987_emmixedeffects year = {1987}, publisher = {Taylor \& Francis}, doi = {10.1080/01621459.1987.10478395}, - url = {https://www.tandfonline.com/doi/abs/10.1080/01621459.1987.10478395}, - eprint = {https://www.tandfonline.com/doi/pdf/10.1080/01621459.1987.10478395} } \ No newline at end of file diff --git a/examples/plot_irregular_mixed_effects_robustness.py b/examples/plot_irregular_mixed_effects_robustness.py index 1c2defaf7..1d9827e1f 100644 --- a/examples/plot_irregular_mixed_effects_robustness.py +++ b/examples/plot_irregular_mixed_effects_robustness.py @@ -231,10 +231,3 @@ def plot_conversion_evolution(index: int): # the mixed-effects model is able to capture the general trend of the data, # but it is not able to properly capture the individual variation of each # curve. - - -# %% -# References -# ---------- -# -# .. 
footbibliography:: diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py index 74389bee6..a51c32a40 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -59,7 +59,6 @@ # %% # Plot some of the generated curves -plt.figure(figsize=(10, 5)) fdatabasis_original[:10].plot() plt.title("Original curves") plt.show() diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index 78ab1055e..c241764ba 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -1,19 +1,108 @@ # -*- coding: utf-8 -*- -"""Mixed effects converters. +r""" +Mixed effects converters +======================== -This module contains the class for converting irregular data to basis +This module contains the classes for converting irregular data to basis representation using the mixed effects model. The use of the mixed effects model for conversion of irregularly sampled functional data to a basis representation is detailed in :footcite:t:`james_2018_sparsenessfda`. In the following, we provide a brief -overview of the model. +overview of the model for 1-dimensional functional data. -Let :math:`(x_i(t))_{i=1}^N` be a functional dataset where each :math:`x_i` +The mixed effects model for functional data +------------------------------------------- + +Let :math:`\{x_i(t)\}_{i=1}^N` be a functional dataset where each :math:`x_i` is a function from :math:`[a, b]` to :math:`\mathbb{R}` and we have the measurements of :math:`x_i(t)` at :math:`M_i` points of the domain -:math:`\{t_{i1}, t_{i2}, \dots, t_{iM_i}\}`. Let :math:`\{\phi_b\}_{b=1}^B` -be the basis that we want to express the data in. +:math:`\mathbf{t}_i = (t_{i1}, t_{i2}, \dots, t_{iM_i})`. 
+That is, we have the irregularly sampled data: +:math:`\{x_i(\mathbf{t}_i)\}_{i=1}^N`, where +:math:`x_i(\mathbf{t}_i) = (x_i(t_{i1}), x_i(t_{i2}), \dots, x_i(t_{iM_i}))`. +Let :math:`\{\phi_b\}_{b=1}^B` be the basis that we want to express the +data in. We denote by :math:`\pmb{\phi}(t)` the vector of evaluations +:math:`(\phi_1(t), \phi_2(t), \dots, \phi_B(t))`. + +The mixed effects model assumes the data comes from the model (for each +:math:`1\leq i \leq N` and :math:`a\leq t \leq b`): + +.. math:: + x_i(t) = \pmb{\phi}(t)^T (\pmb{\beta} + \pmb{\gamma}_i) + \epsilon_i(t), + +where :math:`\pmb{\beta}\in\mathbb{R}^B` is an unknown constant vector +called the fixed effects (we will call it the **mean**); +:math:`\{\pmb{\gamma}_i\}_{i=1}^N\subseteq\mathbb{R}^B` are unknown +random vectors called the **random effects** and they are assumed to be +independent and identically distributed with a normal distribution of mean 0 +and covariance matrix :math:`\pmb{\Gamma}` (which we call **covariance** for +short); and :math:`\epsilon_i(t)` is a random noise term that is assumed to +have a normal distribution with mean 0 and variance :math:`\sigma^2` (which we +call **sigmasq**). We assume that +:math:`\{\epsilon_i(t)\}_{i,t}\cup\{\pmb{\gamma}_i\}_i` are independent. + +In order to work with this model and the data available, we denote (for each +:math:`1 \leq i \leq N`): + +.. math:: + + \pmb{x}_i = \left(\begin{array}{c} + x_i(t_{i1}) \\ + x_i(t_{i2}) \\ + \vdots \\ + x_i(t_{iM_i}) + \end{array}\right), + \qquad + \pmb{\Phi}_i = \left(\begin{array}{c} + \pmb{\phi}(t_{i1})^T \\ + \pmb{\phi}(t_{i2})^T \\ + \vdots \\ + \pmb{\phi}(t_{iM_i})^T + \end{array}\right), + \qquad + \pmb{\epsilon}_i = \left(\begin{array}{c} + \epsilon_i(t_{i1}) \\ + \epsilon_i(t_{i2}) \\ + \vdots \\ + \epsilon_i(t_{iM_i}) + \end{array}\right), + +and we have that our model can be written as (for each +:math:`1 \leq i \leq N`): + +.. 
math:: + + \pmb{x}_i = \pmb{\Phi}_i (\pmb{\beta} + \pmb{\gamma}_i) + \pmb{\epsilon}_i. + +We call :math:`\pmb{x}_i` the *i-th* **values** *vector*, and +:math:`\pmb{\Phi}_i` the *i-th* **basis evaluations** *matrix*. + + +Fitting the model +----------------- + +The model is fitted by maximizing its likelihood to get the MLE (Maximum +Likelihood Estimates) of :math:`\pmb{\beta}`, :math:`\pmb{\Gamma}` and +:math:`\sigma`, and then computing the random effects +(:math:`\{\pmb{\gamma}_i\}_i`) with their least squares linear estimators. + +The MLE are computed using either the EM algorithm +(:class:`EMMixedEffectsConverter`, +:footcite:t:`laird+lange+stram_1987_emmixedeffects`), or by minimizing the +profile loglikelihood of the model with generic numerical optimization +(:class:`MinimizeMixedEffectsConverter`, :footcite:t:`Lindstrom_1988`). + +The examples +:ref:`sphx_glr_auto_examples_plot_irregular_mixed_effects_robustness.py` and +:ref:`sphx_glr_auto_examples_plot_irregular_to_basis_mixed_effects.py` +illustrate the basic usage of these converters. + + +References +---------- + +.. footbibliography:: """ from __future__ import annotations @@ -41,12 +130,15 @@ "COBYLA", # no jacobian "SLSQP", "CG", # no hessian - "trust-ncg", - "trust-exact", - "trust-krylov", "TNC", - "dogleg", - "Newton-CG", # requires jacobian + + # The following methods require jacobian and we do not provide it + + # "trust-ncg", + # "trust-exact", + # "trust-krylov", + # "dogleg", + # "Newton-CG", ] _EM_MINIMIZATION_METHODS = [ @@ -166,13 +258,20 @@ def _minimize( ) -> scipy.optimize.OptimizeResult: """Minimize a scalar function of one or more variables. - Uses scipy.optimize.minimize. + Uses ``scipy.optimize.minimize``. + + Args: + fun: Function to minimize. + x0: Starting point for the minimization. + minimization_method: ``scipy.optimize.minimize`` method to use for + minimization. 
""" if minimization_method is None: minimization_method = _SCIPY_MINIMIZATION_METHODS[0] elif minimization_method not in _SCIPY_MINIMIZATION_METHODS: raise ValueError( - f"Invalid minimize method: \"{minimization_method}\".", + f"Invalid minimize method: \"{minimization_method}\". " + f"Supported methods are {_SCIPY_MINIMIZATION_METHODS}." ) result = scipy.optimize.minimize( @@ -231,11 +330,11 @@ def covariance(self) -> NDArrayFloat: @property def sigmasq(self) -> float: - """Variance of the residuals.""" + """Variance of the noise term.""" @property def covariance_div_sigmasq(self) -> NDArrayFloat: - """Covariance of the mixed effects divided by sigmasq.""" + """Covariance of the random effects divided by sigmasq.""" @property def mean(self) -> NDArrayFloat: @@ -251,7 +350,7 @@ class _MixedEffectsParamsResult: @property def covariance_div_sigmasq(self) -> NDArrayFloat: - """covariance/sigmasq of the mixed effects model.""" + """Covariance of the random effects divided by sigmasq.""" return self.covariance / self.sigmasq @@ -353,7 +452,7 @@ def values_covariances( ) Args: - sigmasq: Variance of the residuals. + sigmasq: Variance of the noise term. random_effects_covariance: Covariance of the random effects. """ @@ -369,7 +468,7 @@ def _random_effects_estimate( values_covariances: List[NDArrayFloat], partial_residuals: List[NDArrayFloat], ) -> NDArrayFloat: - """Estimates of the random effects (generalized least squares) + """Estimates of the random effects (generalized least squares). random_effects_estimate[k] = ( random_effects_covariance @ basis_evaluations[k].T @@ -455,7 +554,7 @@ class MixedEffectsConverter(_ToBasisConverter[FDataIrregular], ABC): - model: Fitted mixed effects model. - fitted_params: Fitted parameters of the mixed effects model. - - minimize_result: Result of the scipy.optimize.minimize call, + - minimize_result: Result of the ``scipy.optimize.minimize`` call, if this function was used. - success: Whether the fitting was successful. 
- message: Message of the fitting. @@ -475,7 +574,7 @@ def transform( self, X: FDataIrregular, ) -> FDataBasis: - """Transform to FDataBasis using the fitted converter.""" + """Transform X to FDataBasis using the fitted converter.""" if self.result is None: raise ValueError("The converter has not been fitted.") @@ -499,7 +598,7 @@ def transform( class MinimizeMixedEffectsConverter(MixedEffectsConverter): - """Mixed effects to-basis-converter using scipy.optimize. + """Mixed effects to-basis-converter using ``scipy.optimize.minimize``. Minimizes the profile loglikelihood of the mixed effects model as proposed by :footcite:t:`Lindstrom_1988`. @@ -573,7 +672,7 @@ def covariance(self) -> NDArrayFloat: @property def sigmasq(self) -> float: - """Variance of the residuals.""" + """Variance of the noise term.""" assert self._model is not None, "Model is required" return _sum_mahalanobis( self._model.partial_residuals(self.mean), @@ -629,8 +728,8 @@ def fit( X: irregular data to fit. y: ignored. initial_params: initial params of the model. - minimization_method: scipy.optimize.minimize method to be used for - the minimization of the loglikelihood of the model. + minimization_method: ``scipy.optimize.minimize`` method to be used + for the minimization of the loglikelihood of the model. has_mean: Whether the mean is a fixed parameter to be optimized or estimated with ML estimator from the covariance parameters. 
@@ -686,7 +785,11 @@ def objective_function(params_vec: NDArrayFloat) -> float: minimize_result=minimize_result, success=minimize_result.success, message=minimize_result.message, - nit=minimize_result.nit, + **( + {"nit": minimize_result.nit} + if "nit" in minimize_result.keys() + else {} + ), ) return self @@ -739,7 +842,7 @@ def _mean( model: _MixedEffectsModel, values_covariances_list: List[NDArrayFloat], ) -> NDArrayFloat: - """Return the beta estimate.""" + """Return the mean estimate.""" return _linalg_solve( a=_sum_mahalanobis( model.basis_evaluations, @@ -822,11 +925,14 @@ def fit( maxiter: maximum number of iterations. convergence_criterion: convergence criterion to use when fitting. - - "params" to use relative differences between parameters + - "params": + to use relative differences between parameters (the default). - - "squared-error" to use relative changes in the squared error + - "squared-error": + to use relative changes in the squared error of the estimated values with respect to the original data. - - "loglikelihood" to use relative changes in the loglikelihood. + - "loglikelihood": + to use relative changes in the loglikelihood. rtol: relative tolerance for convergence. Returns: diff --git a/skfda/representation/conversion/_to_basis.py b/skfda/representation/conversion/_to_basis.py index 54868e985..7703a9489 100644 --- a/skfda/representation/conversion/_to_basis.py +++ b/skfda/representation/conversion/_to_basis.py @@ -7,11 +7,9 @@ """ from __future__ import annotations -from typing import ( - TypeVar, -) +from typing import TypeVar -from ..._utils._sklearn_adapter import BaseEstimator, TransformerMixin +from ..._utils._sklearn_adapter import TransformerMixin from ...representation import FData, FDataBasis from ...representation.basis import Basis @@ -23,7 +21,6 @@ class _ToBasisConverter( - BaseEstimator, TransformerMixin[Input, FDataBasis, object], ): """To basis converter. 
diff --git a/skfda/tests/test_mixed_effects_converter.py b/skfda/tests/test_mixed_effects_converter.py index 054ca5970..b2078ab5c 100644 --- a/skfda/tests/test_mixed_effects_converter.py +++ b/skfda/tests/test_mixed_effects_converter.py @@ -274,7 +274,7 @@ def test_cmp_em_with_original_multidimensional_data() -> None: _test_cmp_with_original_multidimensional_data( converter_cls=EMMixedEffectsConverter, fit_kwargs={ - "maxiter": 500, + "maxiter": 300, "convergence_criterion": "params", "rtol": 1e-1, } From 20bd812ab2f53f17d965d8a366f6405f6f7a290d Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Fri, 14 Jun 2024 16:14:08 +0200 Subject: [PATCH 46/48] change order of examples in documentation --- skfda/representation/conversion/_mixed_effects.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skfda/representation/conversion/_mixed_effects.py b/skfda/representation/conversion/_mixed_effects.py index c241764ba..323919266 100644 --- a/skfda/representation/conversion/_mixed_effects.py +++ b/skfda/representation/conversion/_mixed_effects.py @@ -94,8 +94,8 @@ (:class:`MinimizeMixedEffectsConverter`, :footcite:t:`Lindstrom_1988`). The examples -:ref:`sphx_glr_auto_examples_plot_irregular_mixed_effects_robustness.py` and -:ref:`sphx_glr_auto_examples_plot_irregular_to_basis_mixed_effects.py` +:ref:`sphx_glr_auto_examples_plot_irregular_to_basis_mixed_effects.py` and +:ref:`sphx_glr_auto_examples_plot_irregular_mixed_effects_robustness.py` illustrate the basic usage of these converters. 
From 6ad8998f92e473e4f9d0b313b526b01e6c6a3c64 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Fri, 14 Jun 2024 16:19:37 +0200 Subject: [PATCH 47/48] remove file that should not have been commited --- skfda/tests/_test_vector_mixed_effects.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 skfda/tests/_test_vector_mixed_effects.py diff --git a/skfda/tests/_test_vector_mixed_effects.py b/skfda/tests/_test_vector_mixed_effects.py deleted file mode 100644 index a4f4a6a2e..000000000 --- a/skfda/tests/_test_vector_mixed_effects.py +++ /dev/null @@ -1,19 +0,0 @@ -# %% -import numpy as np -from skfda.representation.basis import MonomialBasis -from skfda.representation.basis import VectorValuedBasis -import matplotlib.pyplot as plt -# %% -m2 = MonomialBasis(n_basis=2, domain_range=(0, 10)) -m3 = MonomialBasis(n_basis=3, domain_range=(0, 10)) -m3.plot() -plt.show() - -# %% -# m2.plot() -vbasis = VectorValuedBasis([m2, m3]) -vbasis.plot() -plt.show() - -# %% -vbasis(2) \ No newline at end of file From 1d696e79d6cc8d575c175669a4fe47cb780ca291 Mon Sep 17 00:00:00 2001 From: Pablo Cuesta Sierra Date: Sun, 16 Jun 2024 21:13:13 +0200 Subject: [PATCH 48/48] Make example more visually interesting --- examples/plot_irregular_to_basis_mixed_effects.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/plot_irregular_to_basis_mixed_effects.py b/examples/plot_irregular_to_basis_mixed_effects.py index a51c32a40..b340080c6 100644 --- a/examples/plot_irregular_to_basis_mixed_effects.py +++ b/examples/plot_irregular_to_basis_mixed_effects.py @@ -191,13 +191,13 @@ # # As we want to illustrate the conversion of irregular data to basis, # representation, we will take an irregular sample of the temperatures dataset -# containing only 8 points per curve. +# containing only 7 points per curve. 
weather = fetch_weather() fd_temperatures = weather.data.coordinates[0] -random_state = np.random.RandomState(seed=439472) # for reproducibility +random_state = np.random.RandomState(seed=73947291) irregular_temperatures = irregular_sample( - fdata=fd_temperatures, n_points_per_curve=8, random_state=random_state, + fdata=fd_temperatures, n_points_per_curve=7, random_state=random_state, ) # %% # The dataset contains information about the region of each station, @@ -244,7 +244,7 @@ # To visualize the conversion, we now plot 4 of the converted # curves (one from each region) along with the original temperatures # and the irregular points that we sampled. -idxes = [arctic[2], atlantic[4], continental[11], pacific[1]] +idxes = [arctic[0], atlantic[11], continental[3], pacific[3]] fig = plt.figure(figsize=(10, 10)) for k in range(4): axes = plt.subplot(2, 2, k + 1)