Skip to content

Commit

Permalink
Expose multivariate PRESS to Python. More tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
romanwerpachowski committed Sep 9, 2020
1 parent 4734465 commit ff7f06e
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 4 deletions.
4 changes: 3 additions & 1 deletion ML/LinearRegression.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,12 @@ namespace ml
See https://en.wikipedia.org/wiki/PRESS_statistic for details.
@warning When calculating PRESS for regularised OLS, `regression` must standardise the data internally (call ridge() with `DoStandardise == true`).
@tparam Regression Functor type implementing particular regression.
@param[in] X D x N matrix of X values, with data points in columns.
@param[in] y Y vector with length N.
@param[in] regression Regression functor. `regression(X, y)` should return a result object supporting a `predict(X)` call (e.g. MultivariateOLSResult).
@param[in] regression Regression functor. `regression(X, y)` should return a result object supporting a `predict(X)` call (e.g. MultivariateOLSResult). Must standardise the data internally if necessary.
@return Value of the PRESS statistic.
@throw std::invalid_argument If `X.cols() != y.size()`.
*/
Expand Down
26 changes: 25 additions & 1 deletion Tests/test_LinearRegression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1134,7 +1134,7 @@ TEST_F(LinearRegressionTest, ridge_do_standardise_covariance)
ASSERT_NEAR(0, (sample_cov - result.cov).norm(), 5e-6) << "estimate:\n" << result.cov << "\n\nsample:\n" << sample_cov << "\n\ndifference:\n" << (sample_cov - result.cov);
}

TEST_F(LinearRegressionTest, press)
TEST_F(LinearRegressionTest, press_multivariate)
{
Eigen::MatrixXd X(2, 3);
X << -1, 0, 1,
Expand All @@ -1145,6 +1145,30 @@ TEST_F(LinearRegressionTest, press)
ASSERT_NEAR(4 + 1 + 4, press_statistic, 1e-15);
}

TEST_F(LinearRegressionTest, press_ridge_zero_lambda)
{
Eigen::MatrixXd X(1, 3);
X << -1, 0, 1;
Eigen::VectorXd y(3);
y << 1, 0, 1;
const double press_statistic = press(X, y, [](Eigen::Ref<const Eigen::MatrixXd> train_X, Eigen::Ref<const Eigen::VectorXd> train_y) {
return ridge<true>(train_X, train_y, 0);
});
ASSERT_NEAR(4 + 1 + 4, press_statistic, 1e-15);
}

TEST_F(LinearRegressionTest, press_ridge)
{
Eigen::MatrixXd X(1, 3);
X << -1, 0, 1;
Eigen::VectorXd y(3);
y << 1, 0, 1;
const double press_statistic = press(X, y, [](Eigen::Ref<const Eigen::MatrixXd> train_X, Eigen::Ref<const Eigen::VectorXd> train_y) {
return ridge<true>(train_X, train_y, 0);
});
ASSERT_NEAR(4 + 1 + 4, press_statistic, 1e-15);
}

TEST_F(LinearRegressionTest, press_univariate_with_intercept)
{
Eigen::VectorXd x(3);
Expand Down
39 changes: 37 additions & 2 deletions cppyml/linear_regression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,21 @@ namespace ml
}
}

static double press_cppyml(Eigen::Ref<const MatrixXdR> X, Eigen::Ref<const Eigen::VectorXd> y, const char* regularisation, const double reg_lambda)
{
if (!strcmp(regularisation, "ridge")) {
return press(X.transpose(), y, [reg_lambda](Eigen::Ref<const Eigen::MatrixXd> train_X, Eigen::Ref<const Eigen::VectorXd> train_y) {
return ridge<true>(train_X, train_y, reg_lambda);
});
} else if (!strcmp(regularisation, "none")) {
return press(X.transpose(), y, [](Eigen::Ref<const Eigen::MatrixXd> train_X, Eigen::Ref<const Eigen::VectorXd> train_y) {
return multivariate(train_X, train_y);
});
} else {
throw std::invalid_argument("Unknown regression type. Valid types: \"none\" or \"ridge\".");
}
}

static double press_univariate_cppyml(Eigen::Ref<const Eigen::VectorXd> x, Eigen::Ref<const Eigen::VectorXd> y, const bool with_intercept)
{
if (with_intercept) {
Expand All @@ -118,6 +133,8 @@ void init_linear_regression(py::module& m)
{
auto m_lin_reg = m.def_submodule("linear_regression", "Linear regression algorithms.");

constexpr bool default_do_standardise = false;

py::class_<ml::LinearRegression::Result> result(m_lin_reg, "Result");
result.def_readonly("n", &ml::LinearRegression::Result::n, "Number of data points.")
.def_readonly("dof", &ml::LinearRegression::Result::dof, "Number of residual degrees of freedom (e.g. `n - 2` or `n - 1` for univariate regression with or without intercept).")
Expand Down Expand Up @@ -299,7 +316,7 @@ or set the parameter `add_ones` to `True`.
)";

m_lin_reg.def("ridge", &ml::LinearRegression::ridge_row_major,
py::arg("X"), py::arg("y"), py::arg("lambda"), py::arg("do_standardise") = false,
py::arg("X"), py::arg("y"), py::arg("lambda"), py::arg("do_standardise") = default_do_standardise,
R"(Carries out multivariate ridge regression with intercept.
Given X and y, finds beta and beta0 minimising \f$ \lVert \vec{y} - X^T \vec{\beta} \rVert^2 + \lambda \lVert \vec{\beta} \rVert^2 \f$.
Expand All @@ -310,13 +327,31 @@ The matrix `X` is assumed to be standardised unless `do_standardise` is set to `
Args:
X: X matrix (shape N x D, with D <= N), with data points in rows.
y: Y vector with length N.
do_standardise: Whether to automatically subtract the mean from each row in `X` and divide it by its standard deviation.
do_standardise: Whether to automatically subtract the mean from each row in `X` and divide it by its standard deviation (defaults to False).
Returns:
Instance of `RidgeRegressionResult`. If `do_standardise` was `True`, the `beta` vector will be rescaled and shifted
to original `X` units and origins, and the `cov` matrix will be transformed accordingly.
)");

m_lin_reg.def("press", &ml::LinearRegression::press_cppyml,
py::arg("X"), py::arg("y"), py::arg("regularisation") = "none", py::arg("reg_lambda") = 0.,
R"(Calculates the PRESS statistic (Predicted Residual Error Sum of Squares).
See https://en.wikipedia.org/wiki/PRESS_statistic for details.
NOTE: Training data will be standardised internally if using regularisation.
Args:
X: X matrix (shape N x D, with D <= N), with data points in rows. Unstandardised.
y: Y vector with length N.
regularisation: Type of regularisation: "none" or "ridge". Defaults to "none".
reg_lambda: Non-negative regularisation strength. Defaults to 0. Ignored if `regularisation == "none"`.
Returns:
Value of the PRESS statistic.
)");

m_lin_reg.def("press_univariate", &ml::LinearRegression::press_univariate_cppyml,
py::arg("x"), py::arg("y"), py::arg("with_intercept") = true,
R"(Calculates the PRESS statistic (Predicted Residual Error Sum of Squares) for univariate regression.
Expand Down
22 changes: 22 additions & 0 deletions cppyml/tests/test_linear_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,28 @@ def test_press_univariate_without_intercept(self):
actual = linear_regression.press_univariate(x, y, False)
self.assertAlmostEqual(8, actual, delta=1e-15)

def test_press_no_regularisation(self):
X = np.array([[-1, 1], [0, 1], [1, 1]])
y = np.array([1, 0, 1])
actual1 = linear_regression.press(X, y)
actual2 = linear_regression.press(X, y, "none")
self.assertEqual(actual1, actual2)
self.assertAlmostEqual(9, actual1, delta=1e-15)

def test_press_ridge_zero_strength(self):
X = np.array([[-1], [0], [1]])
y = np.array([1, 0, 1])
actual1 = linear_regression.press(X, y, "ridge")
actual2 = linear_regression.press(X, y, "ridge", 0)
self.assertEqual(actual1, actual2)
self.assertAlmostEqual(9, actual1, delta=1e-15)

def test_press_ridge(self):
X = np.array([[-1], [0], [1]])
y = np.array([1, 0, 1])
actual = linear_regression.press(X, y, "ridge", 0.1)
self.assertGreater(9, actual)


if __name__ == "__main__":
unittest.main()

0 comments on commit ff7f06e

Please sign in to comment.