Expose multivariate PRESS to Python. More tests.

romanwerpachowski · Sep 9, 2020 · ff7f06e · ff7f06e
1 parent 4734465
commit ff7f06e
Show file tree

Hide file tree

Showing 4 changed files with 87 additions and 4 deletions.
diff --git a/ML/LinearRegression.hpp b/ML/LinearRegression.hpp
@@ -213,10 +213,12 @@ namespace ml
 
 		See https://en.wikipedia.org/wiki/PRESS_statistic for details.
 
+		@warning When calculating PRESS for regularised OLS, `regression` must standardise the data internally (call ridge() with `DoStandardise == true`).
+
 		@tparam Regression Functor type implementing particular regression.
 		@param[in] X D x N matrix of X values, with data points in columns.
 		@param[in] y Y vector with length N.
-		@param[in] regression Regression functor. `regression(X, y)` should return a result object supporting a `predict(X)` call (e.g. MultivariateOLSResult).
+		@param[in] regression Regression functor. `regression(X, y)` should return a result object supporting a `predict(X)` call (e.g. MultivariateOLSResult). Must standardise the data internally if necessary.
 		@return Value of the PRESS statistic.
 		@throw std::invalid_argument If `X.cols() != y.size()`.
 		*/

diff --git a/Tests/test_LinearRegression.cpp b/Tests/test_LinearRegression.cpp
@@ -1134,7 +1134,7 @@ TEST_F(LinearRegressionTest, ridge_do_standardise_covariance)
 	ASSERT_NEAR(0, (sample_cov - result.cov).norm(), 5e-6) << "estimate:\n" << result.cov << "\n\nsample:\n" << sample_cov << "\n\ndifference:\n" << (sample_cov - result.cov);
 }
 
-TEST_F(LinearRegressionTest, press)
+TEST_F(LinearRegressionTest, press_multivariate)
 {
 	Eigen::MatrixXd X(2, 3);
 	X << -1, 0, 1,
@@ -1145,6 +1145,30 @@ TEST_F(LinearRegressionTest, press)
 	ASSERT_NEAR(4 + 1 + 4, press_statistic, 1e-15);
 }
 
+TEST_F(LinearRegressionTest, press_ridge_zero_lambda)
+{
+	Eigen::MatrixXd X(1, 3);
+	X << -1, 0, 1;
+	Eigen::VectorXd y(3);
+	y << 1, 0, 1;
+	const double press_statistic = press(X, y, [](Eigen::Ref<const Eigen::MatrixXd> train_X, Eigen::Ref<const Eigen::VectorXd> train_y) {
+		return ridge<true>(train_X, train_y, 0);
+		});
+	ASSERT_NEAR(4 + 1 + 4, press_statistic, 1e-15);
+}
+
+TEST_F(LinearRegressionTest, press_ridge)
+{
+	Eigen::MatrixXd X(1, 3);
+	X << -1, 0, 1;
+	Eigen::VectorXd y(3);
+	y << 1, 0, 1;
+	const double press_statistic = press(X, y, [](Eigen::Ref<const Eigen::MatrixXd> train_X, Eigen::Ref<const Eigen::VectorXd> train_y) {
+		return ridge<true>(train_X, train_y, 0);
+		});
+	ASSERT_NEAR(4 + 1 + 4, press_statistic, 1e-15);
+}
+
 TEST_F(LinearRegressionTest, press_univariate_with_intercept)
 {
 	Eigen::VectorXd x(3);

diff --git a/cppyml/linear_regression.cpp b/cppyml/linear_regression.cpp
@@ -102,6 +102,21 @@ namespace ml
 			}			
 		}
 
+		static double press_cppyml(Eigen::Ref<const MatrixXdR> X, Eigen::Ref<const Eigen::VectorXd> y, const char* regularisation, const double reg_lambda)
+		{
+			if (!strcmp(regularisation, "ridge")) {
+				return press(X.transpose(), y, [reg_lambda](Eigen::Ref<const Eigen::MatrixXd> train_X, Eigen::Ref<const Eigen::VectorXd> train_y) {
+					return ridge<true>(train_X, train_y, reg_lambda);
+					});				
+			} else if (!strcmp(regularisation, "none")) {
+				return press(X.transpose(), y, [](Eigen::Ref<const Eigen::MatrixXd> train_X, Eigen::Ref<const Eigen::VectorXd> train_y) {
+					return multivariate(train_X, train_y);
+					});
+			} else {
+				throw std::invalid_argument("Unknown regression type. Valid types: \"none\" or \"ridge\".");
+			}
+		}
+
 		static double press_univariate_cppyml(Eigen::Ref<const Eigen::VectorXd> x, Eigen::Ref<const Eigen::VectorXd> y, const bool with_intercept)
 		{
 			if (with_intercept) {
@@ -118,6 +133,8 @@ void init_linear_regression(py::module& m)
 {
 	auto m_lin_reg = m.def_submodule("linear_regression", "Linear regression algorithms.");
 
+	constexpr bool default_do_standardise = false;
+
 	py::class_<ml::LinearRegression::Result> result(m_lin_reg, "Result");
 	result.def_readonly("n", &ml::LinearRegression::Result::n, "Number of data points.")
 		.def_readonly("dof", &ml::LinearRegression::Result::dof, "Number of residual degrees of freedom (e.g. `n - 2` or `n - 1` for univariate regression with or without intercept).")
@@ -299,7 +316,7 @@ or set the parameter `add_ones` to `True`.
 )";
 
 	m_lin_reg.def("ridge", &ml::LinearRegression::ridge_row_major,
-		py::arg("X"), py::arg("y"), py::arg("lambda"), py::arg("do_standardise") = false,
+		py::arg("X"), py::arg("y"), py::arg("lambda"), py::arg("do_standardise") = default_do_standardise,
 		R"(Carries out multivariate ridge regression with intercept.
 
 Given X and y, finds beta and beta0 minimising \f$ \lVert \vec{y} - X^T \vec{\beta} \rVert^2 + \lambda \lVert \vec{\beta} \rVert^2 \f$.
@@ -310,13 +327,31 @@ The matrix `X` is assumed to be standardised unless `do_standardise` is set to `
 Args:
 	X: X matrix (shape N x D, with D <= N), with data points in rows.
 	y: Y vector with length N.
-	do_standardise: Whether to automatically subtract the mean from each row in `X` and divide it by its standard deviation.
+	do_standardise: Whether to automatically subtract the mean from each row in `X` and divide it by its standard deviation (defaults to False).
 
 Returns:
 	Instance of `RidgeRegressionResult`. If `do_standardise` was `True`, the `beta` vector will be rescaled and shifted
 	to original `X` units and origins, and the `cov` matrix will be transformed accordingly.
 )");
 
+	m_lin_reg.def("press", &ml::LinearRegression::press_cppyml,
+		py::arg("X"), py::arg("y"), py::arg("regularisation") = "none", py::arg("reg_lambda") = 0.,
+		R"(Calculates the PRESS statistic (Predicted Residual Error Sum of Squares).
+
+See https://en.wikipedia.org/wiki/PRESS_statistic for details.
+
+NOTE: Training data will be standardised internally if using regularisation.
+
+Args:
+	X: X matrix (shape N x D, with D <= N), with data points in rows. Unstandardised.
+	y: Y vector with length N.
+	regularisation: Type of regularisation: "none" or "ridge". Defaults to "none".
+	reg_lambda: Non-negative regularisation strength. Defaults to 0. Ignored if `regularisation == "none"`.
+
+Returns:
+	Value of the PRESS statistic.
+)");
+
 	m_lin_reg.def("press_univariate", &ml::LinearRegression::press_univariate_cppyml,
 		py::arg("x"), py::arg("y"), py::arg("with_intercept") = true,
 		R"(Calculates the PRESS statistic (Predicted Residual Error Sum of Squares) for univariate regression.

diff --git a/cppyml/tests/test_linear_regression.py b/cppyml/tests/test_linear_regression.py
@@ -293,6 +293,28 @@ def test_press_univariate_without_intercept(self):
         actual = linear_regression.press_univariate(x, y, False)
         self.assertAlmostEqual(8, actual, delta=1e-15)
 
+    def test_press_no_regularisation(self):
+        X = np.array([[-1, 1], [0, 1], [1, 1]])
+        y = np.array([1, 0, 1])
+        actual1 = linear_regression.press(X, y)
+        actual2 = linear_regression.press(X, y, "none")        
+        self.assertEqual(actual1, actual2)
+        self.assertAlmostEqual(9, actual1, delta=1e-15)
+
+    def test_press_ridge_zero_strength(self):
+        X = np.array([[-1], [0], [1]])
+        y = np.array([1, 0, 1])
+        actual1 = linear_regression.press(X, y, "ridge")
+        actual2 = linear_regression.press(X, y, "ridge", 0)
+        self.assertEqual(actual1, actual2)
+        self.assertAlmostEqual(9, actual1, delta=1e-15)
+
+    def test_press_ridge(self):
+        X = np.array([[-1], [0], [1]])
+        y = np.array([1, 0, 1])
+        actual = linear_regression.press(X, y, "ridge", 0.1)
+        self.assertGreater(9, actual)
+
 
 if __name__ == "__main__": 
     unittest.main()