From 1a02b821aaeffd61915cb509f8254671b8d52f6b Mon Sep 17 00:00:00 2001 From: Anthony Li <38049831+doccstat@users.noreply.github.com> Date: Sat, 30 Mar 2024 12:18:03 -0500 Subject: [PATCH] Improve CodeFactor --- src/fastcpd_class_cost.cc | 303 +------- ...tcpd_class.cc => fastcpd_class_private.cc} | 676 +++++++----------- src/fastcpd_class_public.cc | 455 ++++++++++++ src/fastcpd_classes.h | 96 ++- src/fastcpd_cost_wrappers.cc | 34 - 5 files changed, 785 insertions(+), 779 deletions(-) rename src/{fastcpd_class.cc => fastcpd_class_private.cc} (79%) create mode 100644 src/fastcpd_class_public.cc delete mode 100644 src/fastcpd_cost_wrappers.cc diff --git a/src/fastcpd_class_cost.cc b/src/fastcpd_class_cost.cc index 83479629..af8521b6 100644 --- a/src/fastcpd_class_cost.cc +++ b/src/fastcpd_class_cost.cc @@ -1,289 +1,48 @@ #include "fastcpd_classes.h" -#include "fastcpd_functions.h" - -using ::fastcpd::functions::negative_log_likelihood_arma; -using ::fastcpd::functions::negative_log_likelihood_glm; -using ::fastcpd::functions::negative_log_likelihood_lasso_cv; -using ::fastcpd::functions::negative_log_likelihood_lasso_wo_cv; -using ::fastcpd::functions::negative_log_likelihood_mean; -using ::fastcpd::functions::negative_log_likelihood_meanvariance; -using ::fastcpd::functions::negative_log_likelihood_mgaussian; -using ::fastcpd::functions::negative_log_likelihood_variance; namespace fastcpd::classes { -CostResult Fastcpd::negative_log_likelihood( - mat data, - Nullable theta, - double lambda, - bool cv, - Nullable start -) { - CostResult cost_result; - if (theta.isNull()) { - cost_result = negative_log_likelihood_wo_theta(data, lambda, cv, start); - } else { - cost_result = CostResult{ - {colvec()}, - {colvec()}, - negative_log_likelihood_wo_cv(data, as(theta), lambda) - }; - } - cost_result.value = adjust_cost_value(cost_result.value, data.n_rows); - return cost_result; +ColMat::operator colvec() const { + // TODO(doccstat): Add a warning if the matrix has more than one column. + return data.as_col(); } -CostResult Fastcpd::negative_log_likelihood_wo_theta( - mat data, - double lambda, - bool cv, - Nullable start -) { - CostResult cost_result; - if (family == "lasso" && cv) { - cost_result = negative_log_likelihood_lasso_cv(data); - } else if (family == "lasso" && !cv) { - cost_result = negative_log_likelihood_lasso_wo_cv(data, lambda); - } else if ( - family == "binomial" || family == "poisson" || family == "gaussian" - ) { - cost_result = negative_log_likelihood_glm(data, start, family); - } else if (family == "arma") { - cost_result = negative_log_likelihood_arma(data, order); - } else if (family == "mean") { - cost_result = negative_log_likelihood_mean(data, variance_estimate); - } else if (family == "variance") { - cost_result = negative_log_likelihood_variance(data, variance_data_mean); - } else if (family == "meanvariance" || family == "mv") { - cost_result = negative_log_likelihood_meanvariance(data, epsilon); - } else if (family == "mgaussian") { - cost_result = negative_log_likelihood_mgaussian( - data, p_response, variance_estimate - ); - } else { - // # nocov start - stop("This branch should not be reached at fastcpd_class_cost.cc: 193."); - // # nocov end - } - return cost_result; +ColMat::operator mat() const { + return data; } -double Fastcpd::negative_log_likelihood_wo_cv( +ColMat::operator rowvec() const { + // TODO(doccstat): Add a warning if the matrix has more than one column. + return data.as_col().t(); +} + +CostFunction::CostFunction(Function cost) : cost(cost) {} + +CostResult CostFunction::operator()( // # nocov mat data, - colvec theta, - double lambda + Nullable theta, + double lambda, // UNUSED + bool cv, // UNUSED + Nullable start // UNUSED ) { - vec y = data.col(0); - if (family == "lasso" || family == "gaussian") { - // Calculate negative log likelihood in gaussian family - double penalty = lambda * accu(abs(theta)); - mat x = data.cols(1, data.n_cols - 1); - return accu(square(y - x * theta)) / 2 + penalty; - } else if (family == "binomial") { - // Calculate negative log likelihood in binomial family - mat x = data.cols(1, data.n_cols - 1); - colvec u = x * theta; - return accu(-y % u + arma::log(1 + exp(u))); - } else if (family == "poisson") { - mat x = data.cols(1, data.n_cols - 1); - colvec u = x * theta; - colvec y_factorial(y.n_elem); - for (unsigned int i = 0; i < y.n_elem; i++) { - double log_factorial = 0; - for (int j = 1; j <= y(i); ++j) { - log_factorial += std::log(j); - } - y_factorial(i) = log_factorial; - } - - return accu(-y % u + exp(u) + y_factorial); - } else if (family == "arma") { - colvec reversed_theta = reverse(theta); - if (data.n_rows < max(order) + 1) { - return 0; - } - colvec variance_term = zeros(data.n_rows); - for (unsigned int i = max(order); i < data.n_rows; i++) { - variance_term(i) = data(i, 0) - dot( - reversed_theta.rows(order(1) + 1, sum(order)), - data.rows(i - order(0), i - 1) - ) - dot( - reversed_theta.rows(1, order(1)), - variance_term.rows(i - order(1), i - 1) - ); - } - return (std::log(2.0 * M_PI) + - std::log(theta(sum(order)))) * (data.n_rows - 2) / 2.0 + - dot(variance_term, variance_term) / 2.0 / theta(sum(order)); - } else { - // # nocov start - stop("This branch should not be reached at functions.cc: 248."); - // # nocov end - } + DEBUG_RCOUT(data.n_rows); + SEXP value = + theta.isNull() ? cost(data) : cost(data, as(theta)); // # nocov + return {{colvec()}, {colvec()}, as(value)}; // # nocov } -colvec Fastcpd::cost_update_gradient(mat data, colvec theta) { - rowvec new_data = data.row(data.n_rows - 1); - rowvec x = new_data.tail(new_data.n_elem - 1); - double y = new_data(0); - colvec gradient; - if (family.compare("binomial") == 0) { - gradient = - (y - 1 / (1 + exp(-as_scalar(x * theta)))) * x.t(); - } else if (family.compare("poisson") == 0) { - gradient = - (y - exp(as_scalar(x * theta))) * x.t(); - } else if (family == "lasso" || family == "gaussian") { - gradient = - (y - as_scalar(x * theta)) * x.t(); - } else if (family == "arma") { - mat reversed_data = reverse(data, 0); - colvec reversed_theta = reverse(theta); - if (data.n_rows < max(order) + 1) { - return ones(theta.n_elem); - } - colvec variance_term = zeros(data.n_rows); - for (unsigned int i = max(order); i < data.n_rows; i++) { - variance_term(i) = data(i, 0) - dot( - reversed_theta.rows(order(1) + 1, sum(order)), - data.rows(i - order(0), i - 1) - ) - dot( - reversed_theta.rows(1, order(1)), - variance_term.rows(i - order(1), i - 1) - ); - } - colvec reversed_variance_term = reverse(variance_term); - mat phi_coefficient = zeros(data.n_rows, order(0)), - psi_coefficient = zeros(data.n_rows, order(1)); - for (unsigned int i = max(order); i < data.n_rows; i++) { - phi_coefficient.row(i) = -reversed_data.rows( - data.n_rows - i, data.n_rows - i + order(0) - 1 - ).t() - reversed_theta.rows(1, order(1)).t() * - phi_coefficient.rows(i - order(1), i - 1); - } - for (unsigned int i = order(1); i < data.n_rows; i++) { - psi_coefficient.row(i) = -reversed_variance_term.rows( - data.n_rows - i, data.n_rows - i + order(1) - 1 - ).t() - reversed_theta.rows(1, order(1)).t() * - psi_coefficient.rows(i - order(1), i - 1); - } - gradient = zeros(sum(order) + 1); - gradient.rows(0, order(0) - 1) = phi_coefficient.row(data.n_rows - 1).t() * - variance_term(data.n_rows - 1) / theta(sum(order)); - gradient.rows(order(0), sum(order) - 1) = - psi_coefficient.row(data.n_rows - 1).t() * - variance_term(data.n_rows - 1) / theta(sum(order)); - gradient(sum(order)) = 1.0 / 2.0 / theta(sum(order)) - - std::pow(variance_term(data.n_rows - 1), 2) / 2.0 / - std::pow(theta(sum(order)), 2); - } else { - // # nocov start - stop("This branch should not be reached at functions.cc: 188."); - // # nocov end - } - return gradient; +CostGradient::CostGradient(Function cost_gradient) : + cost_gradient(cost_gradient) {} + +colvec CostGradient::operator()(mat data, colvec theta) { + return as(cost_gradient(data, theta)); } -mat Fastcpd::cost_update_hessian(mat data, colvec theta) { - rowvec new_data = data.row(data.n_rows - 1); - rowvec x = new_data.tail(new_data.n_elem - 1); - mat hessian; - if (family.compare("binomial") == 0) { - double prob = 1 / (1 + exp(-as_scalar(x * theta))); - hessian = (x.t() * x) * as_scalar((1 - prob) * prob); - } else if (family.compare("poisson") == 0) { - double prob = exp(as_scalar(x * theta)); - // Prevent numerical issues if `prob` is too large. - hessian = (x.t() * x) * std::min(as_scalar(prob), 1e10); - } else if (family == "lasso" || family == "gaussian") { - hessian = x.t() * x; - } else if (family == "arma") { - // TODO(doccstat): Maybe we can store all these computations - mat reversed_data = reverse(data, 0); - colvec reversed_theta = reverse(theta); - if (data.n_rows < max(order) + 1) { - return eye(theta.n_elem, theta.n_elem); - } - colvec variance_term = zeros(data.n_rows); - for (unsigned int i = max(order); i < data.n_rows; i++) { - variance_term(i) = data(i, 0) - dot( - reversed_theta.rows(order(1) + 1, sum(order)), - data.rows(i - order(0), i - 1) - ) - dot( - reversed_theta.rows(1, order(1)), - variance_term.rows(i - order(1), i - 1) - ); - } - colvec reversed_variance_term = reverse(variance_term); - mat phi_coefficient = zeros(data.n_rows, order(0)), - psi_coefficient = zeros(data.n_rows, order(1)); - for (unsigned int i = max(order); i < data.n_rows; i++) { - phi_coefficient.row(i) = -reversed_data.rows( - data.n_rows - i, data.n_rows - i + order(0) - 1 - ).t() - reversed_theta.rows(1, order(1)).t() * - phi_coefficient.rows(i - order(1), i - 1); - } - for (unsigned int i = order(1); i < data.n_rows; i++) { - psi_coefficient.row(i) = -reversed_variance_term.rows( - data.n_rows - i, data.n_rows - i + order(1) - 1 - ).t() - reversed_theta.rows(1, order(1)).t() * - psi_coefficient.rows(i - order(1), i - 1); - } - mat reversed_coef_phi = reverse(phi_coefficient, 0), - reversed_coef_psi = reverse(psi_coefficient, 0); - cube phi_psi_coefficient = zeros(order(1), order(0), data.n_rows), - psi_psi_coefficient = zeros(order(1), order(1), data.n_rows); - for (unsigned int i = order(1); i < data.n_rows; i++) { - mat phi_psi_coefficient_part = zeros(order(1), order(0)), - psi_psi_coefficient_part = zeros(order(1), order(1)); - for (unsigned int j = 1; j <= order(1); j++) { - phi_psi_coefficient_part += - phi_psi_coefficient.slice(i - j) * theta(order(0) - 1 + j); - } - phi_psi_coefficient.slice(i) = -reversed_coef_phi.rows( - data.n_rows - i, data.n_rows - i + order(1) - 1 - ) - phi_psi_coefficient_part; - for (unsigned int j = 1; j <= order(1); j++) { - psi_psi_coefficient_part += - psi_psi_coefficient.slice(i - j) * theta(order(0) - 1 + j); - } - psi_psi_coefficient.slice(i) = -reversed_coef_psi.rows( - data.n_rows - i, data.n_rows - i + order(1) - 1 - ) - reversed_coef_psi.rows( - data.n_rows - i, data.n_rows - i + order(1) - 1 - ).t() - psi_psi_coefficient_part; - } - hessian = zeros(sum(order) + 1, sum(order) + 1); - hessian.submat(0, 0, order(0) - 1, order(0) - 1) = - phi_coefficient.row(data.n_rows - 1).t() * - phi_coefficient.row(data.n_rows - 1) / theta(sum(order)); - hessian.submat(0, order(0), order(0) - 1, sum(order) - 1) = ( - phi_psi_coefficient.slice(data.n_rows - 1).t() * - variance_term(data.n_rows - 1) + - phi_coefficient.row(data.n_rows - 1).t() * - psi_coefficient.row(data.n_rows - 1) - ) / theta(sum(order)); - hessian.submat(order(0), 0, sum(order) - 1, order(0) - 1) = - hessian.submat(0, order(0), order(0) - 1, sum(order) - 1).t(); - hessian.submat(0, sum(order), order(0) - 1, sum(order)) = - -phi_coefficient.row(data.n_rows - 1).t() * - variance_term(data.n_rows - 1) / theta(sum(order)) / theta(sum(order)); - hessian.submat(sum(order), 0, sum(order), order(0) - 1) = - hessian.submat(0, sum(order), order(0) - 1, sum(order)).t(); - hessian.submat(order(0), order(0), sum(order) - 1, sum(order) - 1) = ( - psi_coefficient.row(data.n_rows - 1).t() * - psi_coefficient.row(data.n_rows - 1) + - psi_psi_coefficient.slice(data.n_rows - 1) * - variance_term(data.n_rows - 1) - ) / theta(sum(order)); - hessian.submat(order(0), sum(order), sum(order) - 1, sum(order)) = - -psi_coefficient.row(data.n_rows - 1).t() * - variance_term(data.n_rows - 1) / theta(sum(order)) / theta(sum(order)); - hessian.submat(sum(order), order(0), sum(order), sum(order) - 1) = - hessian.submat(order(0), sum(order), sum(order) - 1, sum(order)).t(); - hessian(sum(order), sum(order)) = - std::pow(variance_term(data.n_rows - 1), 2) / - std::pow(theta(sum(order)), 3) - - 1.0 / 2.0 / std::pow(theta(sum(order)), 2); - } - return hessian; +CostHessian::CostHessian(Function cost_hessian) : + cost_hessian(cost_hessian) {} + +mat CostHessian::operator()(mat data, colvec theta) { + return as(cost_hessian(data, theta)); } } // namespace fastcpd::classes diff --git a/src/fastcpd_class.cc b/src/fastcpd_class_private.cc similarity index 79% rename from src/fastcpd_class.cc rename to src/fastcpd_class_private.cc index b0e413af..5dd51dd6 100644 --- a/src/fastcpd_class.cc +++ b/src/fastcpd_class_private.cc @@ -4,153 +4,11 @@ namespace fastcpd::classes { -Fastcpd::Fastcpd( - const double beta, - const double convexity_coef, - Nullable cost, - const string cost_adjustment, - Nullable cost_gradient, - Nullable cost_hessian, - const bool cp_only, - mat data, - const double epsilon, - const string family, - Nullable k, - colvec line_search, - const colvec lower, - const double momentum_coef, - const colvec order, - const int p, - const unsigned int p_response, - const bool r_progress, - const int segment_count, - const double trim, - const colvec upper, - const double vanilla_percentage, - const mat variance_estimate, - const bool warm_start -) : beta(beta), - convexity_coef(convexity_coef), - cost(cost), - cost_adjustment(cost_adjustment), - cost_gradient(cost_gradient), - cost_hessian(cost_hessian), - cp_only(cp_only), - data(data), - epsilon(epsilon), - family(family), - k(k), - line_search(line_search), - lower(lower), - momentum_coef(momentum_coef), - order(order), - p(p), - p_response(p_response), - r_progress(r_progress), - segment_count(segment_count), - trim(trim), - upper(upper), - vanilla_percentage(vanilla_percentage), - variance_estimate(variance_estimate), - warm_start(warm_start) { - n = data.n_rows; - segment_indices = vec(n); - segment_theta_hat = mat(segment_count, p); - err_sd = vec(segment_count); - act_num = vec(segment_count); - start = zeros(p, n); - theta_hat = mat(p, 1); - theta_sum = mat(p, 1); - hessian = cube(p, p, 1); - momentum = vec(p); - variance_data_mean = mean(data, 0); - - create_cost_function_wrapper(cost); - create_cost_gradient_wrapper(cost_gradient); - create_cost_hessian_wrapper(cost_hessian); - - // TODO(doccstat): Store environment functions from R. -} - -double Fastcpd::adjust_cost_value( - double value, - const unsigned int nrows -) { - if (cost_adjustment == "MDL") { - value = value * std::log2(M_E); - } - return value + get_cost_adjustment_value(nrows); -} - -double Fastcpd::get_cost_adjustment_value(const unsigned nrows) { - double adjusted = 0; - if (cost_adjustment == "MBIC" || cost_adjustment == "MDL") { - adjusted = data.n_cols * std::log(nrows) / 2; - } - if (cost_adjustment == "MDL") { - adjusted *= std::log2(M_E); - } - return adjusted; -} - -CostResult Fastcpd::get_optimized_cost(const mat data_segment) { - Function cost_ = cost.get(); - CostResult cost_result; - if (cost_gradient.isNull() && cost_hessian.isNull()) { - cost_result = {{colvec()}, {colvec()}, as(cost_(data_segment))}; - } else if (p == 1) { - Environment stats = Environment::namespace_env("stats"); - Function optim = stats["optim"]; - List optim_result = optim( - Named("par") = 0, - Named("fn") = InternalFunction( - +[](double theta, mat data, Function cost_) { - return cost_( - Named("data") = data, - Named("theta") = std::log(theta / (1 - theta)) - ); - } - ), - Named("method") = "Brent", - Named("lower") = 0, - Named("upper") = 1, - Named("data") = data_segment, - Named("cost") = cost_ - ); - colvec par = as(optim_result["par"]); - double value = as(optim_result["value"]); - cost_result = - {{log(par / (1 - par))}, {colvec()}, exp(value) / (1 + exp(value))}; - } else if (p > 1) { - Environment stats = Environment::namespace_env("stats"); - Function optim = stats["optim"]; - List optim_result = optim( - Named("par") = zeros(p), - Named("fn") = cost_, - Named("method") = "L-BFGS-B", - Named("data") = data_segment, - Named("lower") = lower, - Named("upper") = upper - ); - cost_result = - {{as(optim_result["par"])}, {colvec()}, optim_result["value"]}; - } else { - // # nocov start - stop("This branch should not be reached at classes.cc: 945."); - // # nocov end - } - return cost_result; -} - -mat Fastcpd::get_theta_sum() { - return theta_sum; -} - void Fastcpd::create_cost_function_wrapper(Nullable cost) { DEBUG_RCOUT(family); if (contain(FASTCPD_FAMILIES, family)) { cost_function_wrapper = std::bind( // # nocov start - &Fastcpd::negative_log_likelihood, // # nocov end + &Fastcpd::get_cost_result, // # nocov end this, std::placeholders::_1, std::placeholders::_2, @@ -290,62 +148,112 @@ void Fastcpd::create_segment_statistics() { } } -void Fastcpd::create_theta_sum( - const unsigned int col, colvec new_theta_sum -) { - theta_sum.col(col) = new_theta_sum; -} - -void Fastcpd::update_err_sd( - const unsigned int segment_index, const double err_var -) { - err_sd(segment_index) = sqrt(err_var); +double Fastcpd::get_cost_adjustment_value(const unsigned nrows) { + double adjusted = 0; + if (cost_adjustment == "MBIC" || cost_adjustment == "MDL") { + adjusted = data.n_cols * std::log(nrows) / 2; + } + if (cost_adjustment == "MDL") { + adjusted *= std::log2(M_E); + } + return adjusted; } -void Fastcpd::update_hessian( - const unsigned int slice, mat new_hessian +CostResult Fastcpd::get_cost_result( + mat data, + Nullable theta, + double lambda, + bool cv, + Nullable start ) { - hessian.slice(slice) = new_hessian; -} - -void Fastcpd::update_hessian(mat new_hessian) { - hessian = join_slices(hessian, new_hessian); + CostResult cost_result; + if (theta.isNull()) { + cost_result = negative_log_likelihood_wo_theta(data, lambda, cv, start); + } else { + cost_result = CostResult{ + {colvec()}, + {colvec()}, + negative_log_likelihood_wo_cv(data, as(theta), lambda) + }; + } + cost_result.value = update_cost_value(cost_result.value, data.n_rows); + return cost_result; } -void Fastcpd::update_hessian(ucolvec pruned_left) { - hessian = hessian.slices(pruned_left); -} +List Fastcpd::get_cp_set(const colvec raw_cp_set, const double lambda) { + colvec cp_set = update_cp_set(raw_cp_set); -void Fastcpd::update_start(const unsigned int col, const colvec start_col) { - start.col(col) = start_col; -} + if (cp_only) { + return List::create( + Named("raw_cp_set") = raw_cp_set, + Named("cp_set") = cp_set, + Named("cost_values") = R_NilValue, + Named("residual") = R_NilValue, + Named("thetas") = R_NilValue + ); + } -void Fastcpd::update_theta_hat(colvec new_theta_hat) { - theta_hat = join_rows(theta_hat, new_theta_hat); -} + colvec cp_loc_ = zeros(cp_set.n_elem + 2); + if (cp_set.n_elem) { + cp_loc_.rows(1, cp_loc_.n_elem - 2) = cp_set; + } + cp_loc_(cp_loc_.n_elem - 1) = n; + colvec cp_loc = unique(std::move(cp_loc_)); + colvec cost_values = zeros(cp_loc.n_elem - 1); + mat thetas = zeros(p, cp_loc.n_elem - 1); + mat residual; + if ( + family == "mean" || family == "variance" || + family == "meanvariance" || family == "mv" + ) { + residual = zeros(data.n_rows, data.n_cols); + } else if (family == "mgaussian") { + residual = zeros(data.n_rows, p_response); + } else { + residual = zeros(data.n_rows, 1); + } + unsigned int residual_next_start = 0; -void Fastcpd::update_theta_hat( - const unsigned int col, colvec new_theta_hat -) { - theta_hat.col(col) = new_theta_hat; -} + for (unsigned int i = 0; i < cp_loc.n_elem - 1; i++) { + colvec segment_data_index_ = + linspace(cp_loc(i), cp_loc(i + 1) - 1, cp_loc(i + 1) - cp_loc(i)); + ucolvec segment_data_index = + arma::conv_to::from(std::move(segment_data_index_)); -void Fastcpd::update_theta_sum( - const unsigned int col, colvec new_theta_sum -) { - theta_sum.col(col) += new_theta_sum; -} + mat data_segment = data.rows(segment_data_index); + CostResult cost_result; + if (!contain(FASTCPD_FAMILIES, family)) { + cost_result = get_optimized_cost(data_segment); + } else { + cost_result = cost_function_wrapper( + data_segment, R_NilValue, lambda, false, R_NilValue + ); + } -void Fastcpd::update_theta_sum(colvec new_theta_sum) { - theta_sum = join_rows(theta_sum, new_theta_sum); -} + cost_values(i) = cost_result.value; -void Fastcpd::update_theta_hat(ucolvec pruned_left) { - theta_hat = theta_hat.cols(pruned_left); -} + // Parameters are not involved for PELT. + if (vanilla_percentage < 1) { + thetas.col(i) = colvec(cost_result.par); + } -void Fastcpd::update_theta_sum(ucolvec pruned_left) { - theta_sum = theta_sum.cols(pruned_left); + // Residual is only calculated for built-in families. + if (contain(FASTCPD_FAMILIES, family)) { + mat cost_optim_residual = cost_result.residuals; + residual.rows( + residual_next_start, + residual_next_start + cost_optim_residual.n_rows - 1 + ) = cost_optim_residual; + residual_next_start += cost_optim_residual.n_rows; + } + } + return List::create( + Named("raw_cp_set") = raw_cp_set, + Named("cp_set") = cp_set, + Named("cost_values") = cost_values, + Named("residual") = residual, + Named("thetas") = thetas + ); } double Fastcpd::get_cval_for_r_t_set( @@ -432,181 +340,53 @@ double Fastcpd::get_cval_sen( return cval; } -List Fastcpd::process_cp_set(const colvec raw_cp_set, const double lambda) { - colvec cp_set = trim_cp_set(raw_cp_set); - - if (cp_only) { - return List::create( - Named("raw_cp_set") = raw_cp_set, - Named("cp_set") = cp_set, - Named("cost_values") = R_NilValue, - Named("residual") = R_NilValue, - Named("thetas") = R_NilValue +CostResult Fastcpd::get_optimized_cost(const mat data_segment) { + Function cost_ = cost.get(); + CostResult cost_result; + if (cost_gradient.isNull() && cost_hessian.isNull()) { + cost_result = {{colvec()}, {colvec()}, as(cost_(data_segment))}; + } else if (p == 1) { + Environment stats = Environment::namespace_env("stats"); + Function optim = stats["optim"]; + List optim_result = optim( + Named("par") = 0, + Named("fn") = InternalFunction( + +[](double theta, mat data, Function cost_) { + return cost_( + Named("data") = data, + Named("theta") = std::log(theta / (1 - theta)) + ); + } + ), + Named("method") = "Brent", + Named("lower") = 0, + Named("upper") = 1, + Named("data") = data_segment, + Named("cost") = cost_ ); - } - - colvec cp_loc_ = zeros(cp_set.n_elem + 2); - if (cp_set.n_elem) { - cp_loc_.rows(1, cp_loc_.n_elem - 2) = cp_set; - } - cp_loc_(cp_loc_.n_elem - 1) = n; - colvec cp_loc = unique(std::move(cp_loc_)); - colvec cost_values = zeros(cp_loc.n_elem - 1); - mat thetas = zeros(p, cp_loc.n_elem - 1); - mat residual; - if ( - family == "mean" || family == "variance" || - family == "meanvariance" || family == "mv" - ) { - residual = zeros(data.n_rows, data.n_cols); - } else if (family == "mgaussian") { - residual = zeros(data.n_rows, p_response); + colvec par = as(optim_result["par"]); + double value = as(optim_result["value"]); + cost_result = + {{log(par / (1 - par))}, {colvec()}, exp(value) / (1 + exp(value))}; + } else if (p > 1) { + Environment stats = Environment::namespace_env("stats"); + Function optim = stats["optim"]; + List optim_result = optim( + Named("par") = zeros(p), + Named("fn") = cost_, + Named("method") = "L-BFGS-B", + Named("data") = data_segment, + Named("lower") = lower, + Named("upper") = upper + ); + cost_result = + {{as(optim_result["par"])}, {colvec()}, optim_result["value"]}; } else { - residual = zeros(data.n_rows, 1); - } - unsigned int residual_next_start = 0; - - for (unsigned int i = 0; i < cp_loc.n_elem - 1; i++) { - colvec segment_data_index_ = - linspace(cp_loc(i), cp_loc(i + 1) - 1, cp_loc(i + 1) - cp_loc(i)); - ucolvec segment_data_index = - arma::conv_to::from(std::move(segment_data_index_)); - - mat data_segment = data.rows(segment_data_index); - CostResult cost_result; - if (!contain(FASTCPD_FAMILIES, family)) { - cost_result = get_optimized_cost(data_segment); - } else { - cost_result = cost_function_wrapper( - data_segment, R_NilValue, lambda, false, R_NilValue - ); - } - - cost_values(i) = cost_result.value; - - // Parameters are not involved for PELT. - if (vanilla_percentage < 1) { - thetas.col(i) = colvec(cost_result.par); - } - - // Residual is only calculated for built-in families. - if (contain(FASTCPD_FAMILIES, family)) { - mat cost_optim_residual = cost_result.residuals; - residual.rows( - residual_next_start, - residual_next_start + cost_optim_residual.n_rows - 1 - ) = cost_optim_residual; - residual_next_start += cost_optim_residual.n_rows; - } - } - return List::create( - Named("raw_cp_set") = raw_cp_set, - Named("cp_set") = cp_set, - Named("cost_values") = cost_values, - Named("residual") = residual, - Named("thetas") = thetas - ); -} - -List Fastcpd::run() { - // Set up the initial values. - double lambda = 0; - - ucolvec r_t_set = {0}; - DEBUG_RCOUT(r_t_set); - - std::vector cp_sets = {zeros(0)}; - linspace(1, n, n).for_each([&](int i) { - cp_sets.push_back(zeros(1)); - }); - - colvec fvec = zeros(n + 1); - fvec.fill(arma::datum::inf); - fvec(0) = -beta; - DEBUG_RCOUT(fvec(0)); - - RProgress::RProgress rProgress("[:bar] :current/:total in :elapsed", n); - - if (r_progress) { - rProgress.tick(0); - } - - if (contain(FASTCPD_FAMILIES, family) || vanilla_percentage < 1) { - create_segment_indices(); - create_segment_statistics(); - } - - DEBUG_RCOUT(n); - - if (vanilla_percentage < 1) { - create_gradients(); - } - - checkUserInterrupt(); - if (r_progress) { - rProgress.tick(); - } - - for (int t = 1; t <= n; t++) { - DEBUG_RCOUT(t); - unsigned int r_t_count = r_t_set.n_elem; - DEBUG_RCOUT(r_t_count); - - // Number of cost values is the same as the number of elements in R_t. - colvec cval = zeros(r_t_count); - - // For tau in R_t \ {t-1}. - for (unsigned int i = 1; i < r_t_count; i++) { - cval(i - 1) = get_cval_for_r_t_set(r_t_set, i, t, lambda); - } - - DEBUG_RCOUT(cval); - cval(r_t_count - 1) = 0; - - if (vanilla_percentage != 1) { - update_fastcpd_parameters(t); - } - - // `beta` adjustment seems to work but there might be better choices. - colvec obj = cval + fvec.rows(r_t_set) + beta; - double min_obj = min(obj); - double tau_star = r_t_set(index_min(obj)); - - // Step 4 - cp_sets[t] = join_cols(cp_sets[tau_star], colvec{tau_star}); - DEBUG_RCOUT(cp_sets[t]); - - // Pruning step. - ucolvec pruned_left = - find(cval + fvec.rows(r_t_set) + convexity_coef <= min_obj); - DEBUG_RCOUT(pruned_left); - ucolvec pruned_r_t_set = zeros(pruned_left.n_elem + 1); - DEBUG_RCOUT(pruned_r_t_set); - if (pruned_left.n_elem) { - pruned_r_t_set.rows(0, pruned_left.n_elem - 1) = r_t_set(pruned_left); - } - DEBUG_RCOUT(pruned_r_t_set); - pruned_r_t_set(pruned_left.n_elem) = t; - r_t_set = std::move(pruned_r_t_set); - DEBUG_RCOUT(r_t_set); - - if (vanilla_percentage != 1) { - update_theta_hat(pruned_left); - update_theta_sum(pruned_left); - update_hessian(pruned_left); - } - - // Objective function F(t). - fvec(t) = min_obj; - DEBUG_RCOUT(fvec.rows(0, t)); - - checkUserInterrupt(); - if (r_progress) { - rProgress.tick(); - } + // # nocov start + stop("This branch should not be reached at classes.cc: 945."); + // # nocov end } - - return process_cp_set(cp_sets[n], lambda); + return cost_result; } void Fastcpd::update_cost_parameters( @@ -627,67 +407,6 @@ void Fastcpd::update_cost_parameters( update_momentum(as(cost_update_result[3])); } -colvec Fastcpd::trim_cp_set(const colvec raw_cp_set) { - // Remove change points close to the boundaries. - colvec cp_set = raw_cp_set; - cp_set = cp_set(find(cp_set > trim * n)); - cp_set = cp_set(find(cp_set < (1 - trim) * n)); - colvec cp_set_ = zeros(cp_set.n_elem + 1); - if (cp_set.n_elem) { - cp_set_.rows(1, cp_set_.n_elem - 1) = std::move(cp_set); - } - cp_set = sort(unique(std::move(cp_set_))); - - // Remove change points close to each other. - ucolvec cp_set_too_close = find(diff(cp_set) <= trim * n); - if (cp_set_too_close.n_elem > 0) { - int rest_element_count = cp_set.n_elem - cp_set_too_close.n_elem; - colvec cp_set_rest_left = zeros(rest_element_count), - cp_set_rest_right = zeros(rest_element_count); - for (unsigned int i = 0, i_left = 0, i_right = 0; i < cp_set.n_elem; i++) { - if ( - ucolvec left_find = find(cp_set_too_close == i); - left_find.n_elem == 0 - ) { - cp_set_rest_left(i_left) = cp_set(i); - i_left++; - } - if ( - ucolvec right_find = find(cp_set_too_close == i - 1); - right_find.n_elem == 0 - ) { - cp_set_rest_right(i_right) = cp_set(i); - i_right++; - } - } - cp_set = floor((cp_set_rest_left + cp_set_rest_right) / 2); - } - return cp_set(find(cp_set > 0)); -} - -List Fastcpd::update_cost_parameters_steps( - const mat data, - const int tau, - const int i, - Function k, - colvec momentum, - const double lambda, - colvec line_search -) { - update_cost_parameters_step(data, i, 0, data.n_rows - 1, lambda, line_search); - - for (int kk = 1; kk <= as(k(data.n_rows - tau)); kk++) { - for (unsigned j = tau + 1; j <= data.n_rows; j++) { - update_cost_parameters_step(data, i, tau, j - 1, lambda, line_search); - } - } - - theta_sum.col(i - 1) += theta_hat.col(i - 1); - return List::create( - theta_hat.col(i - 1), theta_sum.col(i - 1), hessian.slice(i - 1), momentum - ); -} - void Fastcpd::update_cost_parameters_step( const mat data, const int i, @@ -769,6 +488,83 @@ void Fastcpd::update_cost_parameters_step( hessian.slice(i - 1) = std::move(hessian_i); } +List Fastcpd::update_cost_parameters_steps( + const mat data, + const int tau, + const int i, + Function k, + colvec momentum, + const double lambda, + colvec line_search +) { + update_cost_parameters_step(data, i, 0, data.n_rows - 1, lambda, line_search); + + for (int kk = 1; kk <= as(k(data.n_rows - tau)); kk++) { + for (unsigned j = tau + 1; j <= data.n_rows; j++) { + update_cost_parameters_step(data, i, tau, j - 1, lambda, line_search); + } + } + + theta_sum.col(i - 1) += theta_hat.col(i - 1); + return List::create( + theta_hat.col(i - 1), theta_sum.col(i - 1), hessian.slice(i - 1), momentum + ); +} + +double Fastcpd::update_cost_value( + double value, + const unsigned int nrows +) { + if (cost_adjustment == "MDL") { + value = value * std::log2(M_E); + } + return value + get_cost_adjustment_value(nrows); +} + +colvec Fastcpd::update_cp_set(const colvec raw_cp_set) { + // Remove change points close to the boundaries. + colvec cp_set = raw_cp_set; + cp_set = cp_set(find(cp_set > trim * n)); + cp_set = cp_set(find(cp_set < (1 - trim) * n)); + colvec cp_set_ = zeros(cp_set.n_elem + 1); + if (cp_set.n_elem) { + cp_set_.rows(1, cp_set_.n_elem - 1) = std::move(cp_set); + } + cp_set = sort(unique(std::move(cp_set_))); + + // Remove change points close to each other. + ucolvec cp_set_too_close = find(diff(cp_set) <= trim * n); + if (cp_set_too_close.n_elem > 0) { + int rest_element_count = cp_set.n_elem - cp_set_too_close.n_elem; + colvec cp_set_rest_left = zeros(rest_element_count), + cp_set_rest_right = zeros(rest_element_count); + for (unsigned int i = 0, i_left = 0, i_right = 0; i < cp_set.n_elem; i++) { + if ( + ucolvec left_find = find(cp_set_too_close == i); + left_find.n_elem == 0 + ) { + cp_set_rest_left(i_left) = cp_set(i); + i_left++; + } + if ( + ucolvec right_find = find(cp_set_too_close == i - 1); + right_find.n_elem == 0 + ) { + cp_set_rest_right(i_right) = cp_set(i); + i_right++; + } + } + cp_set = floor((cp_set_rest_left + cp_set_rest_right) / 2); + } + return cp_set(find(cp_set > 0)); +} + +void Fastcpd::update_err_sd( + const unsigned int segment_index, const double err_var +) { + err_sd(segment_index) = sqrt(err_var); +} + void Fastcpd::update_fastcpd_parameters(const unsigned int t) { // for tau = t-1 rowvec new_data = data.row(t - 1).tail(data.n_cols - 1); @@ -798,8 +594,48 @@ void Fastcpd::update_fastcpd_parameters(const unsigned int t) { update_hessian(hessian_new); } +void Fastcpd::update_hessian( + const unsigned int slice, mat new_hessian +) { + hessian.slice(slice) = new_hessian; +} + +void Fastcpd::update_hessian(mat new_hessian) { + hessian = join_slices(hessian, new_hessian); +} + +void Fastcpd::update_hessian(ucolvec pruned_left) { + hessian = hessian.slices(pruned_left); +} + void Fastcpd::update_momentum(colvec new_momentum) { momentum = new_momentum; } +void Fastcpd::update_start(const unsigned int col, const colvec start_col) { + start.col(col) = start_col; +} + +void Fastcpd::update_theta_hat(colvec new_theta_hat) { + theta_hat = join_rows(theta_hat, new_theta_hat); +} + +void Fastcpd::update_theta_hat( + const unsigned int col, colvec new_theta_hat +) { + theta_hat.col(col) = new_theta_hat; +} + +void Fastcpd::update_theta_hat(ucolvec pruned_left) { + theta_hat = theta_hat.cols(pruned_left); +} + +void Fastcpd::update_theta_sum(colvec new_theta_sum) { + theta_sum = join_rows(theta_sum, new_theta_sum); +} + +void Fastcpd::update_theta_sum(ucolvec pruned_left) { + theta_sum = theta_sum.cols(pruned_left); +} + } // namespace fastcpd::classes diff --git a/src/fastcpd_class_public.cc b/src/fastcpd_class_public.cc new file mode 100644 index 00000000..1added37 --- /dev/null +++ b/src/fastcpd_class_public.cc @@ -0,0 +1,455 @@ +#include "fastcpd_classes.h" +#include "fastcpd_constants.h" +#include "fastcpd_functions.h" +#include "RProgress.h" + +using ::fastcpd::functions::negative_log_likelihood_arma; +using ::fastcpd::functions::negative_log_likelihood_glm; +using ::fastcpd::functions::negative_log_likelihood_lasso_cv; +using ::fastcpd::functions::negative_log_likelihood_lasso_wo_cv; +using ::fastcpd::functions::negative_log_likelihood_mean; +using ::fastcpd::functions::negative_log_likelihood_meanvariance; +using ::fastcpd::functions::negative_log_likelihood_mgaussian; +using ::fastcpd::functions::negative_log_likelihood_variance; + +namespace fastcpd::classes { + +Fastcpd::Fastcpd( + const double beta, + const double convexity_coef, + Nullable cost, + const string cost_adjustment, + Nullable cost_gradient, + Nullable cost_hessian, + const bool cp_only, + mat data, + const double epsilon, + const string family, + Nullable k, + colvec line_search, + const colvec lower, + const double momentum_coef, + const colvec order, + const int p, + const unsigned int p_response, + const bool r_progress, + const int segment_count, + const double trim, + const colvec upper, + const double vanilla_percentage, + const mat variance_estimate, + const bool warm_start +) : beta(beta), + convexity_coef(convexity_coef), + cost(cost), + cost_adjustment(cost_adjustment), + cost_gradient(cost_gradient), + cost_hessian(cost_hessian), + cp_only(cp_only), + data(data), + epsilon(epsilon), + family(family), + k(k), + line_search(line_search), + lower(lower), + momentum_coef(momentum_coef), + order(order), + p(p), + p_response(p_response), + r_progress(r_progress), + segment_count(segment_count), + trim(trim), + upper(upper), + vanilla_percentage(vanilla_percentage), + variance_estimate(variance_estimate), + warm_start(warm_start) { + n = data.n_rows; + segment_indices = vec(n); + segment_theta_hat = mat(segment_count, p); + err_sd = vec(segment_count); + act_num = vec(segment_count); + start = zeros(p, n); + theta_hat = mat(p, 1); + theta_sum = mat(p, 1); + hessian = cube(p, p, 1); + momentum = vec(p); + variance_data_mean = mean(data, 0); + + create_cost_function_wrapper(cost); + create_cost_gradient_wrapper(cost_gradient); + create_cost_hessian_wrapper(cost_hessian); + + // TODO(doccstat): Store environment functions from R. +} + +colvec Fastcpd::cost_update_gradient(mat data, colvec theta) { + rowvec new_data = data.row(data.n_rows - 1); + rowvec x = new_data.tail(new_data.n_elem - 1); + double y = new_data(0); + colvec gradient; + if (family.compare("binomial") == 0) { + gradient = - (y - 1 / (1 + exp(-as_scalar(x * theta)))) * x.t(); + } else if (family.compare("poisson") == 0) { + gradient = - (y - exp(as_scalar(x * theta))) * x.t(); + } else if (family == "lasso" || family == "gaussian") { + gradient = - (y - as_scalar(x * theta)) * x.t(); + } else if (family == "arma") { + mat reversed_data = reverse(data, 0); + colvec reversed_theta = reverse(theta); + if (data.n_rows < max(order) + 1) { + return ones(theta.n_elem); + } + colvec variance_term = zeros(data.n_rows); + for (unsigned int i = max(order); i < data.n_rows; i++) { + variance_term(i) = data(i, 0) - dot( + reversed_theta.rows(order(1) + 1, sum(order)), + data.rows(i - order(0), i - 1) + ) - dot( + reversed_theta.rows(1, order(1)), + variance_term.rows(i - order(1), i - 1) + ); + } + colvec reversed_variance_term = reverse(variance_term); + mat phi_coefficient = zeros(data.n_rows, order(0)), + psi_coefficient = zeros(data.n_rows, order(1)); + for (unsigned int i = max(order); i < data.n_rows; i++) { + phi_coefficient.row(i) = -reversed_data.rows( + data.n_rows - i, data.n_rows - i + order(0) - 1 + ).t() - reversed_theta.rows(1, order(1)).t() * + phi_coefficient.rows(i - order(1), i - 1); + } + for (unsigned int i = order(1); i < data.n_rows; i++) { + psi_coefficient.row(i) = -reversed_variance_term.rows( + data.n_rows - i, data.n_rows - i + order(1) - 1 + ).t() - reversed_theta.rows(1, order(1)).t() * + psi_coefficient.rows(i - order(1), i - 1); + } + gradient = zeros(sum(order) + 1); + gradient.rows(0, order(0) - 1) = phi_coefficient.row(data.n_rows - 1).t() * + variance_term(data.n_rows - 1) / theta(sum(order)); + gradient.rows(order(0), sum(order) - 1) = + psi_coefficient.row(data.n_rows - 1).t() * + variance_term(data.n_rows - 1) / theta(sum(order)); + gradient(sum(order)) = 1.0 / 2.0 / theta(sum(order)) - + std::pow(variance_term(data.n_rows - 1), 2) / 2.0 / + std::pow(theta(sum(order)), 2); + } else { + // # nocov start + stop("This branch should not be reached at functions.cc: 188."); + // # nocov end + } + return gradient; +} + +mat Fastcpd::cost_update_hessian(mat data, colvec theta) { + rowvec new_data = data.row(data.n_rows - 1); + rowvec x = new_data.tail(new_data.n_elem - 1); + mat hessian; + if (family.compare("binomial") == 0) { + double prob = 1 / (1 + exp(-as_scalar(x * theta))); + hessian = (x.t() * x) * as_scalar((1 - prob) * prob); + } else if (family.compare("poisson") == 0) { + double prob = exp(as_scalar(x * theta)); + // Prevent numerical issues if `prob` is too large. + hessian = (x.t() * x) * std::min(as_scalar(prob), 1e10); + } else if (family == "lasso" || family == "gaussian") { + hessian = x.t() * x; + } else if (family == "arma") { + // TODO(doccstat): Maybe we can store all these computations + mat reversed_data = reverse(data, 0); + colvec reversed_theta = reverse(theta); + if (data.n_rows < max(order) + 1) { + return eye(theta.n_elem, theta.n_elem); + } + colvec variance_term = zeros(data.n_rows); + for (unsigned int i = max(order); i < data.n_rows; i++) { + variance_term(i) = data(i, 0) - dot( + reversed_theta.rows(order(1) + 1, sum(order)), + data.rows(i - order(0), i - 1) + ) - dot( + reversed_theta.rows(1, order(1)), + variance_term.rows(i - order(1), i - 1) + ); + } + colvec reversed_variance_term = reverse(variance_term); + mat phi_coefficient = zeros(data.n_rows, order(0)), + psi_coefficient = zeros(data.n_rows, order(1)); + for (unsigned int i = max(order); i < data.n_rows; i++) { + phi_coefficient.row(i) = -reversed_data.rows( + data.n_rows - i, data.n_rows - i + order(0) - 1 + ).t() - reversed_theta.rows(1, order(1)).t() * + phi_coefficient.rows(i - order(1), i - 1); + } + for (unsigned int i = order(1); i < data.n_rows; i++) { + psi_coefficient.row(i) = -reversed_variance_term.rows( + data.n_rows - i, data.n_rows - i + order(1) - 1 + ).t() - reversed_theta.rows(1, order(1)).t() * + psi_coefficient.rows(i - order(1), i - 1); + } + mat reversed_coef_phi = reverse(phi_coefficient, 0), + reversed_coef_psi = reverse(psi_coefficient, 0); + cube phi_psi_coefficient = zeros(order(1), order(0), data.n_rows), + psi_psi_coefficient = zeros(order(1), order(1), data.n_rows); + for (unsigned int i = order(1); i < data.n_rows; i++) { + mat phi_psi_coefficient_part = zeros(order(1), order(0)), + psi_psi_coefficient_part = zeros(order(1), order(1)); + for (unsigned int j = 1; j <= order(1); j++) { + phi_psi_coefficient_part += + phi_psi_coefficient.slice(i - j) * theta(order(0) - 1 + j); + } + phi_psi_coefficient.slice(i) = -reversed_coef_phi.rows( + data.n_rows - i, data.n_rows - i + order(1) - 1 + ) - phi_psi_coefficient_part; + for (unsigned int j = 1; j <= order(1); j++) { + psi_psi_coefficient_part += + psi_psi_coefficient.slice(i - j) * theta(order(0) - 1 + j); + } + psi_psi_coefficient.slice(i) = -reversed_coef_psi.rows( + data.n_rows - i, data.n_rows - i + order(1) - 1 + ) - reversed_coef_psi.rows( + data.n_rows - i, data.n_rows - i + order(1) - 1 + ).t() - psi_psi_coefficient_part; + } + hessian = zeros(sum(order) + 1, sum(order) + 1); + hessian.submat(0, 0, order(0) - 1, order(0) - 1) = + phi_coefficient.row(data.n_rows - 1).t() * + phi_coefficient.row(data.n_rows - 1) / theta(sum(order)); + hessian.submat(0, order(0), order(0) - 1, sum(order) - 1) = ( + phi_psi_coefficient.slice(data.n_rows - 1).t() * + variance_term(data.n_rows - 1) + + phi_coefficient.row(data.n_rows - 1).t() * + psi_coefficient.row(data.n_rows - 1) + ) / theta(sum(order)); + hessian.submat(order(0), 0, sum(order) - 1, order(0) - 1) = + hessian.submat(0, order(0), order(0) - 1, sum(order) - 1).t(); + hessian.submat(0, sum(order), order(0) - 1, sum(order)) = + -phi_coefficient.row(data.n_rows - 1).t() * + variance_term(data.n_rows - 1) / theta(sum(order)) / theta(sum(order)); + hessian.submat(sum(order), 0, sum(order), order(0) - 1) = + hessian.submat(0, sum(order), order(0) - 1, sum(order)).t(); + hessian.submat(order(0), order(0), sum(order) - 1, sum(order) - 1) = ( + psi_coefficient.row(data.n_rows - 1).t() * + psi_coefficient.row(data.n_rows - 1) + + psi_psi_coefficient.slice(data.n_rows - 1) * + variance_term(data.n_rows - 1) + ) / theta(sum(order)); + hessian.submat(order(0), sum(order), sum(order) - 1, sum(order)) = + -psi_coefficient.row(data.n_rows - 1).t() * + variance_term(data.n_rows - 1) / theta(sum(order)) / theta(sum(order)); + hessian.submat(sum(order), order(0), sum(order), sum(order) - 1) = + hessian.submat(order(0), sum(order), sum(order) - 1, sum(order)).t(); + hessian(sum(order), sum(order)) = + std::pow(variance_term(data.n_rows - 1), 2) / + std::pow(theta(sum(order)), 3) - + 1.0 / 2.0 / std::pow(theta(sum(order)), 2); + } + return hessian; +} + +void Fastcpd::create_theta_sum( + const unsigned int col, colvec new_theta_sum +) { + theta_sum.col(col) = new_theta_sum; +} + +mat Fastcpd::get_theta_sum() { + return theta_sum; +} + +CostResult Fastcpd::negative_log_likelihood_wo_theta( + mat data, + double lambda, + bool cv, + Nullable start +) { + CostResult cost_result; + if (family == "lasso" && cv) { + cost_result = negative_log_likelihood_lasso_cv(data); + } else if (family == "lasso" && !cv) { + cost_result = negative_log_likelihood_lasso_wo_cv(data, lambda); + } else if ( + family == "binomial" || family == "poisson" || family == "gaussian" + ) { + cost_result = negative_log_likelihood_glm(data, start, family); + } else if (family == "arma") { + cost_result = negative_log_likelihood_arma(data, order); + } else if (family == "mean") { + cost_result = negative_log_likelihood_mean(data, variance_estimate); + } else if (family == "variance") { + cost_result = negative_log_likelihood_variance(data, variance_data_mean); + } else if (family == "meanvariance" || family == "mv") { + cost_result = negative_log_likelihood_meanvariance(data, epsilon); + } else if (family == "mgaussian") { + cost_result = negative_log_likelihood_mgaussian( + data, p_response, variance_estimate + ); + } else { + // # nocov start + stop("This branch should not be reached at fastcpd_class_cost.cc: 193."); + // # nocov end + } + return cost_result; +} + +double Fastcpd::negative_log_likelihood_wo_cv( + mat data, + colvec theta, + double lambda +) { + vec y = data.col(0); + if (family == "lasso" || family == "gaussian") { + // Calculate negative log likelihood in gaussian family + double penalty = lambda * accu(abs(theta)); + mat x = data.cols(1, data.n_cols - 1); + return accu(square(y - x * theta)) / 2 + penalty; + } else if (family == "binomial") { + // Calculate negative log likelihood in binomial family + mat x = data.cols(1, data.n_cols - 1); + colvec u = x * theta; + return accu(-y % u + arma::log(1 + exp(u))); + } else if (family == "poisson") { + mat x = data.cols(1, data.n_cols - 1); + colvec u = x * theta; + colvec y_factorial(y.n_elem); + for (unsigned int i = 0; i < y.n_elem; i++) { + double log_factorial = 0; + for (int j = 1; j <= y(i); ++j) { + log_factorial += std::log(j); + } + y_factorial(i) = log_factorial; + } + + return accu(-y % u + exp(u) + y_factorial); + } else if (family == "arma") { + colvec reversed_theta = reverse(theta); + if (data.n_rows < max(order) + 1) { + return 0; + } + colvec variance_term = zeros(data.n_rows); + for (unsigned int i = max(order); i < data.n_rows; i++) { + variance_term(i) = data(i, 0) - dot( + reversed_theta.rows(order(1) + 1, sum(order)), + data.rows(i - order(0), i - 1) + ) - dot( + reversed_theta.rows(1, order(1)), + variance_term.rows(i - order(1), i - 1) + ); + } + return (std::log(2.0 * M_PI) + + std::log(theta(sum(order)))) * (data.n_rows - 2) / 2.0 + + dot(variance_term, variance_term) / 2.0 / theta(sum(order)); + } else { + // # nocov start + stop("This branch should not be reached at functions.cc: 248."); + // # nocov end + } +} + +List Fastcpd::run() { + // Set up the initial values. + double lambda = 0; + + ucolvec r_t_set = {0}; + DEBUG_RCOUT(r_t_set); + + std::vector cp_sets = {zeros(0)}; + linspace(1, n, n).for_each([&](int i) { + cp_sets.push_back(zeros(1)); + }); + + colvec fvec = zeros(n + 1); + fvec.fill(arma::datum::inf); + fvec(0) = -beta; + DEBUG_RCOUT(fvec(0)); + + RProgress::RProgress rProgress("[:bar] :current/:total in :elapsed", n); + + if (r_progress) { + rProgress.tick(0); + } + + if (contain(FASTCPD_FAMILIES, family) || vanilla_percentage < 1) { + create_segment_indices(); + create_segment_statistics(); + } + + DEBUG_RCOUT(n); + + if (vanilla_percentage < 1) { + create_gradients(); + } + + checkUserInterrupt(); + if (r_progress) { + rProgress.tick(); + } + + for (int t = 1; t <= n; t++) { + DEBUG_RCOUT(t); + unsigned int r_t_count = r_t_set.n_elem; + DEBUG_RCOUT(r_t_count); + + // Number of cost values is the same as the number of elements in R_t. + colvec cval = zeros(r_t_count); + + // For tau in R_t \ {t-1}. + for (unsigned int i = 1; i < r_t_count; i++) { + cval(i - 1) = get_cval_for_r_t_set(r_t_set, i, t, lambda); + } + + DEBUG_RCOUT(cval); + cval(r_t_count - 1) = 0; + + if (vanilla_percentage != 1) { + update_fastcpd_parameters(t); + } + + // `beta` adjustment seems to work but there might be better choices. + colvec obj = cval + fvec.rows(r_t_set) + beta; + double min_obj = min(obj); + double tau_star = r_t_set(index_min(obj)); + + // Step 4 + cp_sets[t] = join_cols(cp_sets[tau_star], colvec{tau_star}); + DEBUG_RCOUT(cp_sets[t]); + + // Pruning step. + ucolvec pruned_left = + find(cval + fvec.rows(r_t_set) + convexity_coef <= min_obj); + DEBUG_RCOUT(pruned_left); + ucolvec pruned_r_t_set = zeros(pruned_left.n_elem + 1); + DEBUG_RCOUT(pruned_r_t_set); + if (pruned_left.n_elem) { + pruned_r_t_set.rows(0, pruned_left.n_elem - 1) = r_t_set(pruned_left); + } + DEBUG_RCOUT(pruned_r_t_set); + pruned_r_t_set(pruned_left.n_elem) = t; + r_t_set = std::move(pruned_r_t_set); + DEBUG_RCOUT(r_t_set); + + if (vanilla_percentage != 1) { + update_theta_hat(pruned_left); + update_theta_sum(pruned_left); + update_hessian(pruned_left); + } + + // Objective function F(t). + fvec(t) = min_obj; + DEBUG_RCOUT(fvec.rows(0, t)); + + checkUserInterrupt(); + if (r_progress) { + rProgress.tick(); + } + } + + return get_cp_set(cp_sets[n], lambda); +} + +void Fastcpd::update_theta_sum( + const unsigned int col, colvec new_theta_sum +) { + theta_sum.col(col) += new_theta_sum; +} + +} // namespace fastcpd::classes diff --git a/src/fastcpd_classes.h b/src/fastcpd_classes.h index a43ca96f..8916505e 100644 --- a/src/fastcpd_classes.h +++ b/src/fastcpd_classes.h @@ -8,19 +8,9 @@ namespace fastcpd::classes { struct ColMat { mat data; - operator colvec() const { - // TODO(doccstat): Add a warning if the matrix has more than one column. - return data.as_col(); - } - - operator mat() const { - return data; - } - - operator rowvec() const { - // TODO(doccstat): Add a warning if the matrix has more than one column. - return data.as_col().t(); - } + operator colvec() const; + operator mat() const; + operator rowvec() const; }; struct CostResult { @@ -29,6 +19,42 @@ struct CostResult { double value; }; +class CostFunction { + public: + CostFunction(Function cost); + + CostResult operator()( + mat data, + Nullable theta, + double lambda, // UNUSED + bool cv, // UNUSED + Nullable start // UNUSED + ); + + private: + Function cost; +}; + +class CostGradient { + public: + CostGradient(Function cost_gradient); + + colvec operator()(mat data, colvec theta); + + private: + Function cost_gradient; +}; + +class CostHessian { + public: + CostHessian(Function cost_hessian); + + mat operator()(mat data, colvec theta); + + private: + Function cost_hessian; +}; + class Fastcpd { public: Fastcpd( @@ -102,7 +128,7 @@ class Fastcpd { private: // Adjust cost value for MBIC and MDL. - double adjust_cost_value(double value, const unsigned int nrows); + double update_cost_value(double value, const unsigned int nrows); void create_cost_function_wrapper(Nullable cost); void create_cost_gradient_wrapper(Nullable cost_gradient); @@ -164,7 +190,7 @@ class Fastcpd { // // @return Negative log likelihood of the corresponding data with the given // family. - CostResult negative_log_likelihood( + CostResult get_cost_result( mat data, Nullable theta, double lambda, @@ -172,9 +198,9 @@ class Fastcpd { Nullable start = R_NilValue ); - List process_cp_set(const colvec raw_cp_set, const double lambda); + List get_cp_set(const colvec raw_cp_set, const double lambda); - colvec trim_cp_set(const colvec raw_cp_set); + colvec update_cp_set(const colvec raw_cp_set); void update_cost_parameters( const unsigned int t, @@ -368,42 +394,6 @@ class Fastcpd { const bool warm_start; }; -class CostFunction { - public: - CostFunction(Function cost); - - CostResult operator()( - mat data, - Nullable theta, - double lambda, // UNUSED - bool cv, // UNUSED - Nullable start // UNUSED - ); - - private: - Function cost; -}; - -class CostGradient { - public: - CostGradient(Function cost_gradient); - - colvec operator()(mat data, colvec theta); - - private: - Function cost_gradient; -}; - -class CostHessian { - public: - CostHessian(Function cost_hessian); - - mat operator()(mat data, colvec theta); - - private: - Function cost_hessian; -}; - } // namespace fastcpd::classes #endif // FASTCPD_CLASSES_H_ diff --git a/src/fastcpd_cost_wrappers.cc b/src/fastcpd_cost_wrappers.cc deleted file mode 100644 index 3144db9d..00000000 --- a/src/fastcpd_cost_wrappers.cc +++ /dev/null @@ -1,34 +0,0 @@ -#include "fastcpd_classes.h" - -namespace fastcpd::classes { - -CostFunction::CostFunction(Function cost) : cost(cost) {} - -CostResult CostFunction::operator()( // # nocov - mat data, - Nullable theta, - double lambda, // UNUSED - bool cv, // UNUSED - Nullable start // UNUSED -) { - DEBUG_RCOUT(data.n_rows); - SEXP value = - theta.isNull() ? cost(data) : cost(data, as(theta)); // # nocov - return {{colvec()}, {colvec()}, as(value)}; // # nocov -} - -CostGradient::CostGradient(Function cost_gradient) : - cost_gradient(cost_gradient) {} - -colvec CostGradient::operator()(mat data, colvec theta) { - return as(cost_gradient(data, theta)); -} - -CostHessian::CostHessian(Function cost_hessian) : - cost_hessian(cost_hessian) {} - -mat CostHessian::operator()(mat data, colvec theta) { - return as(cost_hessian(data, theta)); -} - -} // namespace fastcpd::classes