-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStats_Fcns_Arma.cpp
327 lines (256 loc) · 11.2 KB
/
Stats_Fcns_Arma.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
// case: when linking against Rmath as a stand-alone library
#ifdef MATHLIB_STANDALONE
#include <armadillo>
#include <R.h> // rPsort
#include <Rmath.h> // rnorm
/* Creates the same alias when using Rmath as a stand-alone library as when we
* are calling this code through R via Rcpp. This allows us to use the same
* function call for both scenarios.
*/
namespace R {
inline double rnorm(double mu, double sigma) { return ::rnorm(mu, sigma); }
}
/* case: if not linking against Rmath as a stand-alone library, then assumes
* that we are linking against R via Rcpp
*/
#else
#include <RcppArmadillo.h>
#endif
#define TOLERANCE 0.000000000001 // 1e-12
// Statistical functions -------------------------------------------------------
/* Generates values for beta where the number of nonzero values in beta is given
* by floor(prop_nonzero * p) and are evenly spaced along the vector. The
* nonzero entries of beta are sampled from a N(0, true_beta_sd) distribution.
*/
arma::vec sample_beta(int p, double prop_nonzero, double true_beta_sd) {
arma::vec beta(p); // Allocate memory to store beta vector
int n_nonzero; // Number of nonzero elements of beta
double stepsize; // Fractional amount of zero entries between nonzero
double next_idx_dbl; // Fractional index of next nonzero entry
int next_idx_int; // Integer index of next nonzero entry
/* Note that in the following casts from double to int we need not consider
* overflow since p is known to be a representable number, and both
* prop_nonzero * p and next_idx_dbl are no bigger than p.
*/
n_nonzero = (int) (prop_nonzero * p);
stepsize = p / (double) (n_nonzero + 1);
next_idx_dbl = stepsize;
next_idx_int = (int) next_idx_dbl;
/* Each iteration sets the value of beta(j). Add a machine error smudge
* factor mainly to prevent the last element from being erroneously
* considered a nonzero element due to the sum of stepsize being less than
* (p - 1) due to machine error.
*/
for (int j = 0; j < p; j++) {
/* case: a nonzero value. Sample beta(j). Next, calculate where the
* next value would be placed if we could evenly specify it on the real
* line, and then set the next index for a nonzero value to be the floor
* of the true evenly spaced value.
*/
if (j == next_idx_int) {
beta(j) = R::rnorm(0, true_beta_sd);
next_idx_dbl += stepsize;
next_idx_int = (int) (next_idx_dbl + TOLERANCE);
}
// case: a zero value for beta
else {
beta(j) = 0;
}
}
return beta;
}
/* Generates a random sample from a N(m, V) distribution using an eigen
* decomposition of V. Note that the function uses the fact the the eigenvalues
* are provided by eig_sym in increasing order.
*
* PRE: assumes that V is symmetric and positive-definite, and that m has the
* same number of elements as V has rows.
*/
arma::vec mvrnorm_eigen(arma::vec& m, arma::mat& V) {
int p = m.n_elem;
arma::vec eigval(p); // Store eigenvalues of V
arma::vec root_eigval(p); // Store square root of e-values of V
arma::mat eigvec(p, p); // Store eigenvectors of V
arma::vec mvnsamp(p); // Store a random sample from indep N(0, 1)'s
// Perform eigenvalue decomposition
if (! eig_sym(eigval, eigvec, V)) {
throw std::runtime_error("V is not positive definite\n");
}
/* Calculate elementwise square root of eigenvalues. Ensure beforehand that
* eigenvalues that are negative due to machine error are set to zero.
*/
arma::vec::iterator eval_curr = eigval.begin(), eval_end = eigval.end();
// Iterate through eigenvalues until we reach positive eigenvalues
for ( ; (eval_curr != eval_end) && (*eval_curr < 0); eval_curr++) {
*eval_curr = 0;
}
root_eigval = sqrt(eigval);
/* Eigen decomposition provides the relation V = PDP^{T} where P is an
* orthogonal matrix where the columns are eigenvectors and D is a diagonal
* matrix composed of the corresponding eigenvalues. It follows that for
* independent normal r.v. z:
*
* Var[P D^{1/2} z] = P D^{1/2} Var[z] D^{1/2} P^{T}
* = P D^{1/2} I D^{1/2} P^{T}
* = PDP^{-1}
* = V
*/
return m + (eigvec * (root_eigval % mvnsamp.randn()));
}
/* Generates a random sample from a N(m, V) distribution using an Cholesky
* decomposition of V.
*
* PRE: assumes that V is symmetric and positive-definite, and that m has the
* same number of elements as V has rows.
*/
arma::vec mvrnorm_chol(arma::vec& m, arma::mat& V) {
int p = m.n_elem;
arma::mat R(p, p); // store Cholesky decomposition of V
arma::vec mvnsamp(p); // store a random sample from indep N(0, 1)'s
/* Perform Cholesky decomposition. Note: an exception is thrown if
* decomposition fails.
*/
R = chol(V, "lower");
/* Cholesky decomposition with Armadillo option "lower" provides the
* relation V = RR^{T} so that for independent normal r.v. z:
*
* Var[Rz] = R Var[z] R^{T} = RIR^{T} = RR^{T} = V
*/
return m + (R * mvnsamp.randn());
}
/* Calculates the empirical quantiles of the data pointed to by values and with
* quantile levels corresponding to the indices provided by quant_below_idx(k)
* and quant_below_idx(k) + 1 and interpolated by gamma(k) - see below for
* definitions of quant_below_idx (corresponds to j) and gamma. Return object
* is the vector of quantiles.
*
* PRE: assumes that data_len >= 2, that quant_below_idx is nondecreasing, and
* that gamma and quant_below_idx have the same length.
*
* - - - - - - - - - - - - - - - - - - - - - - - - - -
*
* The quantile calculation part of this function is an implementation of the R
* function stats::quantile under the default choice of empirical quantile. See
* the help page for stats::quantile for more details (the default there is type
* 7). The formula is (for n the number of observations in the data):
*
* F^{-1}(prob) = ((1 - gamma) * x_j) + (gamma * x_{j+1})
*
* where
*
* j = floor(prob * (n - 1))
* gamma = (prob * (n - 1)) - floor(prob * (n - 1))
*/
arma::vec quantile(double* values,
int data_len,
arma::Mat<int>& quant_below_idx,
arma::vec& gamma) {
int nprobs = quant_below_idx.n_elem; // num empirical quantiles
int curr_size = data_len; // num samples in data
int relative_below_idx[nprobs]; // index relative to previous index
double* curr_start; // start of the current subset of the data
double* end; // 1 past the end of the data
double* kplus1; // 1 past the start of the current subset of the data
double* curr; // pointer to the current value
double* curr_min; // pointer to the current minimum
int below_idx; // index leq to desired index location
int above_idx; // index one greater than below_idx(k)
double temp; // used for data swap
arma::vec out(nprobs); // storage for empirical quantiles
// Calculate initial starting location and absolute end of data
curr_start = values;
end = curr_start + data_len;
// Calculate indices relative to their previous index
relative_below_idx[0] = quant_below_idx(0);
for (int i = 1; i < nprobs; i++) {
relative_below_idx[i] = quant_below_idx(i) - quant_below_idx(i - 1);
}
/* Each iteration calculates the empirical quantile value of the k-th
* probability. The main task is to partially sort the data so that the
* values in the quant_below_idx(k)-th index and the index immediately
* following are the same is if the data had been fully sorted.
*
* This is done by partially sorting the data with indices higher than the
* partial sort in the previous iteration.
*/
for (int k = 0; k < nprobs; k++) {
/* Partial sorting on k-th quantile index; i.e. data is permuted so that
* every value with index less than k-th quantile index of the data is
* smaller than value at the k-th quantile index, and every value with
* index larger than k-th quantile index of the data is larger.
*/
rPsort(curr_start, curr_size, relative_below_idx[k]);
/* Find the index of the (k + 2)-th largest value. We start
* curr_min_ptr at the (k + 1)-th index b/c we know every value with
* smaller index is <= to the value there.
*/
kplus1 = curr_start + relative_below_idx[k] + 1;
curr_min = kplus1;
for (curr = curr_min; curr < end; curr++) {
if (*curr < *curr_min) {
curr_min = curr;
}
}
// Swap the value in the (k + 1)-th index with the (k + 2)-th largest value
temp = *kplus1;
*kplus1 = *curr_min;
*curr_min = temp;
/* Calculate empirical quantile value. Note that we transpose the
* indices so as to save the quantile data in variable by quantile
* level shape. It is (k + 1) rather than k b/c we fill the 0-th
* column with the true values of beta.
*/
below_idx = quant_below_idx(k);
above_idx = below_idx + 1;
out(k) = ((1 - gamma(k)) * values[below_idx]) + (gamma(k) * values[above_idx]);
// Reflect the subset of the data that needs partial sorting
curr_start += relative_below_idx[k];
curr_size = end - curr_start;
}
return out;
}
/* Calculate the empirical quantiles for each column of X and quantile levels
* given by prob, and return as elements of an arma::mat. The first column of
* the return object is the true values of the variables, and the (j, k + 1)
* element of the return object is the quantile value of the j-th variable at
* the k-th quantile level.
*
* PRE: assumes that true_beta is the same length as the number of columns of X,
* that X has >= 2 rows of data, and that prob is nondecreasing and has values
* in (0, 1).
*/
arma::mat quantile_table(arma::vec& true_beta, arma::mat& X, arma::vec& prob) {
int p = X.n_cols; // number of variables
int nprobs = prob.n_elem; // number of probabilities
int nsamp = X.n_rows; // number of samples
double* col_start; // first element in X.col(j)
// Container for quantile values
arma::mat out(nprobs + 1, p);
// Quantile indices, interpolation vals ----------------
double desired; // idx you might choose point if you had inf points
arma::Col<int> below_idx(nprobs); // closest index below desired point
arma::vec gamma(nprobs); // linear interpolation value
/* Calculate data indices at which the quantiles are calculated and the
* corresponding linear interpolation values
*/
for (int k = 0; k < nprobs; k++) {
// Minus 1 b/c of 0-based indexing
desired = prob(k) * (nsamp - 1);
below_idx(k) = (int) desired;
gamma(k) = desired - below_idx(k);
}
// Fill in values of table -----------------------------
// Fill in true values of beta in first row
out.row(0) = true_beta.t();
/* Each iteration fills in rows 1-3 of column j with the empirical quantile
* levels of the j-th column of X
*/
col_start = X.memptr();
for (int j = 0; j < p; j++) {
// Fill in empirical quantile levels
out.submat(1, j, 3, j) = quantile(col_start, nsamp, below_idx, gamma);
// Update col_start to point to first element of (j + 1)-th column
col_start += nsamp;
}
return out.t();
}