-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathapop_tests.m4.c
502 lines (444 loc) · 24.9 KB
/
apop_tests.m4.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
/** \file apop_tests.c */
/* Copyright (c) 2007 by Ben Klemens. Licensed under the GPLv2; see COPYING.
*/
#include "apop_internal.h"
static apop_data * produce_t_test_output(int df, double stat, double diff){
double pval, qval, two_tail;
if (!gsl_isnan(stat)){
pval = gsl_cdf_tdist_P(stat, df);
qval = 1-pval;
two_tail= 2*GSL_MIN(fabs(pval-.5),fabs(qval-0.5));
} else {
pval = GSL_NAN;
qval = GSL_NAN;
two_tail= GSL_NAN;
}
apop_data *out = apop_data_alloc();
apop_data_add_named_elmt(out, "mean left - right", diff);
apop_data_add_named_elmt(out, "t statistic", stat);
apop_data_add_named_elmt(out, "df", df);
apop_data_add_named_elmt(out, "p value, 1 tail", GSL_MIN(pval,qval));
apop_data_add_named_elmt(out, "confidence, 1 tail", 1 - GSL_MIN(pval,qval));
apop_data_add_named_elmt(out, "p value, 2 tail", 1- two_tail);
apop_data_add_named_elmt(out, "confidence, 2 tail", two_tail);
return out;
}
/** Answers the question: with what confidence can I say that the means of these two columns of data are different?
If \c apop_opts.verbose is >=1, then display some information to stdout, like the mean/var/count for both vectors and the t statistic.
\param a one column of data
\param b another column of data
\return an \ref apop_data set with the following elements: <br>
<tt>mean left - right</tt>: the difference in means; if positive, first vector has larger mean, and one-tailed test is testing \f$L > R\f$, else reverse if negative.<br>
<tt>t statistic</tt>: used for the test<br>
<tt>df</tt>: degrees of freedom<br>
<tt>p value, 1 tail</tt>: the p-value for a one-tailed test that one vector mean is greater than the other.<br>
<tt>confidence, 1 tail</tt>: 1- p value.<br>
<tt>p value, 2 tail</tt>: the p-value for the two-tailed test that left mean = right mean.<br>
<tt>confidence, 2 tail</tt>: 1-p value
Example usage:
\code
gsl_vector *L = apop_query_to_vector("select * from data where sex='M'");
gsl_vector *R = apop_query_to_vector("select * from data where sex='F'");
apop_data *test_out = apop_t_test(L, R);
printf("Reject the null hypothesis of no difference between M and F with %g%% confidence\n", apop_data_get(test_out, .rowname="confidence, 2 tail"));
\endcode
\see \ref apop_paired_t_test, which answers the question: with what confidence can I
say that the mean difference between the two columns is zero?
*/
apop_data * apop_t_test(gsl_vector *a, gsl_vector *b){
int a_count = a->size,
b_count = b->size;
double a_avg = apop_vector_mean(a);
double a_var = (a_count > 1) ? apop_vector_var(a) : 0,
b_avg = apop_vector_mean(b),
b_var = b_count > 1 ? apop_vector_var(b): 0,
stat = (a_avg - b_avg)/ sqrt(
(b_count > 1 ? b_var/(b_count-1) : 0)
+ (a_count > 1 ? a_var/(a_count-1) : 0)
);
if (apop_opts.verbose >=1){
printf("1st avg: %g; 1st std dev: %g; 1st count: %i.\n", a_avg, sqrt(a_var), a_count);
printf("2st avg: %g; 2st std dev: %g; 2nd count: %i.\n", b_avg, sqrt(b_var), b_count);
printf("t-statistic: %g.\n", stat);
}
int df = a_count+b_count-2;
return produce_t_test_output(df, stat, a_avg - b_avg);
}
/** Answers the question: with what confidence can I say that the mean difference between the two columns is zero?
If <tt>apop_opts.verbose >=2</tt>, then display some information, like the mean/var/count for both vectors and the t statistic, to stderr.
\param a A column of data
\param b A matched column of data
\return an \ref apop_data set with the following elements:
<tt>mean left - right</tt>: the difference in means; if positive, first vector has larger mean, and one-tailed test is testing \f$L > R\f$, else reverse if negative.<br>
<tt>t statistic</tt>: used for the test<br>
<tt>df</tt>: degrees of freedom<br>
<tt>p value, 1 tail</tt>: the p-value for a one-tailed test that one vector mean is greater than the other.<br>
<tt>confidence, 1 tail</tt>: 1- p value.<br>
<tt>p value, 2 tail</tt>: the p-value for the two-tailed test that left mean = right mean.<br>
<tt>confidence, 2 tail</tt>: 1-p value
\see \ref apop_t_test for an example, and for when the element-by-element difference between the vectors has no sensible interpretation.
*/
apop_data * apop_paired_t_test(gsl_vector *a, gsl_vector *b){
gsl_vector *diff = gsl_vector_alloc(a->size);
gsl_vector_memcpy(diff, a);
gsl_vector_sub(diff, b);
int count = a->size;
double avg = apop_vector_mean(diff),
var = apop_vector_var(diff),
stat = avg/ sqrt(var/(count-1));
gsl_vector_free(diff);
Apop_notify(2, "avg diff: %g; diff std dev: %g; count: %i; t-statistic: %g.\n", avg, sqrt(var), count, stat);
return produce_t_test_output(count-1, stat, avg);
}
/** Runs an F-test specified by \c q and \c c. See
the chapter on hypothesis testing in <a href="http://modelingwithdata.org">Modeling With Data</a>, p 309, which will tell you that:
\f[{N-K\over q}
{({\bf Q}'\hat\beta - {\bf c})' [{\bf Q}' ({\bf X}'{\bf X})^{-1} {\bf Q}]^{-1} ({\bf Q}' \hat\beta - {\bf c})
\over {\bf u}' {\bf u} } \sim F_{q,N-K},\f]
and that's what this function is based on.
\param est An \ref apop_model that you have already calculated. (No default)
\param contrast An \ref apop_data set whose matrix represents \f${\bf Q}\f$ and whose
vector represents \f${\bf c}\f$. Each row represents a hypothesis. (Defaults:
if matrix is \c NULL, it is set to the identity matrix with the top row missing. If
the vector is \c NULL, it is set to a zero matrix of length equal to the height of
the contrast matrix. Thus, if the entire \c apop_data set is NULL or omitted, we are
testing the hypothesis that all but \f$\beta_1\f$ are zero.)
\return An \c apop_data set with a few variants on the confidence with which we can reject the joint hypothesis.
\todo There should be a way to get OLS and GLS to store \f$(X'X)^{-1}\f$. In fact, if you did GLS, this is invalid, because you need \f$(X'\Sigma X)^{-1}\f$, and I didn't ask for \f$\Sigma\f$.
\exception out->error='a' Allocation error.
\exception out->error='d' dimension-matching error.
\exception out->error='i' matrix inversion error.
\exception out->error='m' GSL math error.
\li There are two approaches to an \f$F\f$-test: the ANOVA approach, which is typically
built around the claim that all effects but the mean are zero; and the more general
regression form, which allows for any set of linear claims about the data. If you send
a \c NULL contrast set, I will generate the set of linear contrasts that are equivalent
to the ANOVA-type approach. This is why the top row of the default \f${\bf Q}\f$
matrix is missing: there is no hypothesis test about the coefficient for the
constant term. See the example below.
\li This function uses the \ref designated syntax for inputs.
\include f_test.c
*/
APOP_VAR_HEAD apop_data * apop_f_test (apop_model *est, apop_data *contrast){
apop_model *apop_varad_var(est, NULL)
Nullcheck_m(est, NULL);
Nullcheck_d(est->data, NULL);
apop_data * apop_varad_var(contrast, NULL);
int free_data=0, free_matrix=0, free_vector=0;
if (!contrast) contrast = apop_data_alloc(), free_data=1;
if (!contrast->matrix) {
int size = est->parameters->vector->size;
contrast->matrix= gsl_matrix_calloc(size - 1, size);
for (int i=1; i< size; i++)
apop_data_set(contrast, i-1, i, 1);
}
if (!contrast->vector) contrast->vector = gsl_vector_calloc(contrast->matrix->size1), free_vector=1;
apop_data *out = apop_f_test_base(est, contrast);
if (free_data) {apop_data_free(contrast); return out;}
if (free_matrix) gsl_matrix_free(contrast->matrix);
if (free_vector) gsl_vector_free(contrast->vector);
return out;
APOP_VAR_ENDHEAD
apop_data *out = apop_data_alloc();
Asprintf(&out->names->title, "F test");
size_t contrast_ct = contrast->vector->size;
Apop_stopif(contrast->matrix->size1 != contrast_ct, out->error='d'; return out,
0, "I counted %zu contrasts by the size of either contrast->vector or "
"est->parameters->vector->size, but you gave me a matrix with %zu rows. Those should match."
, contrast_ct, contrast->matrix->size1);
double f_stat, pval;
Get_vmsizes(est->data); //msize1, msize2
int data_df = msize1 - contrast_ct;
//Find (\underbar x)'(\underbar x), where (\underbar x) = the data with means removed
long double means[msize2];
for (int i=1; i< msize2; i++)
means[i] = apop_vector_mean(Apop_cv(est->data, i));
means[0]=0;// don't screw with the ones column.
apop_data *xpx = apop_data_alloc(msize2, msize2);
Apop_stopif(xpx->error, apop_data_free(xpx); out->error='a'; return out, 0, "allocation error");
for (int i=0; i< msize2; i++)
for (int j=0; j< msize2; j++){ //at this loop, we calculate one cell in the dot prouct
long double total = 0;
for (int c=0; c<msize1; c++)
total += (gsl_matrix_get(est->data->matrix, c, i) - means[i])
*(gsl_matrix_get(est->data->matrix, c, j) - means[j]);
apop_data_set(xpx, i, j, total);
}
apop_data xpxinv = (apop_data){.matrix=apop_matrix_inverse(xpx->matrix)};
Apop_stopif(!xpxinv.matrix, out->error='i'; return out, 0, "inversion of X'X error");
apop_data *qprimexpxinv = apop_dot(contrast, &xpxinv, 'm', 'm');
apop_data *qprimexpxinvq = apop_dot(qprimexpxinv, contrast, 'm', 't');
Apop_stopif(qprimexpxinvq->error || qprimexpxinv->error, out->error='m'; return out, 0, "broken dot");
apop_data qprimexpxinvqinv = (apop_data){.matrix=apop_matrix_inverse(qprimexpxinvq->matrix)};
Apop_stopif(!qprimexpxinvqinv.matrix, out->error='i'; return out, 0, "inversion of Q'(X'X)^{-1}Q error");
apop_data_free(qprimexpxinvq);
apop_data_free(qprimexpxinv);
apop_data *qprimebeta = apop_dot(contrast, est->parameters, 'm', 'v');
Apop_stopif(qprimebeta->error, out->error='m'; return out, 0, "broken dot");
gsl_vector_sub(qprimebeta->vector, contrast->vector);
apop_data *qprimebetaminusc_qprimexpxinvqinv = apop_dot(&qprimexpxinvqinv, qprimebeta, .form2='v');
Apop_stopif(qprimebetaminusc_qprimexpxinvqinv->error, out->error='m'; return out, 0, "broken dot");
gsl_blas_ddot(qprimebeta->vector, qprimebetaminusc_qprimexpxinvqinv->vector, &f_stat);
apop_data_free(xpx);
apop_data_free(qprimebeta);
apop_data_free(qprimebetaminusc_qprimexpxinvqinv);
apop_data *r_sq_list = apop_estimate_coefficient_of_determination (est);
double variance = apop_data_get(r_sq_list, .rowname="sse");
f_stat *= data_df / (variance * contrast_ct);
pval = (contrast_ct > 0 && data_df > 0) ? gsl_cdf_fdist_Q(f_stat, contrast_ct, data_df): GSL_NAN;
apop_data_add_named_elmt(out, "F statistic", f_stat);
apop_data_add_named_elmt(out, "p value", pval);
apop_data_add_named_elmt(out, "confidence", 1- pval);
apop_data_add_named_elmt(out, "df1", contrast_ct);
apop_data_add_named_elmt(out, "df2", data_df);
return out;
}
static double one_chi_sq(apop_data *d, int row, int col, int n){
double rowexp = apop_vector_sum(Apop_rv(d, row))/n;
double colexp = apop_vector_sum(Apop_cv(d, col))/n;
double observed = apop_data_get(d, row, col);
double expected = n * rowexp * colexp;
return gsl_pow_2(observed - expected)/expected;
}
/** Run a Chi-squared test on an ANOVA table, i.e., an NxN table with the null hypothesis that all cells are equally likely.
\param d The input data, which is a crosstab of various elements. They don't have to sum to one.
\return A \ref apop_data set including elements named
<tt>"chi squared statistic"</tt>, <tt>"df"</tt>, and <tt>"p value"</tt>. Retrieve via,
e.g., <tt>apop_data_get(out, .rowname="p value")</tt>.
\see apop_test_fisher_exact
*/
apop_data * apop_test_anova_independence(apop_data *d){
Apop_stopif(!d || !d->matrix, return NULL, 0, "You sent me data with no matrix element. Returning NULL.");
double total = 0;
//You can have a one-column or one-row matrix if you want; else df = (rows-1)*(cols-1)
double df = d->matrix->size1==1 ? d->matrix->size2-1 : d->matrix->size2 == 1 ? d->matrix->size1
: (d->matrix->size1 - 1)* (d->matrix->size2 - 1);
Apop_stopif(!df, return NULL, 0, "You sent a degenerate matrix. Returning NULL.");
int n = apop_matrix_sum(d->matrix);
for (size_t row=0; row <d->matrix->size1; row++)
for (size_t col=0; col <d->matrix->size2; col++)
total += one_chi_sq(d, row, col, n);
apop_data *out = apop_data_alloc();
double chisq = gsl_cdf_chisq_Q(total, df);
apop_data_add_named_elmt(out, "chi squared statistic", total);
apop_data_add_named_elmt(out, "df", df);
apop_data_add_named_elmt(out, "p value", chisq);
return out;
}
static apop_data* apop_anova_one_way(char *table, char *data, char *grouping){
//ANOVA has always just been a process of filling in a form, and
//that's what this function does.
apop_data *out = apop_data_calloc(3, 6);
apop_name_add(out->names, "sum of squares", 'c');
apop_name_add(out->names, "df", 'c');
apop_name_add(out->names, "mean squares", 'c');
apop_name_add(out->names, "F ratio", 'c');
apop_name_add(out->names, "p value", 'c');
apop_name_add(out->names, "confidence", 'c');
apop_name_add(out->names, grouping, 'r');
apop_name_add(out->names, "residual", 'r');
apop_name_add(out->names, "total", 'r');
//total sum of squares:
apop_data* tss = apop_query_to_data("select var_pop(%s), count(*) from %s", data, table);
Apop_stopif(!tss, apop_return_data_error('q'), 0, "Query 'select var_pop(%s), count(*) from %s' returned NULL. Does that look right to you?", data, table);
apop_data_set(out, 2, 0, apop_data_get(tss, 0, 0)*apop_data_get(tss, 0, 1)); //total sum of squares
double total_df = apop_data_get(tss, 0, 1);
apop_data_set(out, 2, 1, apop_data_get(tss, 0, 1)); //total df.
//within group sum of squares:
apop_data* wss = apop_query_to_data("select var_pop(%s), count(*) from %s group by %s", data, table, grouping);
double group_df = wss->matrix->size1-1;
apop_data_set(out, 0, 0, apop_data_get(wss, 0, 0)*group_df); //within sum of squares
apop_data_set(out, 0, 1, group_df);
//residuals are just total-wss
apop_data_set(out, 1, 0, apop_data_get(out, 2, 0) - apop_data_get(out, 0,0)); //residual sum of squares
double residual_df = total_df - group_df;
apop_data_set(out, 1, 1, residual_df); //residual df
apop_data_set(out, 0, 2, apop_data_get(out, 0, 0)/apop_data_get(out, 0, 1));//mean SS within
apop_data_set(out, 1, 2, apop_data_get(out, 1, 0)/apop_data_get(out, 1, 1));//mean SS residual
apop_data_set(out, 0, 3, apop_data_get(out, 0, 2)/apop_data_get(out, 1, 2));//F ratio
apop_data_set(out, 0, 4, gsl_cdf_fdist_P(apop_data_get(out, 0, 3), group_df, residual_df));//pval
apop_data_set(out, 0, 5, 1- apop_data_get(out, 0, 4));//confidence
apop_data_free(tss);
apop_data_free(wss);
return out;
}
/** This function produces a traditional one- or two-way ANOVA table. It
works from data in an SQL table, using queries of a form like <tt>select
data from table group by grouping1, grouping2</tt>.
\param table The table to be queried. Anything that can go in an SQL <tt>from</tt> clause is OK, so this can be a plain table name or a temp table specification like <tt>(select ... )</tt>, with parens.
\param data The name of the column holding the count or other such data
\param grouping1 The name of the first column by which to group data
\param grouping2 If this is \c NULL, then the function will return a one-way ANOVA. Otherwise, the name of the second column by which to group data in a two-way ANOVA.
*/
APOP_VAR_HEAD apop_data* apop_anova(char *table, char *data, char *grouping1, char *grouping2){
char *apop_varad_var(table, NULL)
Apop_stopif(!table, return NULL, 0, "I need the name of a table in the SQL database.");
if (!strchr(table, <|')'|>)) //if you found ()s, then it is a temp table spec.
Apop_stopif(!apop_table_exists(table), return NULL, 0, "I couldn't find the table %s in the database.", table);
char *apop_varad_var(data, NULL)
Apop_stopif(!data, return NULL, 0, "I need the name of the column in the %s table with the count or other data.", table);
char *apop_varad_var(grouping1, NULL)
Apop_stopif(!data, return NULL, 0, "I need at least grouping1, a column in the %s table.", table);
char *apop_varad_var(grouping2, NULL)
APOP_VAR_ENDHEAD
apop_data *first = apop_anova_one_way(table, data, grouping1);
Apop_stopif(first->error, return first, 0, "Error (%c) running one-way ANOVA.", first->error);
if (!grouping2) return first;
apop_data *second = apop_anova_one_way(table, data, grouping2);
char *joined = NULL;
Asprintf(&joined, "%s, %s", grouping1, grouping2);
apop_data *interaction = apop_anova_one_way(table, data, joined);
apop_data *out = apop_data_calloc(5, 6);
apop_name_stack(out->names, first->names, 'c');
apop_data_add_names(out, 'r', first->names->row[0], second->names->row[0],
"interaction", "residual", "total");
gsl_vector *firstrow = Apop_rv(first, 0);
gsl_vector *secondrow = Apop_rv(second, 0);
gsl_vector *interrow = Apop_rv(interaction, 0);
gsl_matrix_set_row(out->matrix, 0, firstrow);
gsl_matrix_set_row(out->matrix, 1, secondrow);
gsl_matrix_set_row(out->matrix, 2, interrow);
gsl_matrix_set_row(out->matrix, 4, Apop_rv(first, 2));
//residuals are just total-wss
apop_data_set(out, 3, 0, apop_data_get(out, 4, 0)
- gsl_vector_get(firstrow, 0) - gsl_vector_get(secondrow, 0) - gsl_vector_get(interrow, 0)); //residual sum of squares
double residual_df = apop_data_get(out, 4, 1)
- gsl_vector_get(firstrow, 1) - gsl_vector_get(secondrow, 1) - gsl_vector_get(interrow, 1); //residual df
apop_data_set(out, 3, 1, residual_df);
apop_data_set(out, 3, 2, apop_data_get(out, 3, 0)/apop_data_get(out, 3, 1));//mean SS residual
apop_data_set(out, 0, 3, apop_data_get(out, 0, 2)/apop_data_get(out, 3, 2));//F ratio
apop_data_set(out, 0, 4, gsl_cdf_fdist_P(apop_data_get(out, 0, 3), gsl_vector_get(firstrow, 1), residual_df));//pval
apop_data_set(out, 0, 5, 1- apop_data_get(out, 0, 4));//confidence
apop_data_set(out, 1, 3, apop_data_get(out, 1, 2)/apop_data_get(out, 3, 2));//F ratio
apop_data_set(out, 1, 4, gsl_cdf_fdist_P(apop_data_get(out, 1, 3), gsl_vector_get(secondrow, 1), residual_df));//pval
apop_data_set(out, 1, 5, 1- apop_data_get(out, 1, 4));//confidence
apop_data_set(out, 2, 3, apop_data_get(out, 2, 2)/apop_data_get(out, 3, 2));//F ratio
apop_data_set(out, 2, 4, gsl_cdf_fdist_P(apop_data_get(out, 2, 3), gsl_vector_get(interrow, 1), residual_df));//pval
apop_data_set(out, 2, 5, 1- apop_data_get(out, 2, 4));//confidence
free(joined);
apop_data_free(first);
apop_data_free(second);
apop_data_free(interaction);
return out;
}
/** This is a convenience function to do the lookup of a given statistic along a given
distribution. You give me a statistic, its (hypothesized) distribution, and whether
to use the upper tail, lower tail, or both. I will return the odds of a Type I error
given the model---in statistician jargon, the \f$p\f$-value. [Type I error: odds of
rejecting the null hypothesis when it is true.]
For example,
\code
apop_test(1.3);
\endcode
will return the density of the standard Normal distribution that is more than 1.3 from zero.
If this function returns a small value, we can be confident that the statistic is significant. Or,
\code
apop_test(1.3, "t", 10, .tail='u');
\endcode
will give the appropriate odds for an upper-tailed test using the \f$t\f$-distribution with 10 degrees of freedom (e.g., a \f$t\f$-test of the null hypothesis that the statistic is less than or equal to zero).
Several more distributions are supported; see below.
\li For a two-tailed test (the default), this returns the density outside the range. I'll only do this for symmetric distributions.
\li For an upper-tail test ('u'), this returns the density above the cutoff
\li For a lower-tail test ('l'), this returns the density below the cutoff
\param statistic The scalar value to be tested.
\param distribution The name of the distribution; see below.
\param p1 The first parameter for the distribution; see below.
\param p2 The second parameter for the distribution; see below.
\param tail 'u' = upper tail; 'l' = lower tail; anything else = two-tailed. (default = two-tailed)
\return The odds of a Type I error given the model (the \f$p\f$-value).
Here are the distributions you can use and their parameters.
\c "normal" or \c "gaussian"
\li p1=\f$\mu\f$, p2=\f$\sigma\f$
\li default (0, 1)
\c "lognormal"
\li p1=\f$\mu\f$, p2=\f$\sigma\f$
\li default (0, 1)
\li Remember, \f$\mu\f$ and \f$\sigma\f$ refer to the Normal one would get after exponentiation
\li One-tailed tests only
\c "uniform"
\li p1=lower edge, p2=upper edge
\li default (0, 1)
\li two-tailed tests are run relative to the center, (p1+p2)/2.
\c "t"
\li p1=df
\li no default
\c "chi squared", \c "chi", \c "chisq":
\li p1=df
\li no default
\li One-tailed tests only; default='u' (\f$p\f$-value for typical cases)
\c "f"
\li p1=df1, p2=df2
\li no default
\li One-tailed tests only
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD double apop_test(double statistic, char *distribution, double p1, double p2, char tail){
double apop_varad_var(statistic, 0);
char* apop_varad_var(distribution, NULL);
double apop_varad_var(p1, 0);
double apop_varad_var(p2, 0);
int is_chi = !strcasecmp(distribution, "chi squared")|| !strcasecmp(distribution, "chi")
|| !strcasecmp(distribution, "chisq");
Apop_stopif(!strcasecmp(distribution, "f") && (!p1 || !p2), return NAN, 0, "I need both a p1 and p2 parameter specifying the degrees of freedom.");
Apop_stopif((!strcasecmp(distribution, "t") || !strcasecmp(distribution, "f") || is_chi)
&& !p1, return NAN, 0, "I need a p1 parameter specifying the degrees of freedom.");
if (!p2 && (!distribution || !strcasecmp(distribution, "normal") || !strcasecmp(distribution, "gaussian") ))
p2 = 1;
if (!p2 && p1 >= 0 && !strcasecmp(distribution, "uniform"))
p2 = 1;
char apop_varad_var(tail, 0);
if (!tail) tail = is_chi ? 'u' : 'a';
APOP_VAR_ENDHEAD
//This is a long and boring function. I am aware that there are
//clever way to make it shorter.
if (!distribution || !strcasecmp(distribution, "normal") || !strcasecmp(distribution, "gaussian") ){
if (tail == 'u')
return gsl_cdf_gaussian_Q(p1-statistic, p2);
else if (tail == 'l')
return gsl_cdf_gaussian_P(p1-statistic, p2);
else
return 2 * gsl_cdf_gaussian_Q(fabs(p1-statistic), p2);
}
else if (!strcasecmp(distribution, "lognormal")){
if (tail == 'u')
return gsl_cdf_lognormal_Q(statistic, p1, p2);
else if (tail == 'l')
return gsl_cdf_lognormal_P(statistic, p1, p2);
else
Apop_assert(0, "A two-tailed test doesn't really make sense for the lognormal. Please specify either tail= 'u' or tail= 'l'.");
}
else if (!strcasecmp(distribution, "t")){
if (tail == 'u')
return gsl_cdf_tdist_Q(statistic, p1);
else if (tail == 'l')
return gsl_cdf_tdist_P(statistic, p1);
else
return 2 * gsl_cdf_tdist_Q(fabs(statistic), p1);
}
else if (!strcasecmp(distribution, "f")){
if (tail == 'u')
return gsl_cdf_fdist_Q(statistic, p1, p2);
else if (tail == 'l')
return gsl_cdf_fdist_P(statistic, p1, p2);
else
Apop_assert(0, "A two-tailed test doesn't really make sense for the %s. Please specify either tail= 'u' or tail= 'l'.", distribution);
}
else if (!strcasecmp(distribution, "chi squared")|| !strcasecmp(distribution, "chi")
|| !strcasecmp(distribution, "chisq")){
if (tail == 'u')
return gsl_cdf_chisq_Q(statistic, p1);
else if (tail == 'l')
return gsl_cdf_chisq_P(statistic, p1);
else
Apop_assert(0, "A two-tailed test doesn't really make sense for the %s. Please specify either tail= 'u' or tail= 'l'.", distribution);
}
else if (!strcasecmp(distribution, "uniform")){
if (tail == 'u')
return gsl_cdf_flat_Q(statistic, p1, p2);
else if (tail == 'l')
return gsl_cdf_flat_P(statistic, p1, p2);
else
return 2 * gsl_cdf_flat_Q(fabs(statistic - (p1+p2)/2.), p1, p2);
}
Apop_assert(0, "Sorry, but I don't recognize %s as a distribution", distribution);
}