-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBayesTest.h
333 lines (305 loc) · 12.6 KB
/
BayesTest.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
#ifndef BAYESTEST_H
#define BAYESTEST_H
#include <vector>
#include <cstdlib>
#include "TreeAndSetsDependencies.h"
#include "ProbabilityMatrixSet.h"
#include "VerbosityLevels.h"
/// The minimum value for class 2 sites probability to be a positive selection
/// site.
///
const static double MIN_PROB = 0.50;
const static double ONE_STAR_PROB = 0.95;
const static double TWO_STARS_PROB = 0.99;
///@cond Private
/// Helper class to compute BEB_N1D^BEB_DIMS at compile time (that is Y^N)
///
template <unsigned int Y, unsigned int N> class Pow {
public:
static const int value = Y * Pow<Y, N - 1>::value;
};
/// Specialization of the above class to end the recursion
///
template <unsigned int Y> class Pow<Y, 1> {
public:
static const int value = Y;
};
///@endcond
/// Tests to find the sites under positive selection.
///
/// @author Mario Valle - Swiss National Supercomputing Centre (CSCS)
/// @date 2010-12-22 (initial version)
/// @version 1.1
///
class BayesTest {
public:
/// Constructor.
///
/// @param[in] aForest The forest
/// @param[in] aVerbose The verbosity level
/// @param[in] aNoReduction If true the dependencies computed are for no
/// reduced forests
///
explicit BayesTest(Forest &aForest, unsigned int aVerbose = 0,
bool aNoReduction = true)
: mForest(aForest), mNumSites(mForest.getNumSites()),
mSiteClassProb(BEB_DIMS * mNumSites), mVerbose(aVerbose),
mPriors(mNumSites * BEB_NUM_CAT), mDependencies(mForest, aVerbose),
mBEBset(mForest.getNumBranches()) {
// Create the dependency list for forest likelihood computation
mDependencies.computeDependencies(1, aNoReduction);
if (mVerbose >= VERBOSE_ONLY_RESULTS)
mDependencies.print("TEST FOR BEB (before optimization)");
mDependencies.optimizeDependencies();
if (mVerbose >= VERBOSE_ONLY_RESULTS)
mDependencies.print("TEST FOR BEB");
}
/// Destructor.
///
~BayesTest() {}
/// Bayes Empirical Bayes (BEB) test.
///
/// @param[in] aVars The variables optimized at the end of the H1 run.
/// @param[in] aFgBranch The foreground branch under test.
/// @param[in] aScales The two scales ([0] bg; [1] fg) to rescale the branch
/// lengths. They are computed in H1.
///
void computeBEB(const std::vector<double> &aVars, size_t aFgBranch,
const std::vector<double> &aScales);
/// Print the sites under positive selection.
///
/// @param[in] aFgBranch Identifier of the branch marked as foreground branch
///
void printPositiveSelSites(size_t aFgBranch) const;
/// Extract the sites under positive selection and the corresponding
/// probabilities.
///
/// @param[out] aPositiveSelSites Vector of sites under positive selection
/// @param[out] aPositiveSelSitesProb Corresponding probabilities
///
void
extractPositiveSelSites(std::vector<unsigned int> &aPositiveSelSites,
std::vector<double> &aPositiveSelSitesProb) const;
private:
/// This sets up the grid (mPara[][]) according to the priors.
/// It calculates the probability of data at each site given w: f(f_h|w).
/// This is calculated using the branch model (NSsites = 0 model = 2), with
/// BayesEB=2 used to force the use of the correct scale factors in
/// GetPMatBranch().
///
///@verbatim
/// Order of site classes for iw or f(x_h|w):
/// back fore num.sets
/// Branchsite A (121 sets)
/// site class 0: w0 w0 10
/// site class 1: w1=1 w1=1 1
/// site class 2a: w0 w2 100
/// site class 2b: w1=1 w2 10
///@endverbatim
///
/// @param[in] aVars The variables optimized at the end of the H1 run.
/// @param[in] aSiteMultiplicity The site multiplicity vector.
/// @param[in] aFgBranch The foreground branch under test.
/// @param[in] aScales The two scales ([0] bg; [1] fg) to rescale the branch
/// lengths. They are computed in H1.
///
/// @return The computed scale.
///
double getGridParams(const std::vector<double> &aVars,
const std::vector<double> &aSiteMultiplicity,
size_t aFgBranch, const std::vector<double> &aScales);
/// This gives the indices (ix, iy) and the coordinates (aProbX, aProbY,
/// 1-aProbX-aProbY) for
/// the aTriangleIdx-th triangle, with aTriangleIdx from 0, 1, ...,
/// BEB_N1D*BEB_N1D-1.
///
/// The ternary graph (0-1 on each axis) is partitioned into BEB_N1D*BEB_N1D
/// equal-sized triangles.
/// In the first row (ix=0), there is one triangle (iy=0);
/// In the second row (ix=1), there are 3 triangles (iy=0,1,2);
/// In the i-th row (ix=i), there are 2*i+1 triangles (iy=0,1,...,2*i).
///
/// aProbX rises when ix goes up, but aProbY decreases when iy increases.
/// (aProbX, aProbY) is the
/// centroid in the ij-th small triangle. aProbX and aProbY each takes on
/// 2*BEB_N1D-1 possible values.
///
/// @param[out] aProbX The p0 value on the X axis of the triangular grid.
/// @param[out] aProbY The p1 value on the Y axis of the triangular grid.
/// @param[in] aTriangleIdx The index inside the triangular grid.
///
void getIndexTernary(double *aProbX, double *aProbY,
unsigned int aTriangleIdx);
private:
/// Disabled assignment operator to avoid warnings on Windows.
///
/// @fn BayesTest& operator=(const BayesTest& aObj)
///
/// @param[in] aObj The object to be assigned
///
/// @return The object receiving the assignment
///
BayesTest &operator=(const BayesTest &);
private:
const static unsigned int BEB_N1D = 10; ///< Number of intervals for w0 and w2
const static unsigned int BEB_DIMS =
4; ///< Number of codon classes (0, 1, 2a, 2b)
const static unsigned int BEB_NUM_CAT =
BEB_N1D + 1 + BEB_N1D * BEB_N1D + BEB_N1D; ///< Total number of categories
/// for w0 and w2 (it is
/// com.ncatG in codeml.c)
const static unsigned int BEB_NGRID =
Pow<BEB_N1D, BEB_DIMS>::value; ///< Number of points in the grid used to
/// evaluate the integral. It is
/// BEB_N1D^BEB_DIMS
private:
Forest &mForest; ///< The forest.
size_t mNumSites; ///< Number of sites.
std::vector<double> mSiteClassProb; ///< Probability of a site to pertain to a
/// given class (one row per class (4
/// classes), one column per site).
unsigned int mVerbose; ///< If greater than zero prints more info
std::vector<double>
mPriors; ///< Computed priors (each points to a list, one for each site)
TreeAndSetsDependencies
mDependencies; ///< Dependency list for likelihood computation
ProbabilityMatrixSetBEB
mBEBset; ///< Probability matrix set to be used for likelihood computation
};
class MfgBayesTest {
public:
/// Constructor.
///
/// @param[in] aForest The forest
/// @param[in] aVerbose The verbosity level
/// @param[in] aNoReduction If true the dependencies computed are for no
/// reduced forests
///
explicit MfgBayesTest(Forest &aForest, unsigned int aVerbose = 0,
bool aNoReduction = true)
: mForest(aForest), mNumSites(mForest.getNumSites()),
mSiteClassProb(BEB_DIMS * mNumSites), mVerbose(aVerbose),
mPriors(mNumSites * BEB_NUM_CAT), mDependencies(mForest, aVerbose),
mBEBset(mForest.getNumBranches()) {
// Create the dependency list for forest likelihood computation
mDependencies.computeDependencies(1, aNoReduction);
if (mVerbose >= VERBOSE_ONLY_RESULTS)
mDependencies.print("TEST FOR BEB (before optimization)");
mDependencies.optimizeDependencies();
if (mVerbose >= VERBOSE_ONLY_RESULTS)
mDependencies.print("TEST FOR BEB");
}
/// Destructor.
///
~MfgBayesTest() {}
/// Bayes Empirical Bayes (BEB) test.
///
/// @param[in] aVars The variables optimized at the end of the H1 run.
/// @param[in] aFgBranchSet The foreground branch set under test.
/// @param[in] aScales The two scales ([0] bg; [1] fg) to rescale the branch
/// lengths. They are computed in H1.
///
void computeBEB(const std::vector<double> &aVars, std::set<int> aFgBranchSet,
const std::vector<double> &aScales);
/// Print the sites under positive selection.
///
/// @param[in] aFgBranchSet Set of the branches marked as foreground branch
///
void printPositiveSelSites(std::set<int> aFgBranchSet) const;
/// Extract the sites under positive selection and the corresponding
/// probabilities.
///
/// @param[out] aPositiveSelSites Vector of sites under positive selection
/// @param[out] aPositiveSelSitesProb Corresponding probabilities
///
void
extractPositiveSelSites(std::vector<unsigned int> &aPositiveSelSites,
std::vector<double> &aPositiveSelSitesProb) const;
private:
/// This sets up the grid (mPara[][]) according to the priors.
/// It calculates the probability of data at each site given w: f(f_h|w).
/// This is calculated using the branch model (NSsites = 0 model = 2), with
/// BayesEB=2 used to force the use of the correct scale factors in
/// GetPMatBranch().
///
///@verbatim
/// Order of site classes for iw or f(x_h|w):
/// back fore num.sets
/// Branchsite A (121 sets)
/// site class 0: w0 w0 10
/// site class 1: w1=1 w1=1 1
/// site class 2a: w0 w2 100
/// site class 2b: w1=1 w2 10
///@endverbatim
///
/// @param[in] aVars The variables optimized at the end of the H1 run.
/// @param[in] aSiteMultiplicity The site multiplicity vector.
/// @param[in] aFgBranchSet The foreground branches under test.
/// @param[in] aScales The two scales ([0] bg; [1] fg) to rescale the branch
/// lengths. They are computed in H1.
///
/// @return The computed scale.
///
double getGridParams(const std::vector<double> &aVars,
const std::vector<double> &aSiteMultiplicity,
std::set<int> aFgBranchSet,
const std::vector<double> &aScales);
/// This gives the indices (ix, iy) and the coordinates (aProbX, aProbY,
/// 1-aProbX-aProbY) for
/// the aTriangleIdx-th triangle, with aTriangleIdx from 0, 1, ...,
/// BEB_N1D*BEB_N1D-1.
///
/// The ternary graph (0-1 on each axis) is partitioned into BEB_N1D*BEB_N1D
/// equal-sized triangles.
/// In the first row (ix=0), there is one triangle (iy=0);
/// In the second row (ix=1), there are 3 triangles (iy=0,1,2);
/// In the i-th row (ix=i), there are 2*i+1 triangles (iy=0,1,...,2*i).
///
/// aProbX rises when ix goes up, but aProbY decreases when iy increases.
/// (aProbX, aProbY) is the
/// centroid in the ij-th small triangle. aProbX and aProbY each takes on
/// 2*BEB_N1D-1 possible values.
///
/// @param[out] aProbX The p0 value on the X axis of the triangular grid.
/// @param[out] aProbY The p1 value on the Y axis of the triangular grid.
/// @param[in] aTriangleIdx The index inside the triangular grid.
///
void getIndexTernary(double *aProbX, double *aProbY,
unsigned int aTriangleIdx);
private:
/// Disabled assignment operator to avoid warnings on Windows.
///
/// @fn BayesTest& operator=(const BayesTest& aObj)
///
/// @param[in] aObj The object to be assigned
///
/// @return The object receiving the assignment
///
MfgBayesTest &operator=(const MfgBayesTest &);
private:
const static unsigned int BEB_N1D = 10; ///< Number of intervals for w0 and w2
const static unsigned int BEB_DIMS =
4; ///< Number of codon classes (0, 1, 2a, 2b)
const static unsigned int BEB_NUM_CAT =
BEB_N1D + 1 + BEB_N1D * BEB_N1D + BEB_N1D; ///< Total number of categories
///for w0 and w2 (it is
///com.ncatG in codeml.c)
const static unsigned int BEB_NGRID =
Pow<BEB_N1D, BEB_DIMS>::value; ///< Number of points in the grid used to
///evaluate the integral. It is
///BEB_N1D^BEB_DIMS
private:
Forest &mForest; ///< The forest.
size_t mNumSites; ///< Number of sites.
std::vector<double> mSiteClassProb; ///< Probability of a site to pertain to a
///given class (one row per class (4
///classes), one column per site).
unsigned int mVerbose; ///< If greater than zero prints more info
std::vector<double>
mPriors; ///< Computed priors (each points to a list, one for each site)
TreeAndSetsDependencies
mDependencies; ///< Dependency list for likelihood computation
mfgProbabilityMatrixSetBEB
mBEBset; ///< Probability matrix set to be used for likelihood computation
};
#endif