-
Notifications
You must be signed in to change notification settings - Fork 0
/
Arp53D_populationCage_functions.R
306 lines (271 loc) · 17.6 KB
/
Arp53D_populationCage_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
########### functions for mating infinite size population
##### mendelTable: set up a table that shows, for every possible pairwise combination of parent genotypes, the offspring genotype frequencies under Mendelian inheritance:
genotypePossibilities <- c("WThom", "het", "KOhom")
mendelTable <- data.frame(femaleID=rep(genotypePossibilities, each=3),
maleID=rep(genotypePossibilities, 3))
nextGenColumnNames <- paste("nextGen_",genotypePossibilities, sep="")
mendelTable[,nextGenColumnNames] <- NA
# mother is WThom
mendelTable[which(mendelTable[,"femaleID"]=="WThom" & mendelTable[,"maleID"]=="WThom"),
nextGenColumnNames] <- c(1,0,0)
mendelTable[which(mendelTable[,"femaleID"]=="WThom" & mendelTable[,"maleID"]=="het"),
nextGenColumnNames] <- c(0.5,0.5,0)
mendelTable[which(mendelTable[,"femaleID"]=="WThom" & mendelTable[,"maleID"]=="KOhom"),
nextGenColumnNames] <- c(0,1,0)
# mother is het
mendelTable[which(mendelTable[,"femaleID"]=="het" & mendelTable[,"maleID"]=="WThom"),
nextGenColumnNames] <- c(0.5,0.5,0)
mendelTable[which(mendelTable[,"femaleID"]=="het" & mendelTable[,"maleID"]=="het"),
nextGenColumnNames] <- c(0.25,0.5,0.25)
mendelTable[which(mendelTable[,"femaleID"]=="het" & mendelTable[,"maleID"]=="KOhom"),
nextGenColumnNames] <- c(0,0.5,0.5)
# mother is KOhom
mendelTable[which(mendelTable[,"femaleID"]=="KOhom" & mendelTable[,"maleID"]=="WThom"),
nextGenColumnNames] <- c(0,1,0)
mendelTable[which(mendelTable[,"femaleID"]=="KOhom" & mendelTable[,"maleID"]=="het"),
nextGenColumnNames] <- c(0,0.5,0.5)
mendelTable[which(mendelTable[,"femaleID"]=="KOhom" & mendelTable[,"maleID"]=="KOhom"),
nextGenColumnNames] <- c(0,0,1)
rm(nextGenColumnNames)
####### mateInfiniteSizePopulation - a function that takes as input
## (a) parental genotypes (b) table of offspring genotype freqs for each mating pair type
## it first figures out frequencies of each possible pairwise combination of mating pair genotypes (P-Gmat x Gpat)
## then for each mating pair type, it figures out frequency of offspring genotypes (O-wt, O-het, O-ko)
## it takes the product of those frequencies (PxO) and sums across all mating pair types to get the overall frequency of each genotype in the next generation
# (a) the first input, the parental genotypes, is a list of lists: first level of the list is female/male, each of which is a list with three elements whose values are genotype frequencies (female frequencies must sum to 1, same for male)
mateInfiniteSizePopulation <- function(genotypeFreqList, mendel=mendelTable) {
## make sure genotypes look OK
verifyGenotypeFreqList(genotypeFreqList)
# we will mate each combination of parents separately and add up the resulting offspring
nextGen <- data.frame(femaleID= rep(names(genotypeFreqList[["female"]]),each=3),
femaleFreq=rep(unlist(genotypeFreqList[["female"]], use.names=FALSE),each=3),
maleID= rep(names(genotypeFreqList[["male"]]), 3),
maleFreq=rep(unlist(genotypeFreqList[["male"]], use.names=FALSE), 3))
nextGen[,"parentCombFreq"] <- nextGen[,"femaleFreq"] * nextGen[,"maleFreq"]
if(!identical (mendel[,"femaleID"], nextGen[,"femaleID"])) {
cat("\n\nERROR - something is wrong in mateInfiniteSizePopulation - nextGen table and mendel table have different row orders\n\n")
return(list(mendel=mendel, nextGen=nextGen))
}
if(!identical (mendel[,"maleID"], nextGen[,"maleID"])) {
cat("\n\nERROR - something is wrong in mateInfiniteSizePopulation - nextGen table and mendel table have different row orders\n\n")
return(list(mendel=mendel, nextGen=nextGen))
}
nextGenColumnNames <- colnames(mendel)[3:5]
# check row totals of mendel table add up to 1
if (sum(apply(mendel[,nextGenColumnNames], 1, sum) != 1) > 0) {
badRows <- which(apply(mendel[,nextGenColumnNames], 1, sum) != 1)
cat("\n\nERROR in mateInfiniteSizePopulation - one or more rows of the mendel table do not sum to 1 - here they are:\n\n")
print(mendel[badRows,])
cat("\n\n")
stop("\n\nERROR in mateInfiniteSizePopulation - see above\n\n")
}
# get offspring freqs for each row using mendelian genetics
nextGen[,nextGenColumnNames] <- NA
for (i in 1:9) {
nextGen[i,nextGenColumnNames] <- nextGen[i,"parentCombFreq"] * mendel[i,nextGenColumnNames]
}
genotypeFreqTotals <- apply(nextGen[,nextGenColumnNames], 2, sum)
names(genotypeFreqTotals) <- gsub("nextGen_","",names(genotypeFreqTotals))
return(list(table=nextGen, totals=genotypeFreqTotals))
}
#### verifyGenotypeFreqList - a function to check that the genotype frequencies list looks as it should
verifyGenotypeFreqList <- function(genotypeFreqList, anyGenotype=genotypePossibilities, errorTolerance=0.0000001) {
if (length(genotypeFreqList) != 2) {
stop("\n\nERROR in verifyGenotypeFreqList - it should be a list object with two elements (female and male)\n\n")
}
if (sum(c("female","male") %in% names(genotypeFreqList)) != 2) {
stop("\n\nERROR in verifyGenotypeFreqList - elements of the genotype frequency list should be named female and male\n\n")
}
if (length(genotypeFreqList[["female"]]) != 3) {
stop("\n\nERROR in verifyGenotypeFreqList - female genotype frequencies should have 3 elements\n\n")
}
if (length(genotypeFreqList[["male"]]) != 3) {
stop("\n\nERROR in verifyGenotypeFreqList - male genotype frequencies should have 3 elements\n\n")
}
genotypeFreqSums <- lapply(genotypeFreqList, function(x) { sum(unlist(x)) })
## I think rounding errors mean these don't always QUITE sum to 1, so I'll be a little tolerant
if (abs(1-genotypeFreqSums[["female"]]) > errorTolerance) {
stop("\n\nERROR in verifyGenotypeFreqList - female genotype frequencies should sum to 1\n\n")
}
if (abs(1-genotypeFreqSums[["male"]]) > errorTolerance) {
stop("\n\nERROR in verifyGenotypeFreqList - male genotype frequencies should sum to 1\n\n")
}
if (sum(anyGenotype %in% names(genotypeFreqList[["female"]])) != 3) {
stop("\n\nERROR in verifyGenotypeFreqList - elements of the female genotype frequencies list should be named",anyGenotype,"\n\n")
}
if (sum(anyGenotype %in% names(genotypeFreqList[["male"]])) != 3) {
stop("\n\nERROR in verifyGenotypeFreqList - elements of the male genotype frequencies list should be named",anyGenotype,"\n\n")
}
return(TRUE)
}
#### getAlleleFreqsGivenGenotypeFreqs: a function to get the two allele frequencies, given a simple numeric vector of the three genotype frequencies
getAlleleFreqsGivenGenotypeFreqs <- function(genotypes, expectedGenotypes=genotypePossibilities) {
if(class(genotypes)=="list") { genotypes <- unlist(genotypes) }
## make sure the genotype freqs have the expected names:
if(!identical(sort(expectedGenotypes), sort(names(genotypes)))) {
stop("\n\nERROR in getAlleleFreqsGivenGenotypeFreqs function - I do not see all the expected genotype classes in the genotypes you supplied: should have:",expectedGenotypes,"\n\n")
}
alleleFreqs <- numeric()
alleleFreqs["WT"] <- ((2 * genotypes["WThom"]) + genotypes["het"]) / 2
alleleFreqs["KO"] <- ((2 * genotypes["KOhom"]) + genotypes["het"]) / 2
return(alleleFreqs)
}
#### runInfinitePopulationModelling - a function to iterate mateInfiniteSizePopulation over multiple generations, perhaps applying selection to the offspring genotypes at each generation
runInfinitePopulationModelling <- function(genotypeFreqsInitial,
numGenerations=NULL,
selectionCoefficients=NULL,
debug=FALSE) {
initialPopulation <- list()
initialPopulation[[1]] <- list()
initialPopulation[[1]][["totals"]] <- genotypeFreqsInitial
initialPopulation[[1]][["alleleFreqs"]] <- apply(sapply(genotypeFreqsInitial, function(x) {
getAlleleFreqsGivenGenotypeFreqs(unlist(x))
}), 1, mean)
## at first I tried normalizing the selectionCoefficients so that they add up to 1, but I realised it is better to do this on the selected genotypes: that works better especially if some genotypes have frequency=0
#if(!is.null(selectionCoefficients)) {
# normalizedSelectionCoefficients <- selectionCoefficients / mean(selectionCoefficients)
#}
outputPopulations <- initialPopulation
for (i in 2:numGenerations) {
if (debug) {cat("generation",i,"\n")}
prevGenGenotypeFreqs <- outputPopulations[[(i-1)]][["totals"]]
# when previous generation is 1, prevGenGenotypeFreqs looks a bit different, as we separately specified male and female
if( !"female" %in% names(prevGenGenotypeFreqs) ) {
prevGenGenotypeFreqs <- list(female=prevGenGenotypeFreqs, male=prevGenGenotypeFreqs)
}
outputPopulations[[i]] <- mateInfiniteSizePopulation(prevGenGenotypeFreqs)
if(!is.null(selectionCoefficients)) {
## we use selectionCoefficients to adjust the genotype totals of the offspring.
if(!identical(names(selectionCoefficients),
names(outputPopulations[[i]][["totals"]]))) {
stop("\n\nERROR in runInfinitePopulationModelling - the names of the selection coefficients are not what we expected\n\n")
}
outputPopulations[[i]][["totalsBeforeSelection"]] <- outputPopulations[[i]][["totals"]]
newGenotypeTotals <- selectionCoefficients * outputPopulations[[i]][["totals"]]
# normalize so that newGenotypeTotals still adds up to 1
newGenotypeTotals <- newGenotypeTotals / sum(newGenotypeTotals)
outputPopulations[[i]][["totals"]] <- newGenotypeTotals
}
outputPopulations[[i]][["alleleFreqs"]] <- getAlleleFreqsGivenGenotypeFreqs(outputPopulations[[i]][["totals"]])
rm(prevGenGenotypeFreqs)
}
return(outputPopulations)
}
### summarizeInfinitePopulationModelling - a function to summarize genotype and allele frequencies from runInfinitePopulationModelling
summarizeInfinitePopulationModelling <- function(modellingOutput) {
summaryTable <- data.frame(generation=1:length(modellingOutput))
summaryTable <- cbind(summaryTable,
as.data.frame(t(sapply(modellingOutput, "[[", "alleleFreqs"))))
genotypeFreqs <- as.data.frame(t(sapply(modellingOutput, function(x) {
g <- x[["totals"]]
# the unusual case where we have separate male and female freqs (we assume males and females are at equal freq):
if("female" %in% names(g)) { g <- (unlist(g[["female"]]) + unlist(g[["male"]]))/2 }
return(g)
})))
summaryTable <- cbind(summaryTable, genotypeFreqs)
return(summaryTable)
}
#### runModellingOnMultipleSelectiveRegimes is a function to run runInfinitePopulationModelling and summarizeInfinitePopulationModelling on a number of selective regimes.
# I only keep genotype and allele freq totals, not the other information that runInfinitePopulationModelling yields at each generation
runModellingOnMultipleSelectiveRegimes <- function(selectionTable=NULL,
initialGenotypeFreqs=NULL,
numGenerations=NULL,
debug=FALSE) {
if(is.null(selectionTable)) {
stop("\n\nERROR in runModellingOnMultipleSelectiveRegimes - must supply a table of selective pressures using selectionTable argument\n\n")
}
if(is.null(initialGenotypeFreqs)) {
stop("\n\nERROR in runModellingOnMultipleSelectiveRegimes - must supply initial genotype frequencies using initialGenotypeFreqs argument\n\n")
}
if(is.null(numGenerations)) {
stop("\n\nERROR in runModellingOnMultipleSelectiveRegimes - must supply number of generations using numGenerations argument\n\n")
}
resultsEachSelectiveRegime <- lapply(1:dim(selectionTable)[1], function(i){
thisModel <- runInfinitePopulationModelling(
initialGenotypeFreqs,
numGenerations=numGenerations,
selectionCoefficients=unlist(selectionTable[i,c("WThom","het","KOhom")]),
debug=debug)
summarizedResults <- summarizeInfinitePopulationModelling(thisModel)
summarizedResults[,"selectionWThom"] <- selectionTable[i,"WThom"]
summarizedResults[,"selectionHet"] <- selectionTable[i,"het"]
summarizedResults[,"selectionKOhom"] <- selectionTable[i,"KOhom"]
summarizedResults[,"selectionRegime"] <- selectionTable[i,"regime"]
return(summarizedResults)
})
resultsEachSelectiveRegime <- do.call("rbind",resultsEachSelectiveRegime)
resultsEachSelectiveRegime[,"selectionRegime"] <- factor(resultsEachSelectiveRegime[,"selectionRegime"],
levels=c("het_equalsWThom", "het_intermediate", "het_equalsKOhom"))
return(resultsEachSelectiveRegime)
}
######## a couple of functions to check the fit to the real data for each model.
##### calculateModelFitsOneModel gets the MAE score of a single model
## I use MAE (mean absolute error) to understand how well a model fits the real data, considering all data points supplied in realDataTable. Small numbers of MAE are good!
## I was initially playing around with fitting to the mean at every generation, or the mean at a single generation, but I realized MAE considering all data points is probably better
## we only consider the fit of WThom frequencies, because in our real experiment that's the data we actually have.
calculateModelFitsOneModel <- function(oneModelResults,realDataTable) {
allFits <- list()
### calculate overall fits using mean absolute error of model versus realDataTable
individualDiffs <- numeric()
for (thisGen in unique(realDataTable$generation)) {
thisGenModelResults <- oneModelResults %>%
filter(generation==thisGen)
if(dim(thisGenModelResults)[1] > 1) {
stop("\n\nERROR in calculateFitEachModel - there is more than one row in the model results from generation",thisGen,"\n\n")
}
if(dim(thisGenModelResults)[1] < 1) {
stop("\n\nERROR in calculateFitEachModel - there are no rows in the model results from generation",thisGen,"\n\n")
}
thisGenModelResults <- thisGenModelResults[1,"WThom"]
thisGenRealResults <- realDataTable %>% filter(generation==thisGen) %>% select(freqWThom) %>% unlist()
thisGenModelResults <- rep(thisGenModelResults, length(thisGenRealResults))
individualDiffs <- c(individualDiffs, abs(thisGenModelResults - thisGenRealResults) )
}
allFits[["meanAbsErr"]] <- mean(individualDiffs)
allFits <- unlist(allFits)
return(allFits)
}
###### calculateModelFitsMultipleModels is a function to split modelling results table into results from individual models, then get the fit for each model to the real data
calculateModelFitsMultipleModels <- function(allModelsResults,
realDataTable) {
# first, split the results table into results for each model separately.
# Together, selectionRegime and selectionKOhom should specify a unique selection regime. If they don't, I'll get an error from calculateModelFitsOneModel because there'll be >1 row for each generation
eachModelList <- split( allModelsResults, list(allModelsResults$selectionRegime, allModelsResults$selectionKOhom))
# for each modelling setup, get the fit
eachModelFits <- lapply(eachModelList, function(x) {
myFitsDF <- x[1,c("selectionWThom", "selectionHet", "selectionKOhom", "selectionRegime")]
myFits <- calculateModelFitsOneModel(x, realDataTable=realDataTable)
myFitsDF[,"fitType"] <- names(myFits)
myFitsDF <- cbind(myFitsDF, myFits)
return(myFitsDF)
})
eachModelFits <- do.call("rbind",eachModelFits)
row.names(eachModelFits) <- NULL
return(eachModelFits)
}
##### given a bunch of modelling results and their first, choose the best one for a given selectionRegime (e.g. het_equalsWThom) and fitType, which is likely meanAbsErr, but in previous versions I was fitting to the mean at generation 15 ('gen15') etc
getBestModelResults <- function(allModelResults,
allModelResultsFitTable,
modelTypeToChoose,
fitTypeToChoose="meanAbsErr") {
bestModel <- allModelResultsFitTable %>%
filter(fitType==fitTypeToChoose & selectionRegime==modelTypeToChoose) %>%
filter(myFits == min(myFits))
# check there's a single best model
if(dim(bestModel)[1]>1) {
stop("\n\nERROR in getModelResultsForSingleSelectionRegime - found >1 best model - don't know how to proceed\n\n")
}
if(dim(bestModel)[1]<1) {
stop("\n\nERROR in getModelResultsForSingleSelectionRegime - found NO best model - don't know how to proceed\n\n")
}
# now get results for that best model
bestModelResults <- allModelResults %>%
filter(selectionRegime == modelTypeToChoose) %>%
filter(selectionWThom == as.numeric(bestModel$selectionWThom)) %>%
filter(selectionHet == as.numeric(bestModel$selectionHet)) %>%
filter(selectionKOhom == as.numeric(bestModel$selectionKOhom))
# add the fitType and the fitScore as I may want it later:
bestModelResults[,"fitType"] <- fitTypeToChoose
bestModelResults[,"fitScore"] <- bestModel[1,"myFits"]
return(bestModelResults)
}