-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSQS_boostrap.r
259 lines (231 loc) · 8.62 KB
/
SQS_boostrap.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# Update from Jon Tennant, 30/04/2018
# Removed recent bug that stopped this from running (dimnames causing a problem)
# Should now run full function, with bootstrap add-on
# sqs version 3.2 by John Alroy
# performs shareholder quorum subsampling on an array of specimen counts
# can be used to perform classical rarefaction instead of SQS
# written 29 July 2010; version 2.0 completed 14 February 2011; versions 3.0,
# 3.1, 3.2, and 3.3 written 3 June, 12 July, 19-23 August, and 2 December 2011
# change in version 3.3: fixed bug in computation of maximum estimated coverage
# changes in version 3.2: specimens drawn and subsampled U are based on
# geometric means of counts; ignore.singletons option added
# change in version 3.1: an even better frequency adjustment involving
# singletons and doubletons
# changes in version 3.0: an even better subsampling algorithm involving a new
# adjustment to u combined with a new throwback criterion
# changes in version 2.0: improved subsampling algorithm; including the dominant
# taxon is now the default; improved reporting of errors and basic statistics
# warning: do not use this program with taxonomic occurrence data drawn from
# multiple published references because it is not designed to count
# single-reference taxa or adjust for long taxonomic lists
# warning: version 1.0 yields estimates that are downwards-biased when q < 0.6
# and abundance distributions are highly uneven
# Read in data
data<-read.csv("file.csv",header=TRUE)
#Convert occurrence list to occurrence table
data.occs<-table(data[,"genus"])
sqs<-function(ab,q,trials,method,ignore.singletons,dominant) {
params <- array(data=NA,dim=0)
if (missing(trials)) {
trials <- 100
}
if (missing(method)) {
method <- ""
} else if (method != "" && method != "rarefaction" && method != "CR") {
return(print('If the method is rarefaction enter method="rarefaction" or "CR"',quote=F))
}
if ((q <= 0 || q >= 1) && method != "rarefaction" && method != "CR") {
return(print("If the method is SQS the quota must be greater than zero and less than one",quote=F))
} else if (q < 1 && (method == "rarefaction" || method == "CR")) {
return(print("If the method is rarefaction the quota must be an integer",quote=F))
}
ignore <- 0
if (! missing(ignore.singletons) && (ignore.singletons == T || ignore.singletons == "yes" || ignore.singletons == "y")) {
ignore <- 1
}
if (missing(dominant)) {
dominant <- 0
} else if (dominant != "" && dominant != "exclude" && dominant != "no") {
return(print('To exclude the dominant taxon, enter dominant="exclude" or "no"',quote=F))
}
# compute basic statistics
specimens <- sum(ab)
singletons <- 0
doubletons <- 0
highest <- 0
for (i in 1:length(ab)) {
if (ab[i] == 1) {
singletons <- singletons + 1
} else if (ab[i] == 2) {
doubletons <- doubletons + 1
}
if (ab[i] > highest) {
highest <- ab[i]
mostfrequent <- i
}
}
u <- 1 - singletons / specimens
# optionally, exclude the dominant taxon
if (dominant == "exclude" || dominant == "no") {
u <- 1 - singletons / (specimens - highest)
}
if (u == 0) {
return(print("Coverage is zero because all taxa are singletons",quote=F))
}
# compute raw taxon frequencies (temporarily)
freq <- ab / specimens
# standard recursive equation for Fisher's alpha
alpha <- 10
oldalpha <- 0
while (abs(alpha - oldalpha) > 0.0000001) {
oldalpha <- alpha
alpha <- length(ab) / log(1 + specimens/alpha)
}
params["raw richness"] <- length(ab)
params["subsampled richness"] <- u
params["subsampled richness"] <- NA
params["subsampled u"] <- NA
params["Chao 1"] <- length(ab) + singletons**2/(2* doubletons)
params["subsampled Chao 1"] <- NA
# governing parameter of the geometric series distribution
params["k"] <- abs(lm(log(sort(freq)) ~ c(1:length(freq)))$coefficients[2])
params["Fisher's alpha"] <- alpha
params["Shannon's H"] <- -1 * sum(freq * log(freq))
params["Hurlbert's PIE"] <- (1 - sum(freq**2)) * length(ab) / (length(ab) - 1)
params["dominance"] <- highest / specimens
params["specimens"] <- specimens
params["singletons"] <- singletons
params["doubletons"] <- doubletons
params["specimens drawn"] <- 0
if (dominant != "exclude" && dominant != "no") {
highest <- 0
mostfrequent <- 0
}
# return if the rarefaction quota is equal to or higher than the
# specimen count
if (method == "rarefaction" && q >= specimens - highest) {
return(params)
}
# compute taxon frequencies (tweak added in version 3.1)
freq <- ab - (singletons + doubletons / 2) / length(ab)
freq <- freq / (specimens - highest)
# return if the quorum target is higher than estimated coverage
if ((q > sum(freq) && method != "rarefaction" && method != "CR") || (q >= sum(ab))) {
return(params)
}
# create an array in which each cell corresponds to one specimen
ids <- array()
n <- 0
for (i in 1:length(ab)) {
for (j in 1:ab[i]) {
n <- n + 1
ids[n] <- i
}
}
# subsampling trial loop
# s will be the subsampled taxon count
s <- array(rep(0,trials))
drawn <- array(rep(0,trials))
mostfrequentdrawn <- array(rep(0,trials))
subsingle <- array(rep(0,trials))
subdouble <- array(rep(0,trials))
subchao <- array(rep(0,trials))
for (trial in 1:trials) {
pool <- ids
left <- length(pool)
seen <- array(data=rep(0,length(ab)))
subfreq <- array(rep(0,length(ab)))
if (method != "rarefaction" && method != "CR") {
udrawn <- 0
# equation new to version 3.0
# the exponent corrects for downwards bias
while (udrawn < q) {
# draw a specimen
x <- floor(runif(1,min=1,max=left+1))
# add to frequency and taxon sums if species has
# not been drawn previously
subfreq[pool[x]] <- subfreq[pool[x]] + 1
if (seen[pool[x]] == 0) {
if (pool[x] != mostfrequent && (ignore == 0 || ab[pool[x]] > 1)) {
udrawn <- udrawn + freq[pool[x]]
}
seen[pool[x]] <- 1
# randomly throw back some draws that put the sum over q
# (an even better algorithm added in version 3.0)
if (runif(1) <= freq[pool[x]] || udrawn < q) {
s[trial] <- s[trial] + 1
} else {
subfreq[pool[x]] <- subfreq[pool[x]] - 1
}
}
# decrease pool of specimens not yet drawn
pool[x] <- pool[left]
left <- left - 1
}
} else {
i <- 0
draws <- 0
while (i < q) {
draws <- draws + 1
x <- floor(runif(1,min=1,max=length(ids)-draws+2))
subfreq[pool[x]] <- subfreq[pool[x]] + 1
if (pool[x] != mostfrequent) {
i <- i + 1
}
if (seen[pool[x]] == 0) {
seen[pool[x]] <- 1
s[trial] <- s[trial] + 1
}
pool[x] <- pool[length(ids)-draws+1]
}
}
for (i in 1:length(ab)) {
if (subfreq[i] == 1 && i != mostfrequent) {
subsingle[trial] <- subsingle[trial] + 1
} else if (subfreq[i] == 2 && i != mostfrequent) {
subdouble[trial] <- subdouble[trial] + 1
}
}
if (subsingle[trial] > 0 && subdouble[trial] > 0) {
subchao[trial] <- s[trial] + subsingle[trial]**2/(2*subdouble[trial])
} else {
subchao[trial] <- s[trial]
}
drawn[trial] <- sum(subfreq)
if (mostfrequent != 0) {
mostfrequentdrawn[trial] <- subfreq[mostfrequent]
}
}
# compute vectors of non-zero counts
options(warn=-1)
s2 <- sort(sqrt(s-1))^2+1
d2 <- sort(sqrt(drawn-1))^2+1
m2 <- sort(sqrt(mostfrequentdrawn-1))^2+1
ss2 <- sort(sqrt(subsingle-1))^2+1
options(warn=0)
# compute geometric means
params["subsampled richness"] <- exp(mean(log(s2))) * length(s2)/length(s)
params["specimens drawn"] <- exp(mean(log(d2))) * length(d2)/length(drawn)
meanmost <- 0
if (sum(mostfrequentdrawn) > 0) {
meanmost <- exp(mean(log(m2))) * length(m2)/length(mostfrequentdrawn)
}
meansubsingle <- exp(mean(log(ss2))) * length(ss2)/length(subsingle)
params["subsampled u"] <- 1 - meansubsingle / (params["specimens drawn"] - meanmost)
params["subsampled Chao 1"] <- exp(mean(log(subchao)))
return(params)
}
# Check real value with a quorum of 0.4
sqs(data.occs,0.4)['subsampled richness']
# Bootstrap
s<-rep(NA,1000)
for(i in 1:length(s)){
bootstrapped<-data.occs[sample(1:length(data.occs),replace=TRUE)]
s[i]<-sqs(bootstrapped,0.4)['subsampled richness']
}
# To remove any NAs (bootstrapping can produce a lot of pseudo-singletons)
s<-na.omit(s)
s<-as.numeric(s)
# Returns the median and 95% confidence intervals
quantile(s,probs=c(0.5,0.025,0.975))
hist(s)