-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget.quantiles.R
171 lines (141 loc) · 8.42 KB
/
get.quantiles.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
'get.quantiles' <- function(postfix, maxfiles = NULL, sourcepath = 'x:/LCC/GIS/Final/ecoRefugia/ecoConnect_final/') {
# Process samples from one or more sample.for.quantiles runs
# Arguments:
# postfix postfix of sample files
# maxfiles the maximum number of files to sample
# sourcepath path to sample files from sample.for.quantiles
#
# Reads:
# stateinfo.txt table of state classes and state names
# hucinfo.txt table of HUC8 classes and 8 digit HUC8 code
# all <sourcepath>ec_samples_<postfix>*.RDS, which contain:
# samples 4d array of sampled values (n x acres x systems x all/best)
# statehuc array of state and HUC for each sample (n x 2)
# acres vector of sampled block sizes
# layers vector of system names
#
# Result (written to sourcepath):
# ecoConnect_quantiles.RDS
# RDS with three arrays
# full, state, huc
# each with 5 dimensions
# 1. region (full = 1, state = 14, huc = 245)
# 2. acres (5 or 6)
# 3. systems (4)
# 4. all/best (2)
# 5. percentiles (100)
# and 2 tables
# stateinfo and hucinfo
# sample_sizes_full.txt, sample_sizes_final_full.txt
# sample_sizes_state.txt, sample_sizes_final_state.txt
# sample_sizes_huc.txt, sample_sizes_final_huc.txt
# tables of region/state name/huc ID by acres with actual sample sizes and final sample sizes after sweeping
#
# realized.acres (printed to console)
# a conversion from nominal acres to actual acerage of pseudoparcels. Use this in make.report.percentiles when interpolating percentiles for a given parcel.
#
# Note: this is a true memory hog, so run it on an idle machine. The chain of rbinds is ugly, thus slow. You can only run for about 100 x 1e5 sample files in 128 GB!
#
# B. Compton, 18 Oct 2024: split into sample.for.quantiles and get.quantiles from ecoConnect.quantiles
library(lubridate)
library(abind)
source('sweep.quantiles.R')
launch <- now()
ts <- Sys.time()
sinfo <- read.table(paste0(sourcepath, 'stateinfo.txt'), sep = '\t', header = TRUE) # we'll use these for sample size tables
hinfo <- read.table(paste0(sourcepath, 'hucinfo.txt'), sep = '\t', header = TRUE)
files <- list.files(sourcepath, paste0('ec_samples_', postfix, '.*\\.RDS$'))
cat('Found ', length(files), ' sample files\n', sep = '')
if(!is.null(maxfiles)) {
files <- files[1:min(maxfiles, length(files))]
cat('Using first ', length(files), 'files\n', sep = '')
}
for(i in 1:length(files)) {
cat(paste0('Reading ',sourcepath, files[i], '...\n'))
x <- readRDS(paste0(sourcepath, files[i]))
if(i == 1) {
z <- x$samples
statehuc <- x$statehuc
acres <- x$acres
layers <- x$layers
realized.acres <- x$realized.acres
}
else {
z <- abind(z, x$samples, along = 1)
statehuc <- rbind(statehuc, x$statehuc)
}
cat('Total samples: ', format(dim(z)[1], big.mark = ','), ' (', i, ' of ', length(files), ' files)\n', sep = '')
}
# Intermediate result z is 4d array of row x acres x systems x all/best
# Result quantiles.* are 5d arrays of region x acres x systems x all/best x 100 with percentiles
# Result samples.* are 2d arrays of region x acres with sample sizes
cat('Calculating percentiles...\n')
rc <- c(1, max(sinfo$class), max(hinfo$class)) # number of classes in each region (1 for full, 14 states, 245 HUCs)
# Now take percentiles and sample sizes
for(h in 1:length(rc)) { # For each set of regions,
dn <- list(regions = 1:rc[h], acres = gsub(' ', '', format(acres)), layers = layers,
all.best = c('all', 'best'), percentile = 1:100) # array names
qu <- array(NA, dim = c(rc[h], length(acres), length(layers), 2, 100), dimnames = dn) # quantiles for region set
ss <- array(NA, dim = c(rc[h], length(acres))) # sample sizes for region set
for(i in 1:rc[h]) { # For each region,
if(h == 1) # Select samples that fall in region
y <- z
else {
if(!i %in% statehuc[, h - 1]) # just to be safe, though all states and HUCs should be represented
next
y <- z[statehuc[, h - 1] == i, , , , drop = FALSE]
}
for(j in 1:dim(y)[2]) { # For each block size (we might not have all acerages),
ss[i, j] <- sum(!is.na(y[, j, 1, 1])) # collect sample sizes (identical across systems and all/best)
for(k in 1:length(layers)) { # For each layer,
for(l in 1:2) { # For all/best,
qu[i, j, k, l, ] <- quantile(y[, j, k, l],
prob = seq(0.01, 1, by = 0.01),
na.rm = TRUE, names = FALSE) # take percentiles
}
}
}
}
ss[is.na(ss)] <- 0
switch(h, { # save results to appropriate region variable
quantiles.full <- qu
samples.full <- data.frame(ss)
names(samples.full) <- acres
},
{
quantiles.state <- qu
samples.state <- data.frame(cbind(sinfo$postal[match(sinfo$class, 1:dim(sinfo)[1])]), ss)
names(samples.state) <- c('Postal', acres)
},
{
quantiles.huc <- qu
samples.huc <- data.frame(cbind(hinfo$HUC8_code[match(hinfo$class, 1:dim(hinfo)[1])]), ss)
names(samples.huc) <- c('HUC8_code', acres)
})
cat('Finished calculating quantiles for ', c('full region', 'states', 'HUC8s')[h], '.\n', sep = '')
}
x <- sweep.quantiles(quantiles.full, samples.full)
quantiles.full <- x$quantiles
samples.full.final <- x$final.samples
x <- sweep.quantiles(quantiles.state, samples.state)
quantiles.state <- x$quantiles
samples.state.final <- x$final.samples
x <- sweep.quantiles(quantiles.huc, samples.huc)
quantiles.huc <- x$quantiles
samples.huc.final <- x$final.samples
cat('Writing results...\n')
if(postfix != '')
postfix <- paste0('_', postfix)
x <- list(full = quantiles.full, state = quantiles.state, huc = quantiles.huc, stateinfo = sinfo, hucinfo = hinfo)
saveRDS(x, f <- paste0(sourcepath, 'ecoConnect_quantiles', postfix, '.RDS')) # write quantiles to RDS
write.table(samples.full, paste0(sourcepath, 'sample_sizes_full', postfix, '.txt'), sep = '\t', row.names = FALSE, quote = FALSE) # write sample size text files
write.table(samples.state, paste0(sourcepath, 'sample_sizes_state', postfix, '.txt'), sep = '\t', row.names = FALSE, quote = FALSE)
write.table(samples.huc, paste0(sourcepath, 'sample_sizes_huc', postfix, '.txt'), sep = '\t', row.names = FALSE, quote = FALSE)
write.table(samples.full.final, paste0(sourcepath, 'sample_sizes_final_full', postfix, '.txt'), sep = '\t', row.names = FALSE, quote = FALSE) # write sample size text files
write.table(samples.state.final, paste0(sourcepath, 'sample_sizes_final_state', postfix, '.txt'), sep = '\t', row.names = FALSE, quote = FALSE)
write.table(samples.huc.final, paste0(sourcepath, 'sample_sizes_final_huc', postfix, '.txt'), sep = '\t', row.names = FALSE, quote = FALSE)
elapsed <- format(seconds_to_period(round(as.duration(interval(ts, Sys.time())), 1)))
cat('Results written to ', f, '\n', sep = '')
cat('and sample sizes to ', paste0(sourcepath, 'sample_sizes_*', postfix, '.txt'), '\n')
cat('Total time taken: ', elapsed, '\n', sep = '')
}