-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path2_variable_importance.rmd
182 lines (138 loc) · 5.67 KB
/
2_variable_importance.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
2. variable importance and model summaries
============
Load the required libraries:
```{r load_libs, message=FALSE, warning=FALSE}
library(dismo, quietly=T)
library(phytools, quietly = T)
library(phylolm, quietly = T)
library(dplyr, quietly = T)
source("../script/Data.R")
```
Initialize global variables:
```{r globals}
REPO_HOME <- paste(getwd(),'/../',sep='')
```
Load the list of taxa:
```{r taxa}
taxa.names <- scan(paste(REPO_HOME, "/data/filtered/taxa.txt", sep = ""), sep = "\n", what = character())
```
Define the list of GIS layers, i.e. the traits:
```{r layers}
gis.layers <- raster::getData(
"worldclim",
var = "bio",
res = 5,
path = paste(REPO_HOME, "/data/GIS", sep = ""),
download = T
)
# The location of the TIFF file with the stacked layers described directly above
files.names <- list.files(paste(REPO_HOME, "/data/GIS/5_deg", sep = ""))
# Turn the file names into layer names: strip the prefix (which might include
# the resolution) and strip the file extension
gis.layers.names <- files.names
gis.layers.names <- gsub('current_5arcmin_','',gis.layers.names)
gis.layers.names <- gsub('.tif','',gis.layers.names)
# Combine the layer names with those we've already read from BIOCLIM
gis.layers.names <- c(names(gis.layers),gis.layers.names)
# Iterate over files
for (i in 1:length(files.names)) {
# Stack with previously read layers
gis.layers <- stack(
gis.layers,
# Read as raster
raster(
# Construct file name
paste(REPO_HOME, "/data/GIS/5_deg/", files.names[i], sep = "")
)
)
}
# Apply all names
names(gis.layers) <- gis.layers.names
plot(gis.layers)
```
Contribution of layers to maxent models
---------------------------------------
Populate a data.frame of taxa x layers and record the contribution of each of these layers:
```{r models}
traits.contributions.csv <- paste(REPO_HOME, "results/maxent/model_summaries/traits_contribution_maxent.csv", sep="")
if (!file.exists(traits.contributions.csv)) {
# instantiate variable contributions frame from a matrix of taxa x layers
traits.contributions <- data.frame(matrix(
nrow = length(taxa.names),
ncol = length(gis.layers.names),
dimnames = list(taxa.names,gis.layers.names))
)
# start iterating over taxa
for (i in 1: length(taxa.names)) {
# load maxent model if it exists
maxent.model <- get_maxent_model( REPO_HOME, taxa.names[i] )
if (!is.null(maxent.model)) {
# the plot function returns a named list with the variable contributions
var.contrib <- plot(maxent.model)
# iterate over layers, store variable contribution
for ( j in 1:length(gis.layers.names) ) {
if ( gis.layers.names[j] %in% colnames(maxent.model@presence) ) {
traits.contributions[i,j] <- var.contrib[gis.layers.names[j]]
}
}
}
}
write.csv(traits.contributions, traits.contributions.csv)
}
traits.contributions <- read.csv(traits.contributions.csv)
```
Now let's summarize the variable importance:
```{r contributions}
# make a box plot. as a side effect, the summary statistics are returned
traits.contributions.summary <- boxplot.default(traits.contributions[2:42], varwidth = T, outline = F, col = c("darkorange", "firebrick1", "goldenrod1"))
mean.traits.contributions.csv <- paste(REPO_HOME, "results/maxent/model_summaries/mean_traits_contribution_maxent.csv", sep="")
if (!file.exists(mean.traits.contributions.csv)) {
# we will populate a data frame with the returned summary statistics
traits.contributions.df <- data.frame(matrix(
nrow = length(gis.layers.names),
ncol = 2,
dimnames = list(gis.layers.names,c("mean","n")))
)
for ( i in 1:length(traits.contributions.summary$names) ) {
traits.contributions.df[i,1] <- traits.contributions.summary$stats[i*5-2]
traits.contributions.df[i,2] <- traits.contributions.summary$n[i]
}
write.csv(traits.contributions.df, mean.traits.contributions.csv)
}
```
As a last step we summarize all models: AUC values, number of occurrence points and the descending order of variable importance per model.
```{r model summaries}
output.csv <- paste(REPO_HOME, "/results/maxent/model_summaries/summary_df.csv", sep = "")
if (!file.exists(output.csv)) {
# create empty dataframe with columns "Species_name", "AUC values, "n", "important_variables"
df.summary <- data.frame(matrix(ncol = 4, nrow = length(taxa.names)))
x <- c("species", "n", "AUC", "important_variables")
colnames(df.summary) <- x
# load AUC df
df.auc <- sprintf("%s/Results/maxent/model_summaries/AUCvalues.csv", REPO_HOME)
df.auc <- read.csv(df.auc, header = T)
# load trait contribution
df.traits <- sprintf("%s/Results/maxent/model_summaries/traits_contribution_maxent.csv", REPO_HOME)
df.traits <- read.csv(df.traits, header = T, row.names = 1)
y <- c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41")
colnames(df.traits) <- y
for (i in 1:length(taxa.names)) {
# set species name
df.summary[i, 1] <- taxa.names[i]
# count number of occurence points
csv.file <- sprintf("%s/data/filtered/%s.csv", REPO_HOME, taxa.names[i])
csv <- read.csv(csv.file, header = T)
df.summary[i, 2] <- nrow(csv)
# select the AUC value
df.summary[i, 3] <- df.auc[i, "trainingAUC"]
# order the trait contribution
traits.row <- df.traits[i, ]
traits.row <- na.omit(t(traits.row))
traits.row <- traits.row[order(traits.row, decreasing = TRUE), ]
traits.row <- as.data.frame(traits.row)
traits.row <- rownames(traits.row)
df.summary[i, 4] <- paste(traits.row, collapse = ",")
}
write.csv(df.summary, output.csv)
}
```