doc/2_variable_importance.rmd


2. variable importance and model summaries
============

Load the required libraries:

```{r load_libs, message=FALSE, warning=FALSE}
library(dismo, quietly=T)
library(phytools, quietly = T)
library(phylolm, quietly = T)
library(dplyr, quietly = T)
source("../script/Data.R")
```

Initialize global variables:

```{r globals}
REPO_HOME <- paste(getwd(),'/../',sep='')
```

Load the list of taxa:

```{r taxa}
taxa.names <- scan(paste(REPO_HOME, "/data/filtered/taxa.txt", sep = ""), sep = "\n", what = character())
```

Define the list of GIS layers, i.e. the traits:

```{r layers}
gis.layers <- raster::getData(
  "worldclim",
  var = "bio",
  res = 5,
  path = paste(REPO_HOME, "/data/GIS", sep = ""),
  download = T
)

# The location of the TIFF file with the stacked layers described directly above
files.names <- list.files(paste(REPO_HOME, "/data/GIS/5_deg", sep = ""))

# Turn the file names into layer names: strip the prefix (which might include
# the resolution) and strip the file extension
gis.layers.names <- files.names
gis.layers.names <- gsub('current_5arcmin_','',gis.layers.names)
gis.layers.names <- gsub('.tif','',gis.layers.names)

# Combine the layer names with those we've already read from BIOCLIM
gis.layers.names <- c(names(gis.layers),gis.layers.names)

# Iterate over files
for (i in 1:length(files.names)) {
  
  # Stack with previously read layers
  gis.layers <- stack(
    gis.layers,
    
    # Read as raster
    raster(
      
      # Construct file name
      paste(REPO_HOME, "/data/GIS/5_deg/", files.names[i], sep = "")
    )
  )
}

# Apply all names
names(gis.layers) <- gis.layers.names
plot(gis.layers)
```

Contribution of layers to maxent models
---------------------------------------

Populate a data.frame of taxa x layers and record the contribution of each of these layers:

```{r models}

traits.contributions.csv <- paste(REPO_HOME, "results/maxent/model_summaries/traits_contribution_maxent.csv", sep="")

 if (!file.exists(traits.contributions.csv)) {
# instantiate variable contributions frame from a matrix of taxa x layers
traits.contributions <- data.frame(matrix(
  nrow = length(taxa.names), 
  ncol = length(gis.layers.names), 
  dimnames = list(taxa.names,gis.layers.names))
)

# start iterating over taxa
for (i in 1: length(taxa.names)) {
  
  # load maxent model if it exists
  maxent.model <- get_maxent_model( REPO_HOME, taxa.names[i] )
  if (!is.null(maxent.model)) {
    
    # the plot function returns a named list with the variable contributions
    var.contrib <- plot(maxent.model)
      
    # iterate over layers, store variable contribution
    for ( j in 1:length(gis.layers.names) ) {
      if ( gis.layers.names[j] %in% colnames(maxent.model@presence) ) {
        traits.contributions[i,j] <- var.contrib[gis.layers.names[j]]
      }
    }
  }
}

write.csv(traits.contributions, traits.contributions.csv)
 }

traits.contributions <- read.csv(traits.contributions.csv)
    

```

Now let's summarize the variable importance:

```{r contributions}

# make a box plot. as a side effect, the summary statistics are returned
traits.contributions.summary <- boxplot.default(traits.contributions[2:42], varwidth = T, outline = F, col = c("darkorange", "firebrick1", "goldenrod1"))

mean.traits.contributions.csv <- paste(REPO_HOME, "results/maxent/model_summaries/mean_traits_contribution_maxent.csv", sep="")

if (!file.exists(mean.traits.contributions.csv)) {
# we will populate a data frame with the returned summary statistics
traits.contributions.df <- data.frame(matrix(
  nrow = length(gis.layers.names), 
  ncol = 2, 
  dimnames = list(gis.layers.names,c("mean","n")))
)
for ( i in 1:length(traits.contributions.summary$names) ) {
  traits.contributions.df[i,1] <- traits.contributions.summary$stats[i*5-2]
  traits.contributions.df[i,2] <- traits.contributions.summary$n[i]
}

write.csv(traits.contributions.df, mean.traits.contributions.csv)
}
```

As a last step we summarize all models: AUC values, number of occurrence points and the descending order of variable importance per model. 

```{r model summaries}
output.csv <- paste(REPO_HOME, "/results/maxent/model_summaries/summary_df.csv", sep = "")

if (!file.exists(output.csv)) {

  # create empty dataframe with columns "Species_name", "AUC values, "n", "important_variables"
  df.summary <- data.frame(matrix(ncol = 4, nrow = length(taxa.names)))
  x <- c("species", "n", "AUC", "important_variables")
  colnames(df.summary) <- x

  # load AUC df
  df.auc <- sprintf("%s/Results/maxent/model_summaries/AUCvalues.csv", REPO_HOME)
  df.auc <- read.csv(df.auc, header = T)

  # load trait contribution
  df.traits <- sprintf("%s/Results/maxent/model_summaries/traits_contribution_maxent.csv", REPO_HOME)
  df.traits <- read.csv(df.traits, header = T, row.names = 1)
  y <- c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41")
  colnames(df.traits) <- y

  for (i in 1:length(taxa.names)) {
    # set species name
    df.summary[i, 1] <- taxa.names[i]
    # count number of occurence points
    csv.file <- sprintf("%s/data/filtered/%s.csv", REPO_HOME, taxa.names[i])
    csv <- read.csv(csv.file, header = T)
    df.summary[i, 2] <- nrow(csv)
    # select the AUC value
    df.summary[i, 3] <- df.auc[i, "trainingAUC"]
    # order the trait contribution
    traits.row <- df.traits[i, ]
    traits.row <- na.omit(t(traits.row))
    traits.row <- traits.row[order(traits.row, decreasing = TRUE), ]
    traits.row <- as.data.frame(traits.row)
    traits.row <- rownames(traits.row)
    df.summary[i, 4] <- paste(traits.row, collapse = ",")
  }

  write.csv(df.summary, output.csv)
}
```