TCGA_DEGs.Rmd

---
title: Differential expression analysis of TCGA data between groups with high/low
  expression of selected genes
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
output:
  pdf_document:
    toc: no
  html_document:
    theme: united
    toc: no
csl: styles.ref/genomebiology.csl
bibliography: data.TCGA/TCGA.bib
---

```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is') #out.width=700, 
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
library(dplyr)
options(stringsAsFactors = FALSE)
```

```{r results='hide'}
library(TCGA2STAT)
library(dplyr)
library(knitr)
library(ggplot2)
library(reshape2)
library(gridExtra)
library(limma)
library(openxlsx)
library(MDmisc)
library(org.Hs.eg.db)
library(enrichR)
```

```{r}
# A function to load TCGA data, from remote repository, or a local R object
load_data <- function(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE) {
  FILE = paste0(data_dir, "/mtx_", disease, "_", data.type, "_", type, ".rda") # R object with data
  if (all(file.exists(FILE), !(force_reload))) {
    # If the data has been previously saved, load it
    load(file = FILE)
  } else {
    # If no saved data exists, get it from the remote source
    mtx <- getTCGA(disease = disease, data.type = data.type, type = type, clinical = TRUE)
    save(file = FILE, list = c("mtx")) # Save it
  }
  return(mtx)
}

# A function to get data overview
summarize_data <- function(mtx = mtx) {
  print(paste0("Dimensions of expression matrix, genex X patients: ", paste(dim(mtx$dat), collapse = " ")))
  print(paste0("Dimensions of clinical matrix, patients X parameters: ", paste(dim(mtx$clinical), collapse = " ")))
  print(paste0("Dimensions of merged matrix, patients X parameters + genes: ", paste(dim(mtx$merged.dat), collapse = " ")))
  print("Head of the merged matrix")
  print(mtx$merged.dat[1:5, 1:10])
  print("Head of the clinical matrix")
  print(mtx$clinical[1:5, 1:7])
  print("List of clinical values, and frequency of each variable: ")
  clin_vars <- apply(mtx$clinical, 2, function(x) length(table(x[ !(is.na(x) & x != "" )]))) %>% as.data.frame()
  # Filter clinical variables to have at least 2, but no more than 10 categories,
  # And they are not dates
  clin_vars <- clin_vars[ as.numeric(clin_vars$.) > 1 & as.numeric(clin_vars$.) < 10 & !grepl("years|days|date|vital", rownames(clin_vars), perl = TRUE) , , drop = FALSE]
  print(kable(clin_vars))
  return(rownames(clin_vars))
}
# A wrapper function to perform all functional enrichment analyses.
# Helper function to save non-empty results
save_res <- function(res, fileName = fileName, wb = wb, sheetName = "KEGG") {
  if (nrow(res) > 0) {
    openxlsx::addWorksheet(wb = wb, sheetName = sheetName)
    openxlsx::writeData(wb, res, sheet = sheetName)
    openxlsx::saveWorkbook(wb, fileName, overwrite = TRUE)
  }
}

```

# Settings

```{r echo=TRUE}
# Cancer type
cancer = "LUSC" 
# Gene(s) of interest
selected_genes = c("LIN52") # Can be multiple
# Define quantile qutoffs
quantile_up <- 0.75 # 0.75
quantile_lo <- 0.25 # 0.25
# Which pathway enrichment analysis to run
run_gsea <- FALSE # If TRUE, GSEA pathway enrichment analysis is run, otherwise, standard hypergeometric-based enrichment
```

## Differential expression analysis

Samples in the selected cancer cohort were sorted by expression of the selected genes. Differentially expressed genes were detected between samples in the upper `r quantile_up` percentile of the expression gradient and samples in the lower `r quantile_lo` percentile using `limma` v `r packageVersion("limma")` R package [@Ritchie:2015aa; @Smyth:2004aa]. P-values were corrected for multiple testing using False Discovery Rate (FDR) method [@Benjamini:1995aa]. Genes differentially expressed at FDR < 0.01 were selected for further analysis.


```{r}
# Path where the downloaded data is stored
data_dir = "/Users/mdozmorov/Documents/Data/GenomeRunner/TCGAsurvival/data" # Mac

# General settings
useTPM    = FALSE # Whether or not to convert expression counts to TPM
data.type = "RNASeq2"
type = "" 

# Differential expression cutoff
p_val_cutoff   <- 0.05 # Regular p-value cutoff
p_adj_cutoff   <- 0.05 # FDR cutoff
nplot          <- 50 # How many genes to plot on a heatmap
nbox           <- 9  # How many genes to plot on a boxplot
ntable         <- 15 # Number of genes to output in a DEG table
nkegg          <- 35 # Number of genes to output in a KEGG table
min_kegg_genes <- 20 # Minimum number of genes to run enrichment analysis on
max_kegg_genes <- 2000 # Maximum number of genes to run enrichment analysis on
up_dn_separate <- FALSE # Whether to run KEGG separately on up- and downregulated genes. FALSE - do not distinguish directionality

# Filename to same the results
fileNameRes <- paste0("results/TCGA_DEGs_", selected_genes, "_", cancer, ".xlsx")
```

```{r results='hide'}
mtx <- load_data(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE)

clinical_annotations <- summarize_data(mtx = mtx)
# source("Supplemental_R_script_1.R")

# Prepare expression data
expr <- mtx$merged.dat[ , 4:ncol(mtx$merged.dat)] %>% as.matrix
# Filter out low expressed genes
# Should be more than 90% of non-zero values
# ff <- genefilter::pOverA(p = 0.9, A = 0, na.rm = TRUE) 
# expr <- expr[, apply(expr, 2, ff)] 
expr <- data.frame(AffyID = mtx$merged.dat$bcr, expr, stringsAsFactors = FALSE)

# Prepare clinical data
clin <- mtx$merged.dat[, 1:3]
colnames(clin)[1] <- "AffyID"
```

```{r}
# Do TPM conversion, if needed
if (useTPM) {
  # Check if the TPMs have already been precalculated for a given cancer
  fileNameTPM <- paste0(data_dir, "/", cancer, "_TPM.Rda")
  if (!file.exists(fileNameTPM)) {
    source("calcTPM.R")
    load(file = "data/feature_length.Rda")
    common_genes <- intersect(colnames(expr), feature_length$Symbol) # Common gene symbols
    expr         <- expr[, c("AffyID", common_genes)] # Subset expression matrix
    feature_length <- feature_length[feature_length$Symbol %in% common_genes, ] # Subset feature length
    feature_length <- feature_length[match(colnames(expr)[ -1 ], feature_length$Symbol), ] # Match order
    all.equal(colnames(expr), c("AffyID", feature_length$Symbol)) # Should be true
    expr_tpm <- calcTPM(t(expr[, -1]), feature_length = feature_length) # Convert to TPM, takes time
    expr_tpm <- data.frame(AffyID = expr[, "AffyID"], t(expr_tpm), stringsAsFactors = FALSE)
    expr <- expr_tpm
    save(list = c("expr"), file = fileNameTPM) # Save the object
  } else {
    load(file = fileNameTPM)
  }
}
```

# Expression distribution for `r selected_genes` gene

Informational plot only. Expected log2-transformed RSEM expression range distribution: $\sim 0 - 16$. Median close to $0$ indicate overall low expression, should be avoided.

```{r fig.height=3}
selected_genes_expression <- melt(expr[, colnames(expr) %in% selected_genes, drop = FALSE])
p1 <- ggplot(selected_genes_expression, aes(x = variable, y = log2(value))) + geom_boxplot()
p2 <- ggplot(selected_genes_expression, aes(x = log2(value))) + geom_density()
grid.arrange(p1, p2, ncol = 2)
```

```{r}
# View and subset by expression and quantiles of the selected genes
selected_genes_quantiles_up <- with(selected_genes_expression, tapply(value, variable, quantile, probs = quantile_up)) # Select upper quantile!
selected_genes_quantiles_lo <- with(selected_genes_expression, tapply(value, variable, quantile, probs = quantile_lo)) # Select lower quantile!
# Subset exprs by top expression
selected_index_up <- list() # Collect boolean indexes for each gene
selected_index_lo <- list() # Collect boolean indexes for each gene
for (gene in selected_genes) {
  ind_up <- expr[, gene] > selected_genes_quantiles_up[ gene ] # True, if expressed above the selected upper quantile
  ind_lo <- expr[, gene] < selected_genes_quantiles_lo[ gene ] # True, if expressed below the selected lower quantile
  selected_index_up <- c(selected_index_up, list(ind_up))
  selected_index_lo <- c(selected_index_lo, list(ind_lo))
}
ind_up <- apply(as.data.frame(selected_index_up), 1, all) # Collapse indexes from multiple selected genes, all selected genes should be expressed in the upper quantile
ind_lo <- apply(as.data.frame(selected_index_lo), 1, all) # Collapse indexes from multiple selected genes, all selected genes should be expressed in the lower quantile

# For differential analysis, create group labels
group <- vector(mode = "numeric", length = nrow(expr)) # Empty bector
group[ind_up] <- 1 # Assign numeric groups
group[ind_lo] <- 2
# table(group) # How many patients we have
expr <- expr[group != 0, ] # Remove those that are not in quantiles
group <- group[ group != 0 ]
```

# Differential expression analysis

```{r}
# Prerequisites, prepared by the survival.R script
# expr - expression matrix separated by the high/low expression of the selected genes
# group - labeling of samples having high/low expression of the selected genes

# Reshape expression matrix
expr <- (t(expr))
colnames(expr) <- expr[1, ]
expr <- expr[-1, ]
class(expr) <- "numeric"
# expr <- log2(expr + 1)
expr <- voom(expr)$E
# boxplot(expr)

# Limma
design <- model.matrix(~0 + factor(group))
colnames(design) <- c("up", "lo")
fit <- lmFit(expr, design)
contrast.matrix <- makeContrasts(up-lo, levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)

degs <- topTable(fit2, coef = 1, number = Inf, p.value = p_val_cutoff)
```

We split `r cancer` cohort into `r length(group[group == 1])` x `r length(group[group == 2])` patients having `r paste(selected_genes, collapse = ", ") ` in lower `r quantile_up` and upper `r quantile_lo` expression quantiles. We have a total of `r nrow(degs)` differentially expressed genes at FDR corrected p-value `r p_val_cutoff`. `r nrow(degs[ degs$logFC > 0, ])` are upregulated, `r nrow(degs[ degs$logFC < 0, ])` are downregulated.

Top 50 the most differentially expressed genes are shown

```{r fig.height=8.5}
index.to.plot <- order(group)
matrix.to.plot <- expr[rownames(degs)[1:50], index.to.plot]
genes.to.plot <- rownames(degs)[1:50]
group.to.plot <- group[index.to.plot]
group.to.plot <- ifelse(group.to.plot == 1, "UP", "LO")

NMF::aheatmap(matrix.to.plot, color=colorRampPalette(c('blue', 'gray', 'yellow'))(20), Colv = NA, Rowv = FALSE, hclust = "ward", scale = "row", annCol = group.to.plot, annColors = list(c("blue", "red")), labRow = genes.to.plot, fontsize = 10, cexRow = 10) # color="-RdYlBu"
```

```{r}
# Save results
# Create (or, load)  Excel file
unlink(fileNameRes)
wb <- openxlsx::createWorkbook(fileNameRes) # loadWorkbook(fileNameRes) # 
save_res(data.frame(Gene = rownames(degs), degs), fileName = fileNameRes, wb = wb, sheetName = "DEGS")
```

Results are stored in the Excel file `r fileNameRes`

- Legend for gene lists: "Gene" - gene annotations; "logFC" - log fold change; "AveExpr" - average expression, log2; "t" - t-statistics; "P.Val"/"adj.P.Val" - non-/FDR-adjusted p-value, "B" - another statistics.

```{r}
# DT::datatable(degs)
pander(degs[1:min(ntable, nrow(degs)), ])
```

# Functional enrichment analysis

Up- and downregulated genes are tested for functional enrichment `r paste(ifelse(up_dn_separate, "separately", "jointly"))`. `r paste(ifelse(up_dn_separate, "Each table has enrichment results for both up-/downregulated genes. The \"direction\" column indicate which pathways are enriched in \"UP\"- or \"DN\"-regulated genes.", ""))`. FDR cutoff of the significant enrichments - `r p_adj_cutoff`. Top `r ntable` genes shown.

## KEGG pathway enrichment analysis 

**Legend:** "database" - source of functional annotations, "category" - name of functional annotation,  "pval" - unadjusted enrichment p-value,  "qval" - FDR-adjusted p-value,  "genes" - comma-separated differentially expressed genes enriched in a corresponding functional category. `r paste(ifelse(up_dn_separate, "\"direction\" - UP/DN, an indicator whether genes are up- or downregulated.", ""))` 

```{r}
if( run_gsea == FALSE) {
  # Subset the number of DEGs for KEGG analysis to the maximum
  if (nrow(degs) > max_kegg_genes) {
    degs_subset <- degs[1:max_kegg_genes, ]
  } else {
    degs_subset <- degs
  }
  # Get list of up- and downregulated genes
  up.genes <- sort(unique(rownames(degs_subset)[ degs_subset$t > 0 ]))
  dn.genes <- sort(unique(rownames(degs_subset)[ degs_subset$t < 0 ]))
  # Run KEGG
  if (up_dn_separate) {
    # Analyze up- and downregulated genes separately
    print(paste0("KEGG pathway run on ", length(up.genes), " upregulated and ", length(dn.genes), " downregulated genes."))
    res.kegg <- save_enrichr(up.genes = up.genes, dn.genes = dn.genes, databases = "KEGG_2016", fdr.cutoff = p_adj_cutoff, fileName = fileNameRes, wb = wb)
  } else {
    # Analyze up- and downregulated genes together
    print(paste0("KEGG pathway run on ", length(unique(c(up.genes, dn.genes))), " genes without distinguishing them by directionality."))
    res.kegg <- MDmisc::save_enrichr(up.genes = unique(c(up.genes, dn.genes)), databases = "KEGG_2016", fdr.cutoff = p_adj_cutoff, fileName = fileNameRes, wb = wb)
  }
}
```

## KEGG pathway GSEA analysis 

**Legend:** "ID", "Description" - KEGG pathway ID/description, respectively; "NES" - [normalized enrichment score](http://software.broadinstitute.org/gsea/doc/GSEAUserGuideFrame.html); "pvalue", "p.adjust" - raw and FDR-adjusted p-values, respectively; "core_enrichment" - genes enriched in the corresponding pathway.

```{r}
if (run_gsea == TRUE) {
  library(clusterProfiler)
  library(DOSE)
  ## GSEA using clusterProfiler
  # All DEGs
  degs.all <- topTable(fit2, coef = 1, number = Inf)
  # Convert symbols to entrezids
  eid <- bitr(rownames(degs.all), fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
  # Attach converted entrezids
  degs.all <- left_join(data.frame(SYMBOL = rownames(degs.all), degs.all), eid, by = "SYMBOL")
  degs.all <- degs.all[ degs.all$ENTREZID != "", ]
  # List of t-statistics
  geneList <- degs.all$t
  # Make it named
  names(geneList) <- degs.all$ENTREZID
  # And decreasing sorted
  geneList <- sort(geneList, decreasing = TRUE)
  # Actual GSEA
  set.seed(1)
  ego3 <- gseKEGG(geneList     = geneList,
                  organism     = "hsa",
                  nPerm        = 1000,
                  minGSSize    = 10,
                  pvalueCutoff = 0.1,
                  verbose      = FALSE)
  # Get summary
  ego3 <- setReadable(ego3, OrgDb = org.Hs.eg.db, keytype = "ENTREZID")
  res.kegg <- as.data.frame(ego3)
  # Save the full results
  save_res(res.kegg, fileName = fileNameRes, wb = wb, sheetName = "KEGG_GSEA")
  # Prepare for table output
  res.kegg <- res.kegg[, c("ID", "Description", "NES", "pvalue", "p.adjust", "core_enrichment")]
  res.kegg <- res.kegg[order(res.kegg$NES, decreasing = TRUE), ]
  res.kegg <- res.kegg[res.kegg$p.adjust < p_adj_cutoff, ]
  res.kegg$NES       <- round(res.kegg$NES, digits = 2)
  res.kegg$pvalue    <- formatC(res.kegg$pvalue, format = "e", digits = 2)
  res.kegg$p.adjust  <- formatC(res.kegg$p.adjust, format = "e", digits = 2)
  rownames(res.kegg) <- NULL
}
```

A total of `r nrow(res.kegg)` KEGG pathways were detected as significantly affected at FDR `r p_adj_cutoff`. Top `r ntable` shown.

```{r}
# Display the results
# DT::datatable(res.kegg)
if (nrow(res.kegg) > 0 ) {
  kable(res.kegg[1:min(ntable, nrow(res.kegg)), ])
}
```

## Selected pathway

Red/Green - up/downregulated genes in upper vs. lower `r selected_genes` expressing samples. Gray - marginal fold change, yet significant. White - gene is not differentially expressed

```{r eval=FALSE}
library(pathview)
library(openxlsx)
degs <- read.xlsx(fileNameRes, cols = c(1, 2))
degs.genes <- degs$logFC
names(degs.genes) <- degs$Gene
# Adjust as needed
pv.out <- pathview(gene.data = degs.genes, pathway.id = "04972", species = "hsa", gene.idtype = "SYMBOL", gene.annotpkg = "org.Hs.eg.db", out.suffix = paste(selected_genes, collapse = "-"))
```

```{r echo=FALSE, out.height='300px', eval=FALSE}
knitr::include_graphics('hsa04972.LIN52.png')
```


```{r eval = FALSE}
# ![](hsa04210.CTBP2.png)
# KEGG canonical pathway enrichment analysis 

# - Legend for GO/KEGG functional enrichment results: "ID" - unique identifier of functional category; "Pvalue" - non-adjusted p-value; "OddsRatio" - enrichment odds ratio; "ExpCount" - number of genes expected to be selected in a category; "Count" - number of genes observed in the current list; "Size" - total number of genes in a category; "Term" - category description; "p.adj" - false discovery rate; "SYMBOL", "ENTREZ" - genes observed in the current list as annotated with a category
# 
# Pathways for **differentially expressed genes** identified above

# # Gene ontology, molecular function
# res <- gene_enrichment(selected = rownames(degs), all.universe = rownames(expr), id="symbol", organism = "Hs", use="GO", ont="MF")
# save_res(res, fileName, wb = wb, sheetName = "GOMF")
# # Gene ontology, biological process 
# res <- gene_enrichment(selected = rownames(degs), all.universe = rownames(expr), id="symbol", organism = "Hs", use="GO", ont="BP")
# save_res(res, fileName, wb = wb, sheetName = "GOBP")
# # Gene ontology, cellular component
# res <- gene_enrichment(selected = rownames(degs), all.universe = rownames(expr), id="symbol", organism = "Hs", use="GO", ont="CC")
# save_res(res, fileName, wb = wb, sheetName = "GOCC")
# KEGG canonical pathways
res <- gene_enrichment(selected = rownames(degs), all.universe = rownames(expr), id="symbol", organism = "Hs", use="KEGG")
DT::datatable(res)
```

```{r eval=FALSE}
# GSEA analysis

## KEGGUP

# **Upregulated** pathways

library(gage)
data("kegg.gs")
# Convert Symbols to EntrezIDs
ids <- clusterProfiler::bitr(rownames(expr), fromType="SYMBOL", toType="ENTREZID", OrgDb = "org.Hs.eg.db")
expr <- left_join(data.frame(SYMBOL = rownames(expr), expr), ids, by = "SYMBOL") # Append converted EntrezIDs
expr$SYMBOL <- NULL # Remove symbols
expr <- expr[!is.na(expr$ENTREZID), ] # Remove unmapped rows
rownames(expr) <- expr$ENTREZID # Add EntrezIDs as rows
expr$ENTREZID <- NULL # Remove them from the data frame
# Analysis for pathways overrepresented by up/downregulated genes
kegg.p <- gage(expr,
               gsets = kegg.gs,
               ref = which(group == 2, arr.ind = TRUE),
               samp = which(group == 1, arr.ind = TRUE),
               same.dir = TRUE,
               rank.test = TRUE,
               saaTest = gs.KSTest,
               compare = "unpairedd")
```

```{r eval=FALSE}
DT::datatable(data.frame(pathway = rownames(kegg.p$greater), kegg.p$greater[, c("p.val", "q.val", "set.size")]))
```

```{r eval=FALSE}
## KEGGDN

# **Downregulated** pathways

DT::datatable(data.frame(pathway = rownames(kegg.p$less), kegg.p$less[, c("p.val", "q.val", "set.size")]))
```


```{r session_info, eval = FALSE}
diagnostics <- devtools::session_info()
platform <- data.frame(diagnostics$platform %>% unlist, stringsAsFactors = FALSE)
colnames(platform) <- c("description")
pander(platform)

packages <- as.data.frame(diagnostics$packages)
pander(packages[ packages$`*` == "*", ])
```

# References