-
Notifications
You must be signed in to change notification settings - Fork 5
/
SepsisGenomics_preprocess.R
93 lines (74 loc) · 2.8 KB
/
SepsisGenomics_preprocess.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#Author : Akram Mohammed and Rishikesan Kamaleswaran
install.packages("magrittr")
install.packages("dplyr")
install.packages("AnnotatedDataFrame")
#install the core bioconductor packages, if not already installed
source("http://bioconductor.org/biocLite.R")
biocLite()
biocLite(BiocUpgrade)
# install additional bioconductor libraries, if not already installed
biocLite("GEOquery")
biocLite("affy")
biocLite("gcrma")
biocLite("hgu133plus2cdf")
biocLite("hgu133plus2probe")
biocLite("hgu133plus2.db")
library(limma)
library(magrittr)
#Load the necessary libraries
library(GEOquery)
library(affy)
library(gcrma)
library(hgu133plus2cdf)
library(hgu133plus2probe)
library(hgu133plus2.db)
library(dplyr)
library(AnnotationDbi)
#Set working directory for download
setwd("C:/Users/amoham18/Documents/SepsisGenomics")
#Download the CEL file package for this dataset (by GSE - Geo series id)
getGEOSuppFiles("GSE66099")
#Unpack the CEL files
setwd("C:/Users/amoham18/Documents/SepsisGenomics/GSE66099")
untar("GSE66099_RAW.tar", exdir="data")
cels = list.files("data/", pattern = "CEL")
sapply(paste("data", cels, sep="/"), gunzip)
cels = list.files("data/", pattern = "CEL")
setwd("C:/Users/amoham18/Documents/SepsisGenomics/GSE66099/data")
pd<-read.AnnotatedDataFrame("SepticShock_ClassLabel_SS.csv",sep=",", header=T)
rawData=ReadAffy(verbose=TRUE, filenames=cels, cdfname="hgu133plus2", phenoData=pd) #From bioconductor
#perform RMA normalization
normData=rma(rawData)
#Get the important stuff out of the data - the expression estimates for each array
rma=exprs(normData)
#Filter Affy controls
rma=rma[which(!grepl("AFFX", rownames(rma))),]
#Format values to 5 decimal places
rma=format(rma, digits=5)
#Map probe set identifiers to Entrez gene symbols and IDs and then combine with raw data.
#Map probe sets to gene symbols or other annotations
#To see all available mappings for this platform
ls("package:hgu133plus2.db") #Annotations at the exon probeset level
probes=rownames(rma)
collapser <- function(x){
x %>% unique %>% sort %>% paste(collapse = "|")
}
# Redefinition of ae.annots
newSymbols <- AnnotationDbi::select(
x = hgu133plus2.db,
keys = rownames(rma),
columns = c("PROBEID", "ENSEMBL", "ENTREZID", "SYMBOL"),
keytype = "PROBEID"
) %>%
group_by(PROBEID) %>%
summarise_all(funs(collapser)) %>%
ungroup
fd <- new("AnnotatedDataFrame",
data = data.frame(rma, stringsAsFactors = FALSE)
)
rownames(fd) <- newSymbols$PROBEID
#Combine gene annotations with raw data
newrma=cbind(newSymbols,rma)
#Write RMA-normalized, mapped data to file
write.table(newrma, file = "rma2.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
# Run the local jupyternotebook to extract the input files for sepsisGenomics_DGE.R