-
Notifications
You must be signed in to change notification settings - Fork 0
/
supplement_run.R
123 lines (83 loc) · 3.9 KB
/
supplement_run.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#'
#' Code to recreate results from Supplement for "Measuring the contribution of genomic predictors through estimator precision gain"
#'
# load necessary libraries and dataset
load("genesigprecision_data.Rda")
library(Biobase)
library(BatchJobs)
library(xtable)
library(genefu)
source("par_fun.R")
source("par_fun_internal.R")
source("par_fun_double.R")
source("par_fun_eset_double.R")
source("par_fun_fewer.R")
source("prep_eset_sim.R")
source("make_table.R")
# We recreate Table 2 in the main text, but double the sample size for each study
# MammaPrint validation data - doubled sample size
reg <- makeRegistry(id="full_double",seed=53845) # This sets the seed for the entire job set
ids <- batchMap(reg, par_fun_double, 31389:31488) # The values here are placeholders only
done <- submitJobs(reg, wait=function(retries) 100, max.retries=10)
waitForJobs(reg)
y <- loadResults(reg)
make_table(y)
# Dataset GSE19615 - doubled sample size
dat_19615 <- prep_eset_sim(GSE19615)
reg <- makeRegistry(id="GSE19615_double",seed=25870)
ids <- batchMap(reg, par_fun_eset_double, 10289:10388, more.args=list(pd=dat_19615))
done <- submitJobs(reg, wait=function(retries) 100, max.retries=10)
waitForJobs(reg)
y <- loadResults(reg)
make_table(y)
# Dataset GSE11121 - doubled sample size
dat_11121 <- prep_eset_sim(GSE11121)
reg <- makeRegistry(id="GSE11121_double",seed=92163)
ids <- batchMap(reg, par_fun_eset_double, 10289:10388, more.args=list(pd=dat_11121))
done <- submitJobs(reg, wait=function(retries) 100, max.retries=10)
waitForJobs(reg)
y <- loadResults(reg)
make_table(y)
# Dataset GSE7390 - doubled sample size
dat_7390 <- prep_eset_sim(GSE7390)
# We need to drop two observations where tumor grade is missing
dat_7390 <- dat_7390[-which(is.na(dat_7390$grade)),]
reg <- makeRegistry(id="GSE7390_double",seed=50382)
ids <- batchMap(reg, par_fun_eset_double, 10289:10388, more.args=list(pd=dat_7390))
done <- submitJobs(reg, wait=function(retries) 100, max.retries=10)
waitForJobs(reg)
y <- loadResults(reg)
make_table(y)
# We extend the permuted example concept to examine what happens when we double the sample size
reg <- makeRegistry(id="full_permute_double",seed=85037) # This sets the seed for the entire job set
ids <- batchMap(reg, par_fun_double, 31389:31488, more.args=list(rand=T)) # The values here are placeholders only
done <- submitJobs(reg, wait=function(retries) 100, max.retries=10)
waitForJobs(reg)
y <- loadResults(reg)
make_table(y)
# We extend the permuted example concept to examine what happens when we use fewer covariates
reg <- makeRegistry(id="full_permute_fewer",seed=75030)
ids <- batchMap(reg, par_fun_fewer, 31389:31488, more.args=list(rand=T))
done <- submitJobs(reg, wait=function(retries) 100, max.retries=10)
waitForJobs(reg)
y <- loadResults(reg)
make_table(y)
# Generate histogram figure of variability of gain approximations (moved to supplement)
id <- letters[1:10]
seed <- seq(31389, 31389 + 102*9, 102)
input <- data.frame("id"=id,"seed"=seed, stringsAsFactors=F)
input <- lapply(1:10, function(x) input[x,])
# NOTE: This is going to kick off 10 sets of 100 jobs each (1000 jobs).
# This may take a while or you may run into job limits. Please be
# careful about executing this code!
reg <- makeRegistry(id="full_resample",seed=10284)
ids <- batchMap(reg, par_fun_internal, input)
done <- submitJobs(reg, wait=function(retries) 100, max.retries=10)
waitForJobs(reg)
y <- loadResults(reg)
vars <- lapply(y, function(x){apply(x,2,var)})
g_clin <- lapply(vars, function(x){(x[4]-x[6])/x[4]})
g_cpg <- lapply(vars, function(x){(x[10]-x[12])/x[10]})
par(mfrow=c(1,2), mar = c(5,4.5,4,2))
hist(unlist(g_clin)*100, main="Distribution of Percent Gain, Clinical Only", xlab="% Gain Due to Clinical Factors",cex.lab=1.5, cex.axis=1.5, cex.main=1.5)
hist(unlist(g_cpg)*100, main="Distribution of Percent Gain, Clinical + Genomic", xlab="% Gain Due to Clinical + Genomic Factors",cex.lab=1.5, cex.axis=1.5, cex.main=1.5)