-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdrive_hours_km.R
161 lines (149 loc) · 5.45 KB
/
drive_hours_km.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# 10 june 2017 - removed my versions of ggsurv because GGally has now been patched
# updated june 2017 to generate km curves for restricted periods to make sense of
# drives with short observation periods
# for data from https://www.backblaze.com/hard-drive-test-data.html
# ross lazarus me fecit february 18 2016
# ggsurv modified from
# http://www.r-statistics.com/2013/07/creating-good-looking-survival-curves-the-ggsurv-function/
# the legend now corresponds to the order of the very last survival point so
# it's easier to see which is which...
library(survival)
library(rms)
library(GGally)
library(gdata)
setwd('/data/drivefail')
fixres = function(modkm)
{
## reformat and reorder the km model results
oediff = modkm$obs - modkm$exp
oesign = sign(oediff)
oegood = oesign < 0
oechi = (oediff) ^ 2 / modkm$exp
oesort = oechi * oesign
resd = data.frame(
modkm$n,
'Observed' = modkm$obs,
'Expected' = modkm$exp,
'chisq' = oechi,
'sort' = oesort
)
rownames(resd) = names(modkm$n)
sresd = resd[order(resd$sort), ]
sresd$rank = c(1:nrow(resd))
return(sresd)
}
# changeme!!!
setwd('/data/drivefail')
options(width=512)
runtag = 'july6_no_resurrection'
fn = paste('drivefail_res',runtag,sep='')
titl = 'KM plots - Backblaze drive data (hours) to end Q1 2018'
subtitl = paste('Run on',runtag,sep=' ')
# changeme!!!
runtag = paste(runtag,'hours',sep='_')
f = paste(fn, 'xls', sep = '.')
d = read.delim(f, sep = '\t', head = T)
sillyhours = (d$smarthours > 80000)
sh = paste(d$smarthours[sillyhours],collapse=',')
print(paste('found >100000 hour smart9 records = ',sh,sep=''))
d$smarthours[sillyhours] = 80000 #
tm = table(d$manufact)
ds = subset(d, tm[d$manufact] > 200)
s = with(ds, Surv(time = smarthours, event = status))
km.manu = npsurv(s ~ manufact, data = ds)
ofnpdf = paste(fn, '_hours_manufacturer.pdf', sep = '')
ofnpng = paste(fn, '_hours_manufacturer.png', sep = '')
ofnsvg = paste(fn, '_hours_manufacturer.svg', sep = '')
png(ofnpng, height = 1000, width = 1600)
ggsurv(km.manu, main = titl, CI = F, lty.ci = 2, size.ci = 0.1)
dev.off()
pdf(ofnpdf, height = 10, width = 20)
ggsurv(km.manu, main = titl, CI = F, lty.ci = 2, size.ci = 0.1)
dev.off()
svg(ofnsvg)
ggsurv(km.manu, main = titl, CI = F, lty.ci = 2, size.ci = 0.1)
dev.off()
tmo = table(d$model)
dm = subset(d, tmo[d$model] > 200)
# dm = subset(dm,)
sm = with(dm, Surv(time = smart9hours, event = status))
km.mod = npsurv(sm ~ model, data = dm)
ofnpdf = paste(fn, '_hours_model.pdf', sep = '')
ofnpng = paste(fn, '_hours_model.png', sep = '')
ofnsvg = paste(fn, '_hours_model.svg', sep = '')
png(ofnpng, height = 1000, width = 1600)
ggsurv(km.mod, main = titl, CI = F, lty.ci = 2, size.ci = 0.1)
dev.off()
pdf(ofnpdf, height = 10, width = 20)
ggsurv(km.mod, main = titl, CI = F, lty.ci = 2, size.ci = 0.1)
dev.off()
svg(ofnsvg)
ggsurv(km.mod, main = titl, CI = F, lty.ci = 2, size.ci = 0.1)
dev.off()
survdiff(sm ~ model, data = dm, rho = 0)
survdiff(s ~ manufact, data = ds, rho = 0)
# km assumes similar observation duration records for all strata
# some drives (eg seagate 8TB) only have 30 days at best with Q2 2016 data.
# this makes the right hand side of the usual full period KM plots misleading,
# because there's no new information being added about some of the drives over time
# so try only modelling the first week and so on
# some interesting urban myths about early vs late failures?
#
cutps = c(72, 1440, 2880, 10000, 3000, 10000, 20000, 30000, 40000)
ncut = length(cutps)
for (i in c(1:ncut))
{
nmax = cutps[i]
titl = paste('KM first',
nmax,
'hours of observation curves from Backblaze drive data to Q1 2018')
dti = dm
fixme = (dti$smart9hours > nmax) # ignore all data, failing or not beyond nmax, by censoring at nmax.
okmodels = subset(dti,fixme,select=c(model))
models = levels(factor(as.character(okmodels$model)))
modcol = models[order(models)]
if (i == 1) {
mdf = data.frame('model' = modcol)
}
dti$status[fixme] = 0
dti$smart9hours[fixme] = nmax
inmodels = (dti$model %in% models)
dt = subset(dti, inmodels)
stm = with(dt, Surv(time = smart9hours, event = status))
km.mod = npsurv(stm ~ model, data = dt)
kmmod = survdiff(stm ~ model, data = dt, rho = 0)
pres = fixres(kmmod)
rnk = data.frame(rank=pres[order(pres$groups), ]$rank)
mdf = cbindX(mdf, rnk)# cbind(mdf, rnk)
s = paste('*** KM statistics for first',nmax,'SMART hours')
print.noquote(s)
print(pres)
ofnroot = paste('km_first',
nmax,
'_hours_model_',
runtag,
sep = '')
ofnpdf = paste(ofnroot,'.pdf',sep = '')
ofnpng = paste(ofnroot,'.png',sep = '')
png(ofnpng, height = 1000, width = 1600)
print(ggsurv(km.mod, main = titl))
## ggplot requires explicit printing inside loops
dev.off()
pdf(ofnpdf, height = 10, width = 20)
print(ggsurv(km.mod, main = titl, CI = F, lty.ci = 2, size.ci = 0.1))
dev.off()
}
colnames(mdf) = c('Model', paste('Rank_hour', cutps, sep = '_'))
mdfs = mdf[, 2:ncut]
vmdf = apply(mdfs, 1, var,na.rm=TRUE)
mmdf = apply(mdfs, 1, mean,na.rm=TRUE)
mdfo = cbind(mdf, 'mean' = mmdf, 'var' = vmdf)
mdfo = mdfo[order(mdfo$mean), ]
print(mdfo)
# some piccys of the awful smart data
lm = d$manufact
plot(d$obsdays,d$smarthours,col=unclass(lm)+1,cex=0.1)
legend(1550,80000, legend=levels(lm),col=(1:length(levels(lm))), pch=19, cex=0.6)
lm = d$model
plot(d$obsdays,d$smarthours,col=unclass(lm)+1,cex=0.1)
legend(1550,80000, legend=levels(lm),col=(1:length(levels(lm))), pch=19, cex=0.4)