clearer names for actions, and infer actions better
[monitor.git] / statistics / myops_restoration.r
1 source("functions.r");
2
# Progress marker (debug output).
print("fuckyou 0")

# All node-status history exports are read the same way: comma-separated,
# first row is the header.
.read_history_csv <- function(path) {
    read.csv(path, sep = ",", header = TRUE)
}

nsh <- .read_history_csv("node_status_history.csv")

# system("./harvest_nodehistory.py > node_status_history_nopcu.csv")
nsh_nopcu <- .read_history_csv("node_status_history_nopcu.csv")

nsh_m1 <- .read_history_csv("node_status_history_m1.csv")
# system("stats-m1/harvest_nodehistory_m1.py > ./node_status_history_m1_nopcu.csv")
nsh_m1_nopcu_total  <- .read_history_csv("node_status_history_m1_nopcu_total.csv")
nsh_m1_nopcu_notice <- .read_history_csv("node_status_history_m1_nopcu.csv")
nsh_m1_nopcu_kernel <- .read_history_csv("node_status_history_m1_nopcu_may08sep08.csv")
nsh_m1_pcu          <- .read_history_csv("node_status_history_m1_pcu.csv")

# Progress marker (debug output).
print("fuckyou 0a")
# Build and plot a host-by-time matrix marking when each host was down.
#
# Arguments:
#   t      data frame of status events with columns `hostname`, `start`
#          (numeric timestamp in seconds) and `status` ('down' marks an
#          outage). Assumes each host's rows are ordered by `start` and that
#          the row following a 'down' row is its recovery -- TODO confirm
#          against the harvest scripts.
#   year   accepted for interface compatibility; not used.
#   from   first date of the time axis (anything as.Date() accepts).
#   to     last date of the time axis.
#   max    accepted for interface compatibility; not used.
#   type   step between columns, passed to seq() (e.g. "week", "day").
#   title  plot title forwarded to myImagePlot().
#
# Returns the 0/1 matrix (rows = hosts, columns = time breaks) after passing
# it to myImagePlot().
#
# Row index per host: factor hostnames use their integer codes, character
# hostnames are converted to a factor first (read.csv no longer does this by
# default), numeric hostnames are used as-is.
node_hist_image <- function (t, year, from, to, max=0, type="week", title="")
{
    day_secs <- 60*60*24

    dates <- seq(as.Date(from), as.Date(to), type)
    months <- format(dates, "%b-%d")
    hbreaks <- unclass(as.POSIXct(dates))
    n_cols <- length(hbreaks)

    host_col <- t$hostname
    if (is.character(host_col)) {
        host_col <- factor(host_col)
    }
    row_of <- as.numeric(host_col)
    host_chr <- as.character(t$hostname)

    # `max` is also a parameter name here, so call the base function explicitly.
    n_rows <- base::max(0, row_of, na.rm = TRUE)
    img <- matrix(data = 0, nrow = n_rows, ncol = n_cols)

    for (i in seq_along(hbreaks)) {
        # find the range : d plus a day
        d <- hbreaks[i]
        d_end <- d + day_secs

        # rows of 'down' events that start inside this day range
        in_day <- which(t$start > d & t$start <= d_end & t$status == 'down')
        if (length(in_day) == 0) {
            next
        }

        # one representative row per host, in order of first appearance
        first_rows <- in_day[!duplicated(host_chr[in_day])]

        for (r in first_rows) {
            host_s <- host_chr[r]
            host_n <- row_of[r]

            # events for this host after d (to avoid already identified events)
            ev <- t[which(host_chr == host_s & t$start > d), ]
            print(sprintf("events length for host %s %s", host_s, length(ev$start)))
            n_ev <- nrow(ev)

            for (e_i in which(ev$status == 'down')) {
                if (e_i == n_ev) {
                    # the node ends down, so fill in the rest with 1.
                    last_col <- n_cols
                } else {
                    # there is a subsequent event, treated as the recovery
                    good_ev <- ev[e_i + 1, ]
                    dbreaks <- seq(d, good_ev$start + day_secs, day_secs)
                    l <- length(dbreaks)
                    print(sprintf("length %s", l))
                    # Clamp to the last column: i + l can run past the end of
                    # the time axis, which previously raised
                    # "subscript out of bounds".
                    # NOTE(review): l counts days while columns step by
                    # `type`; the span is kept as originally written.
                    last_col <- min(i + l, n_cols)
                }
                img[host_n, i:last_col] <- 1
            }
        }
    }

    myImagePlot(img, xLabels = months, yLabels = c(""), title = title)
    return (img)
}
74
75
76
# Collect, for every host, the time between each 'down' event and the event
# that follows it.
#
# Arguments:
#   t      data frame of status events with columns `hostname`, `start`
#          (numeric timestamp in seconds) and `status` ('down' marks an
#          outage). Assumes each host's rows are ordered by `start` -- TODO
#          confirm against the harvest scripts.
#   year, from, to, max, type, title
#          accepted for interface compatibility with node_hist_image();
#          none of them influences the result.
#
# Returns a numeric vector of durations in the units of `start`, or NULL when
# there is nothing to report.
#   * A 'down' event followed by another event contributes
#     (next start - down start).
#   * A host whose only event is 'down' contributes 10*now - start, a value
#     far larger than any real duration, and is counted in the printed
#     "down" total.
#   * A trailing 'down' event on a host with more than one event contributes
#     nothing.
node_hist_dist <- function (t, year, from, to, max=0, type="week", title="")
{
    current_ts <- as.numeric(as.POSIXct(Sys.Date()))

    host_chr <- as.character(t$hostname)
    hosts <- unique(host_chr)

    down <- 0
    # one slot per host, so the result is not regrown on every event
    per_host <- vector("list", length(hosts))

    # seq_along() iterates zero times for an empty table
    for (hi in seq_along(hosts)) {
        host_s <- hosts[hi]

        # all events for this host
        ev <- t[which(host_chr == host_s), ]
        print(sprintf("events length for host %s %s", host_s, length(ev$start)))
        n_ev <- nrow(ev)

        # down events that have a following event
        down_idx <- which(ev$status == 'down')
        resolved <- down_idx[down_idx != n_ev]
        gaps <- ev$start[resolved + 1] - ev$start[resolved]

        # a host with a single, unresolved down event
        if (n_ev == 1 && length(down_idx) == 1) {
            print(sprintf("DOWN FOREVER! %s", length(ev$start)))
            down <- down + 1
            gaps <- c(gaps, 10*current_ts - ev$start)
        }

        per_host[[hi]] <- gaps
    }
    print(down)

    dist <- unlist(per_host)
    if (length(dist) == 0) {
        # the original accumulator started as NULL; keep that for callers
        return (NULL)
    }
    return (dist)
}
118
119
120
# Daily available-node counts (data collected from M2 pickle files).
dnc <- read.csv("daily-available-node-count.csv", sep = ",", header = TRUE)

# add_timestamp() is not defined in this script; the code below relies on it
# providing a numeric `start` column.
dnc2 <- add_timestamp(dnc)

# Keep only rows after 2008-05-07.
tstamp_08 <- as.numeric(as.POSIXct("2008-05-07", origin = "1970-01-01"))
dnc2 <- dnc2[which(dnc2$start > tstamp_08), ]

# Weekly tick positions and their month labels for the time axis.
dates <- seq(as.Date("2008-05-07"), as.Date("2009-05-07"), "week")
months <- format(dates, "%b")
hbreaks <- unclass(as.POSIXct(dates))

# Horizontal extent of the availability plot.
x_start <- as.numeric(as.POSIXct("2008-05-01", origin = "1970-01-01"))
x_end   <- as.numeric(as.POSIXct("2009-06-1", origin = "1970-01-01"))
136
# ---- Panel (a): online node count over time -------------------------------

# Progress marker (debug output).
print("fuckyou 0b")

# Event timestamps. abline_at_date() is not defined in this script; its
# return value is used below as a numeric timestamp comparable with
# dnc2$start.
# NOTE(review): these calls run before start_image()/plot(); presumably only
# the returned timestamps matter here and the visible markers come from the
# repeated calls after the plot -- confirm against abline_at_date().
tstamp_0510 <- abline_at_date("2008-05-10", col = "grey20", lty = 0, height = 570)
# dates taken from reboot_image() output for API events.
# green
tstamp_0610 <- abline_at_date("2008-06-10", col = "grey40", lty = 5, height = 570)
tstamp_0815 <- abline_at_date("2008-08-15", col = "grey70", lty = 1, height = 570)

# red
#tstamp_0905 <-abline_at_date("2008-09-05", col='grey70', height=570)
tstamp_0924 <- abline_at_date("2008-09-24", col = "grey70", lty = 1, height = 570)
tstamp_1015 <- abline_at_date("2008-10-15", col = "grey40", lty = 5, height = 570)
# blue
#tstamp_1105 <-abline_at_date("2008-11-05", col='white', lty=2, height=570)
#tstamp_1214 <-abline_at_date("2008-12-14", col='grey70', height=570)
tstamp_0223 <- abline_at_date("2009-02-23", col = "grey70", height = 570)
# red
#tstamp_0313 <-abline_at_date("2009-03-13", col='grey70', height=570)

print("fuckyou 0c")
start_image("myops_restore_nopcu.eps")
# two stacked panels; this block draws the first one
par(mfrow = c(2, 1))
par(mai = c(.9, .8, .1, .1))
print("fuckyou 1")

# Rows that have an availability count; each series below is one time window
# of these rows. The masks are computed once and reused for x and y.
has_count <- !is.na(dnc2$available)

# red, solid: 2008-08-15 .. 2008-10-15
rows_0815_1015 <- which(has_count &
    (dnc2$start > tstamp_0815 & dnc2$start <= tstamp_1015))
plot(dnc2$start[rows_0815_1015],
    dnc2$available[rows_0815_1015],
    type = "l", col = "red", ylim = c(0, 600), xlim = c(x_start, x_end),
    xlab = "", ylab = "a) Online Node Count", axes = FALSE)

print("fuckyou 2")
# red, solid: after 2009-02-23
rows_after_0223 <- which(has_count & (dnc2$start > tstamp_0223))
lines(dnc2$start[rows_after_0223],
    dnc2$available[rows_after_0223],
    type = "l", col = "red")

# blue, dashed: 2008-10-15 .. 2009-02-23
rows_1015_0223 <- which(has_count &
    dnc2$start > tstamp_1015 & dnc2$start <= tstamp_0223)
lines(dnc2$start[rows_1015_0223],
    dnc2$available[rows_1015_0223],
    lty = 2, type = "l", col = "blue")

print("fuckyou 4")

# dark green, dotted: 2008-05-10 .. 2008-08-15
rows_0510_0815 <- which(has_count &
    dnc2$start > tstamp_0510 & dnc2$start <= tstamp_0815)
lines(dnc2$start[rows_0510_0815],
    dnc2$available[rows_0510_0815],
    lty = 3, type = "l", col = "darkgreen")

#lines(dnc2$start[which(!is.na(dnc2$available))], dnc2$available[which(!is.na(dnc2$available))],
#type='l', col='red', ylim=c(0,1000))
axis(2, las = 1)
axis(1, cex.axis = 0.7, labels = months, at = hbreaks)

# Same event calls again, now that the plot exists.
tstamp_0510 <- abline_at_date("2008-05-10", col = "grey20", lty = 0, height = 570)
# dates taken from reboot_image() output for API events.
# green
tstamp_0610 <- abline_at_date("2008-06-10", col = "grey40", lty = 5, height = 570)
tstamp_0815 <- abline_at_date("2008-08-15", col = "grey70", lty = 1, height = 570)

# red
#tstamp_0905 <-abline_at_date("2008-09-05", col='grey70', height=570)
tstamp_0924 <- abline_at_date("2008-09-24", col = "grey70", lty = 1, height = 570)
tstamp_1015 <- abline_at_date("2008-10-15", col = "grey40", lty = 5, height = 570)
# blue
#tstamp_1105 <-abline_at_date("2008-11-05", col='white', lty=2, height=570)
#tstamp_1214 <-abline_at_date("2008-12-14", col='grey70', height=570)
tstamp_0223 <- abline_at_date("2009-02-23", col = "grey70", height = 570)
# red
#tstamp_0313 <-abline_at_date("2009-03-13", col='grey70', height=570)

#text(x=c(tstamp_0610+(tstamp_0815-tstamp_0610)/2,
#         tstamp_0815+(tstamp_0905-tstamp_0815)/2,
#         tstamp_0924+(tstamp_1015-tstamp_0924)/2,
#         tstamp_1015+(tstamp_1214-tstamp_1015)/2,
#         tstamp_1214+(tstamp_0223-tstamp_1214)/2,
#         tstamp_0223+(tstamp_0313-tstamp_0223)/2),
#     y=c(0),
#     labels=c("bug1", 'fix1', 'fix2', 'fix3', 'bug2', 'fix4')) #, 'fix 2', 'fix 3', 'fix 4'))

# Fix labels: right-aligned at their event timestamps, y = 610.
text(x = c(tstamp_0815,
         tstamp_0924,
         tstamp_0223),
     y = c(610),
     adj = c(1, 0.5),
     labels = c("fix1", "fix2", "fix3"))

# Bug labels: left-aligned; the "Events:" heading sits ten days before the
# first timestamp.
text(x = c(tstamp_0510 - (60*60*24*10),
        tstamp_0610,
        tstamp_1015),
     adj = c(0, 0.5),
     y = c(610),
     labels = c("Events:", "bug1", "bug2"))

mtext("2008                                 2009", 1, 2)
# NOTE(review): the 'Bug Added' entry uses grey20, while the bug markers
# above are requested with col='grey40' -- confirm which colour is intended.
legend(unclass(as.POSIXct("2009-02-23", origin = "1970-01-01"))[1], 200,
        cex = 0.7,
        legend = c("Typical MyOps", "Bug1", "Bug2", "Bug Added", "Fix Added"),
        pch = c("-", "-", "-"),
        col = c("red", "darkgreen", "blue", "grey20", "grey70"),
        lty = c(1, 3, 2, 5, 1), merge = TRUE)

        #legend=c("Registered", "Online", 'Kernel Update', 'MyOps Event'),
        #pch=c('-', '-', '-', '-'),
        #col=c('blue', 'red', 'grey20', 'grey70'),
        #lty=c(1, 1, 2, 1), merge=T)
237
###################################
# Empirical CDFs of restore times (in days), one per history export.

# Window boundaries as numeric timestamps (local time zone).
t_0530 <- as.numeric(as.POSIXct("2008-05-30", origin = "1970-01-01"))
t_0815 <- as.numeric(as.POSIXct("2008-08-15", origin = "1970-01-01"))
t_0905 <- as.numeric(as.POSIXct("2008-09-05", origin = "1970-01-01"))
t_0924 <- as.numeric(as.POSIXct("2008-09-24", origin = "1970-01-01"))
t_1015 <- as.numeric(as.POSIXct("2008-10-15", origin = "1970-01-01"))
t_0223 <- as.numeric(as.POSIXct("2009-02-23", origin = "1970-01-01"))
t_0224 <- as.numeric(as.POSIXct("2009-02-24", origin = "1970-01-01"))
t_0313 <- as.numeric(as.POSIXct("2009-03-13", origin = "1970-01-01"))

# TOTAL: events between 2008-08-15 and 2009-03-13.
keep_rows <- which(
    nsh_m1_nopcu_total$start > t_0815 & nsh_m1_nopcu_total$start <= t_0313)
nsh_m1_short <- nsh_m1_nopcu_total[keep_rows, ]
nsh_dist_m1 <- node_hist_dist(nsh_m1_short, "2008", "2008-05-01", "2009-05-22", 0, "day")
d_m1_total <- ecdf(nsh_dist_m1/(60*60*24))

# NOTE: something happened between 10-2 and 10-3
# NOTICE BUG: events between 2008-10-15 and 2009-02-24.
keep_rows <- which(
    nsh_m1_nopcu_notice$start > t_1015 & nsh_m1_nopcu_notice$start <= t_0224)
nsh_m1_short <- nsh_m1_nopcu_notice[keep_rows, ]
nsh_dist_m1 <- node_hist_dist(nsh_m1_short, "2008", "2008-10-01", "2009-03-22", 0, "day")
d_m1_notice_bug <- ecdf(nsh_dist_m1/(60*60*24))

# KERNEL BUG: events between 2008-05-30 and 2008-08-15.
keep_rows <- which(
    nsh_m1_nopcu_kernel$start > t_0530 & nsh_m1_nopcu_kernel$start <= t_0815)
nsh_m1_short <- nsh_m1_nopcu_kernel[keep_rows, ]
nsh_dist_m1 <- node_hist_dist(nsh_m1_short, "2008", "2008-05-10", "2008-08-15", 0, "day")
d_m1_kernel_bug <- ecdf(nsh_dist_m1/(60*60*24))

# PCU: events between 2008-08-15 and 2009-02-24.
keep_rows <- which(nsh_m1_pcu$start > t_0815 & nsh_m1_pcu$start <= t_0224)
nsh_m1_short <- nsh_m1_pcu[keep_rows, ]
nsh_dist_m1 <- node_hist_dist(nsh_m1_short, "2008", "2008-05-10", "2009-03-22", 0, "day")
d_m1_pcu <- ecdf(nsh_dist_m1/(60*60*24))
274
275
276
# ---- Panel (b): fraction of offline nodes restored vs. days to resolve ----

# d<-ecdf(nsh_dist[which(nsh_dist/(60*60*24) < 90 )]/(60*60*24)),
# 180 ~= 6 months.
par(mai = c(.9, .9, .1, .3))
#plot(d, xlim=c(0,180), ylim=c(0,1), axes=F, xlab="Days to Resolve", ylab="Percentile",
#   col.hor='red', col.vert='red', pch='.', col.points='red', main="")

# right edge of the x axis, in days
x_lim_max <- 150

print("fuckyou 4a")
# Base curve (red, solid); axes are suppressed here and drawn by hand below.
plot(d_m1_total, xlim = c(0, x_lim_max), ylim = c(0, 1), axes = FALSE,
    xlab = "Days to Resolve",
    ylab = "b) Fraction of Offline Nodes Restored",
    col.hor = "red", col.vert = "red", pch = ".",
    col.points = "red", main = "")
print("fuckyou 5")

# Overlay (blue, dashed).
plot(d_m1_notice_bug, xlim = c(0, x_lim_max), ylim = c(0, 1),
    xlab = "Days to Resolve",
    col.hor = "blue", col.vert = "blue", pch = ".",
    col.points = "blue", lty = 2, add = TRUE)
print("fuckyou 6")

# Overlay (dark green, dotted).
plot(d_m1_kernel_bug, xlim = c(0, x_lim_max), ylim = c(0, 1),
    xlab = "Days to Resolve",
    col.hor = "darkgreen", col.vert = "darkgreen", pch = ".",
    col.points = "darkgreen", lty = 3, add = TRUE)

#plot(d_m1_pcu, xlim=c(0,x_lim_max), ylim=c(0,1), xlab="Days to Resolve",
#    col.hor='purple', col.vert='purple', pch='.',
#    col.points='purple', lty=4, add=TRUE)

# Hand-placed tick marks (x positions are in days).
weeks <- c(0, 7, 14, 21, 28, 60, 90, 120, 150, 180)
axis(1, labels = weeks, at = weeks)
percentages <- c(0, 0.25, 0.5, 0.75, 0.85, 0.95, 1)
axis(2, las = 1, labels = percentages, at = percentages)

# Light dashed guide lines.
abline(v = c(7, 14, 21, 28), col = "grey80", lty = 2)
abline(h = c(0.5, 0.6, 0.75, 0.85, 0.95), col = "grey80", lty = 2)
abline(v = c(91), col = "grey80", lty = 2)

legend(92, 0.25,
       cex = 0.7,
       legend = c("Typical MyOps", "Only Notices", "No Notices"),
       pch = c("-", "-", "-"),
       col = c("red", "blue", "darkgreen"),
       lty = c(1, 2, 3), merge = TRUE)

end_image()