tg <- time_graph_setup('2004/1/1', '2010/6/28')
-start_image("platform_availability.png")
-par(mfrow=c(2,1))
+#start_image("platform_availability.png")
+par(mfrow=c(3,1))
# bottom, left, top, right margins
par(mai=c(0.2, 1, 0.3, 0.1))
-plot(cm$ts, cm$online, type='l', col='black',
+plot(cm$ts, cm$total, type='l', col='grey60',
axes=F,
xlab="",
xlim=tg$xlim,
- ylim=c(0,900),
+ ylim=c(0,1000),
ylab="a) Online Node Count")
+# overlay the online node count (black) on top of the total count (grey)
+lines(cm$ts, cm$online, type='l', col='black' )
#axis(1, labels=tg$month_str, at=tg$month_ts, cex.axis=0.7)
#axis(1, tick=F, labels=tg$year_str, at=tg$year_ts, cex.axis=0.7, line=1)
axis(1, labels=c("","","","","","",""), at=tg$year_ts, cex.axis=0.7, line=-0.5)
axis(2, las=1)
-planetlab_releases(800)
+planetlab_releases(1000)
par(mai=c(1, 1, 0.2, 0.1))
planetlab_releases(18)
-end_image()
+#end_image()
--- /dev/null
+#!/usr/bin/python
+
+# Summarize one day's CoMon log dump (dump_comon_YYYYMMDD.bz2): count
+# per-host online/offline samples and the max observed uptime, then print
+# a single summary line (date, counts, uptime quartiles, elapsed time).
+
+import sys, os, time, bz2
+
+if len(sys.argv) != 2 :
+    print 'usage: bz2comonlogfile'
+    sys.exit()
+
+filename = sys.argv[1]
+start_time = time.time()
+if not ('dump_comon_' in filename and filename.endswith('bz2')) :
+    print 'not a comon log file:'
+    sys.exit()
+
+def str_to_ts(date_str, format="%Y-%m-%d"):
+    # Convert a date string to a local-time UNIX timestamp.
+    ts = time.mktime(time.strptime(date_str, format))
+    return ts
+
+
+# .../dump_comon_20080101 -> 2008-01-01
+indx = filename.rfind('dump_comon_') + len('dump_comon') + 1
+date = filename[indx:indx+8]
+date = date[0:4] + '-' + date[4:6] + '-' + date[6:8]
+ts = str_to_ts(date)
+
+# read in bz2 log file
+bzobj = bz2.BZ2File(filename, mode = 'r')
+lines = bzobj.readlines()
+
+last_time = 0
+entry = {}
+processed_tags = set()
+
+# keep track of malformed entries
+n_badTS = 0
+n_ambigTS = 0
+n_NA = {}
+
+important_tags = ['Start:', 'Name:', 'RespTime:']
+
+def get_field(table, *args):
+    # Walk nested dicts along args[:-1], creating levels as needed, and
+    # return the leaf value (initializing it to 0 if absent).
+    pos = table
+    for i,v in enumerate(args[:-1]):
+        if v not in pos:
+            pos[v] = {}
+        pos = pos[v]
+    v = args[-1]
+    if args[-1] not in pos:
+        pos[v] = 0
+    return pos[v]
+
+def set_field(table, *args):
+    # Set leaf args[-2] along the nested-dict path args[:-2] to args[-1].
+    # Assumes the path already exists (callers use get_field first).
+    pos = table
+    for i,v in enumerate(args[:-2]):
+        pos = pos[v]
+    pos[args[-2]] = args[-1]
+
+def isValid(entry):
+    # An entry is valid when every important tag is present and, if an
+    # Uptime: field exists, it parses as a float.
+    for t in important_tags:
+        if t not in entry:
+            #print "bad entry", entry
+            return False
+
+    try:
+        if 'Uptime:' in entry:
+            float(entry['Uptime:'])
+    except:
+        #print "uptime fault"
+        return False
+
+    return True
+
+hs = {} # HOST SUMMARY
+
+# Process log: entries are blank-line separated blocks of "Tag: value" lines.
+for line in lines :
+    line = line.strip()
+    if line == '' :
+        #Process timestamp
+        try :
+            # NOTE(review): entry['Start:'] is a joined string, so [0] is its
+            # first *character*; this_time only feeds the (unreported)
+            # n_ambigTS/n_badTS counters, so left as-is -- confirm intent.
+            this_time = int(entry['Start:'][0])
+            ambigTS = this_time < last_time
+            if ambigTS :
+                n_ambigTS += 1
+            else :
+                last_time = this_time
+            #outcsv.write('%d,%s' % (this_time, ambigTS))
+        except KeyError :
+            continue
+        except :
+            n_badTS += 1
+            entry = {}
+            processed_tags = set()
+            continue
+        #Process other fields
+
+        if not isValid(entry):
+            entry = {}
+            processed_tags = set()
+            continue
+
+        h = entry['Name:']
+
+        if h not in hs:
+            # touch the leaves so later get/set calls find them
+            get_field(hs,h,'offline')
+            get_field(hs,h,'online')
+            get_field(hs,h,'uptime')
+
+        try:
+            # A multi-word RespTime: value means no response -> offline.
+            if len(entry['RespTime:'].split()) > 1:
+                set_field(hs,h,'offline', get_field(hs,h,'offline') + 1)
+            else:
+                set_field(hs,h,'online', get_field(hs,h,'online') + 1)
+                # BUG FIX: compare uptimes numerically.  The original kept the
+                # raw string, so in Python 2 max() always preferred the string
+                # over the initial int 0 and then compared lexicographically
+                # ("99.2" > "123.4"), recording the wrong maximum.  isValid()
+                # already guarantees the field parses as a float.
+                set_field(hs,h,'uptime', max(get_field(hs,h,'uptime'), float(entry['Uptime:'])) )
+        except:
+            #print "except resptime"
+            continue
+
+        entry = {}
+        processed_tags = set()
+    else :
+        words = line.split()
+        tag = words[0]
+        if tag in processed_tags : # start over, since a tag is repeating
+            entry = {}
+            processed_tags = set()
+        entry[tag] = " ".join(words[1:len(words)])
+        processed_tags.add(tag)
+
+# finished processing log file
+
+# clean up memory objs
+#outcsv.close()
+bzobj.close()
+lines = ''
+
+online = 0
+offline = 0
+uptimes = []
+#print "number of hosts:%s" % len(hs)
+for h in hs:
+    if hs[h]['uptime'] > 0: uptimes.append(float(hs[h]['uptime']))
+    if hs[h]['online'] > hs[h]['offline']:
+        online += 1
+    else:
+        offline += 1
+
+l = len(uptimes)
+uptimes.sort()
+end_time = time.time()
+# NOTE(review): raises IndexError if no host reported an uptime -- confirm
+# every dump contains at least one online node with Uptime:.
+print date, ts, online+offline, online, offline, uptimes[0], uptimes[l/4], uptimes[l/2], uptimes[l/2+l/4], uptimes[-1], end_time-start_time, filename
--- /dev/null
+#!/usr/bin/python
+
+# Collect statistics from myops db on node downtimes.
+# For every node that goes down we need:
+# * node down time
+# * node reboot time
+# * node notice time
+# * node up time
+
+# then for each node, order events by time
+# for each event sequence extract sub-sequences like:
+# down xx up
+# for each such sub-sequence extract
+# time between down and up
+
+from monitor.database.info.model import *
+from math import *
+import sys
+from datetime import datetime, timedelta
+from monitor.common import *
+
+# With --pcu, report only nodes that have a PCU; otherwise, only nodes
+# without one.
+if len(sys.argv) > 1 and sys.argv[1] == "--pcu":
+    args = sys.argv[2:]
+    find_pcu = True
+else:
+    args = sys.argv[1:]
+    find_pcu = False
+
+
+for index,node in enumerate(FindbadNodeRecord.query.all()):
+
+    if find_pcu:
+        if not node.plc_pcuid:
+            # NOTE: if we're looking for nodes with pcus, then skip the ones that don't have them.
+            continue
+    else:
+        if node.plc_pcuid:
+            # NOTE: if we're not looking for nodes with pcus, then skip those that have them
+            continue
+
+    # progress report on stderr so stdout stays clean CSV
+    print >>sys.stderr, index, node.hostname
+
+    for v in node.versions:
+        if find_pcu:
+            # we should be guaranteed that the node has pcuids here, but it could be deleted or added later
+            pcu = FindbadPCURecord.get_by(plc_pcuid=v.plc_pcuid)
+            if pcu:
+                v_pcu = pcu.get_as_of(v.timestamp)
+                if v_pcu:
+                    print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), v.hostname, v.observed_status, v_pcu.reboot_trial_status, v_pcu.plc_pcu_stats['model'])
+                else:
+                    # NOTE(review): indentation was ambiguous in the original
+                    # patch; this pairs the else with "if v_pcu" -- confirm.
+                    print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), v.hostname, v.observed_status, "NOPCU", "NONE")
+
+        else:
+            print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), v.hostname, v.observed_status, "NOPCU", "NONE")
--- /dev/null
+#!/usr/bin/python
+
+# Collect statistics from myops db on site failures and recovery times.
+# For every site that goes down we need:
+# * site down time
+# * site up time
+
+from monitor.database.info.model import *
+from math import *
+import sys
+from datetime import datetime, timedelta
+from monitor.common import *
+
+d_start = datetime(2009, 6, 10)
+d_end = datetime.now()
+
+#print "date_checked,date_changed,timestamp,last_changed,sitename,status"
+print "date,t,sitename,status,length"
+
+# Sites come from the command line (loginbases) or, by default, every
+# HistorySiteRecord in the db.
+if len(sys.argv[1:]) > 0:
+    l = enumerate(sys.argv[1:])
+    l = [(i,HistorySiteRecord.get_by(loginbase=lb)) for i,lb in l ]
+else:
+    l = enumerate(HistorySiteRecord.query.all())
+
+for index,site in l:
+    d = d_start
+
+    time_at_level = 0
+    not_printed = True
+    start = False
+
+    # NOTE(review): raises IndexError for a site with no versions -- confirm
+    # that cannot happen in this db.
+    last_level = site.versions[0].penalty_level
+    last_time = site.versions[0].timestamp
+
+    length = len(site.versions)
+    for i,v in enumerate(site.versions[1:]):
+
+        # A failure window opens when fewer than 2 nodes are up.
+        if v.nodes_up < 2 and not start:
+            last_time = v.timestamp
+            start = True
+
+        # It closes when >= 2 nodes are up and the penalty is cleared;
+        # emit the outage length in seconds.
+        if v.nodes_up >= 2 and start and v.penalty_level == 0:
+            time_at_level = Time.dt_to_ts(v.timestamp) - Time.dt_to_ts(last_time)
+            print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp),
+                v.timestamp.strftime("%Y-%m-%d"), v.loginbase, last_level, time_at_level)
+            start = False
+
+        # Track the most recent penalty level for the report above.
+        if last_level != v.penalty_level:
+            last_level = v.penalty_level
+
+        #print i,length-2, last_level != v.penalty_level, i == length-2
+        #if last_level != v.penalty_level or i == length-2:
+        #if last_level > v.penalty_level: # or i == length-2:
+            #print "last_time ", last_time
+            #print "diff times", v.timestamp
+            #print "timestamp ", v.timestamp
+            #print "lastchange", v.last_changed
+        #    time_at_level = Time.dt_to_ts(v.timestamp) - Time.dt_to_ts(last_time)
+        #    print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp),
+        #        v.timestamp.strftime("%Y-%m-%d"), v.loginbase, last_level, time_at_level)
+
+        #if last_level != v.penalty_level:
+        #    #print "levels mis-match", last_level, v.penalty_level
+        #    last_level = v.penalty_level
+        #    if last_level != 2:
+        #        # print "recording time", v.timestamp
+        #        last_time = v.timestamp
+
+
+
+    #while d < d_end:
+#
+#    v = site.get_as_of(d)
+#    if v:
+#        print "%s,%s,%s,%s" % (Time.dt_to_ts(d),
+#            d.strftime("%Y-%m-%d"), v.loginbase, v.penalty_level)
+##
+#    d += timedelta(1)
+
--- /dev/null
+#!/usr/bin/python
+
+# Fetch daily CoMon dumps over scp for a date range and feed each one to
+# comon_summary_parser.py, deleting the dump afterwards.
+
+import time, datetime
+import os
+import sys
+
+# Start date from the command line: year month day.
+start = datetime.datetime(int(sys.argv[1]),int(sys.argv[2]), int(sys.argv[3]))
+#end = datetime.datetime(2010,06,10)
+# BUG FIX: "08" is an invalid octal literal in Python 2 (SyntaxError, since
+# octal digits stop at 7); month/day must be written without a leading zero.
+end = datetime.datetime(2004, 8, 31)
+
+while start < end:
+    file = "dump_comon_%s.bz2" % ( start.strftime("%Y%m%d") )
+    cmd = "scp -q soltesz@opus:/n/fs/codeenlogs/comon_dumps/%s/%s ./" % (start.year, file)
+    os.system(cmd)
+
+    start += datetime.timedelta(1)
+
+    try:
+        # os.stat raises OSError when the scp above failed to fetch the file,
+        # in which case we just move on to the next day.
+        os.stat(file)
+        cmd = "./comon_summary_parser.py %s" % file
+        os.system(cmd)
+        os.remove(file)
+    except OSError:
+        continue
+
+
source("functions.r");
nsh <- read.csv('node_status_history.csv', sep=',', header=TRUE)
nsh_m1_nopcu_total <- read.csv('node_status_history_m1_nopcu_total.csv', sep=',', header=TRUE)
nsh_m1_nopcu_notice <- read.csv('node_status_history_m1_nopcu.csv', sep=',', header=TRUE)
nsh_m1_nopcu_kernel <- read.csv('node_status_history_m1_nopcu_may08sep08.csv', sep=',', header=TRUE)
+nsh_m1_pcu <- read.csv('node_status_history_m1_pcu.csv', sep=',', header=TRUE)
node_hist_image <- function (t, year, from, to, max=0, type="week", title="")
{
dates <-seq(as.Date(from), as.Date(to), type)
x_start<-unclass(as.POSIXct("2008-05-01", origin="1970-01-01"))[1]
x_end <-unclass(as.POSIXct("2009-06-1", origin="1970-01-01"))[1]
tstamp_0510 <-abline_at_date("2008-05-10", col='grey20', lty=0, height=570)
# dates takes from reboot_image() output for API events.
# red
#tstamp_0313 <-abline_at_date("2009-03-13", col='grey70', height=570)
-start_image("myops_restore_nopcu.png")
+# output switched from PNG to EPS
+start_image("myops_restore_nopcu.eps")
par(mfrow=c(2,1))
par(mai=c(.9,.8,.1,.1))
plot(dnc2$start[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0815 & dnc2$start <= tstamp_1015) )],
dnc2$available[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0815 & dnc2$start <= tstamp_1015) )],
type='l', col='red', ylim=c(0,600), xlim=c(x_start, x_end),
- xlab="", ylab="Online Node Count", axes=F)
+ xlab="", ylab="a) Online Node Count", axes=F)
lines(dnc2$start[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0223) )],
dnc2$available[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0223) )],
type='l', col='red')
lines(dnc2$start[which(!is.na(dnc2$available) & dnc2$start > tstamp_1015 & dnc2$start <= tstamp_0223)], dnc2$available[which(!is.na(dnc2$available)& dnc2$start > tstamp_1015 & dnc2$start <= tstamp_0223)], lty=2, type='l', col='blue')
+
lines(dnc2$start[which(!is.na(dnc2$available) & dnc2$start > tstamp_0510 & dnc2$start <= tstamp_0815)], dnc2$available[which(!is.na(dnc2$available)& dnc2$start > tstamp_0510 & dnc2$start <= tstamp_0815)], lty=3, type='l', col='darkgreen')
#lines(dnc2$start[which(!is.na(dnc2$available))], dnc2$available[which(!is.na(dnc2$available))],
d_m1_kernel_bug <- ecdf(nsh_dist_m1/(60*60*24))
+# ECDF of node-restore times for nodes with a PCU, restricted to the window
+nsh_m1_short <- nsh_m1_pcu[which(nsh_m1_pcu$start > t_0815 & nsh_m1_pcu$start <= t_0224),]
+nsh_dist_m1 <- node_hist_dist(nsh_m1_short, '2008', '2008-05-10', '2009-03-22', 0, 'day')
+d_m1_pcu <- ecdf(nsh_dist_m1/(60*60*24))
+
+
+
# d<-ecdf(nsh_dist[which(nsh_dist/(60*60*24) < 90 )]/(60*60*24)),
# 180 ~= 6 months.
par(mai=c(.9,.9,.1,.3))
x_lim_max <- 150
plot(d_m1_total, xlim=c(0,x_lim_max), ylim=c(0,1), axes=F, xlab="Days to Resolve",
- ylab="Fraction of Offline Nodes Restored", col.hor='red', col.vert='red', pch='.',
+ ylab="b) Fraction of Offline Nodes Restored", col.hor='red', col.vert='red', pch='.',
col.points='red', main="")
plot(d_m1_notice_bug, xlim=c(0,x_lim_max), ylim=c(0,1), xlab="Days to Resolve",
col.hor='blue', col.vert='blue', pch='.',
col.points='blue', lty=2, add=TRUE)
plot(d_m1_kernel_bug, xlim=c(0,x_lim_max), ylim=c(0,1), xlab="Days to Resolve",
col.hor='darkgreen', col.vert='darkgreen', pch='.',
col.points='darkgreen', lty=3, add=TRUE)
+#plot(d_m1_pcu, xlim=c(0,x_lim_max), ylim=c(0,1), xlab="Days to Resolve",
+# col.hor='purple', col.vert='purple', pch='.',
+# col.points='purple', lty=4, add=TRUE)
+
weeks <- c(0,7,14,21,28,60,90,120,150,180)
axis(1, labels=weeks, at=weeks)
percentages <- c(0,0.25, 0.5, 0.75, 0.85, 0.95, 1)
--- /dev/null
+#!/usr/bin/python
+
+# Read "ts,host,state,pcustatus,pcumodel" records from stdin (ordered by
+# ts) and print a row whenever a host settles into a new state, together
+# with the time it spent in the previous state.
+
+import sys
+import os
+
+def recent(q, val, length):
+    # Prepend val, keeping at most `length` entries (newest first).
+    return [val] + q[:length-1]
+
+def add(d, path, v):
+    # Record observation v in the bounded (6-entry) history for key `path`.
+    if path not in d:
+        d[path] = []
+    d[path] = recent(d[path], v, 6)
+    #d[path].append(v)
+
+def stateof(l):
+    # Collapse a history to one state: the unanimous value, else "BOOT".
+    if len(set(l)) == 1:
+        return l[0]
+    else:
+        return "BOOT"
+
+def in_hard_state(l):
+    # True when every recent observation agrees.
+    if len(set(l)) == 1:
+        return True
+    else:
+        return False
+
+recent_node_ts = {}
+recent_node_states = {}
+last_node_state = {}
+last_node_ts = {}
+print "ts,host,status,pcustatus,model,length"
+
+while True:
+    line = sys.stdin.readline()
+    if not line:
+        break
+    line = line.strip()
+    # NOTE assumes ts are ordered.
+    try:
+        (ts, host, state, pcu, pcumodel) = line.split(',')
+    except:
+        print >>sys.stderr, "EXCEPTION:", line
+        # Malformed record: scan forward for the continuation line that
+        # carries the pcumodel (",<model>").
+        # NOTE(review): after recovery, ts/host/state still hold values
+        # from the previous good record (or are undefined on the very
+        # first line) -- confirm this fall-through is intended.
+        while True:
+            line = sys.stdin.readline()
+            if not line:
+                break
+            line = line.strip()
+            # BUG FIX: guard against blank lines; a bare "\n" strips to ""
+            # and line[0] raised IndexError.
+            if line and line[0] == ',':
+                pcumodel = line[1:]
+                break
+        #sys.exit(1)
+
+    ts = int(float(ts))
+    add(recent_node_states, host, state)
+    add(recent_node_ts, host, ts)
+
+    if host not in last_node_state:
+        last_node_state[host] = "UNKNOWN"
+        last_node_ts[host] = ts
+
+    # Report when the host is either firmly DOWN or flapping (not in a
+    # hard state), and that differs from the last state we reported.
+    if ( (in_hard_state(recent_node_states[host]) and stateof(recent_node_states[host]) == "DOWN") or \
+        not in_hard_state(recent_node_states[host]) ) and \
+        last_node_state[host] != stateof(recent_node_states[host]):
+
+        if stateof(recent_node_states[host]) == "DOWN":
+            ts = recent_node_ts[host][-1] # get earliest time down (history is newest-first)
+        print "%s,%s,%s,%s,%s,%s" % (ts, host, stateof(recent_node_states[host]), pcu, pcumodel, ts-last_node_ts[host] )
+        last_node_state[host] = stateof(recent_node_states[host])
+        last_node_ts[host] = ts
+
+