new files for dumping and parsing CoMon logs and MyOps node/site status history
author    Stephen Soltesz <soltesz@cs.princeton.edu>    Mon, 26 Jul 2010 16:49:17 +0000 (16:49 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>    Mon, 26 Jul 2010 16:49:17 +0000 (16:49 +0000)
statistics/comon_analysis.r
statistics/comon_summary_parser.py [new file with mode: 0755]
statistics/harvest_nodestatus.py [new file with mode: 0755]
statistics/harvest_site_history.py [new file with mode: 0755]
statistics/iterate_days.py [new file with mode: 0755]
statistics/myops_restoration.r
statistics/parse_nodestatus.py [new file with mode: 0755]

diff --git a/statistics/comon_analysis.r b/statistics/comon_analysis.r
index b0c0747..76445d2 100644 (file)
@@ -59,25 +59,26 @@ time_graph_setup <- function (from, to)
 
 tg <- time_graph_setup('2004/1/1', '2010/6/28')
 
-start_image("platform_availability.png")
-par(mfrow=c(2,1))
+#start_image("platform_availability.png")
+par(mfrow=c(3,1))
 # bottom, left, top, right margins
 par(mai=c(0.2, 1, 0.3, 0.1))
 
 
 
-plot(cm$ts, cm$online, type='l', col='black', 
+plot(cm$ts, cm$total, type='l', col='grey60', 
         axes=F,
         xlab="",
         xlim=tg$xlim,
-        ylim=c(0,900),
+        ylim=c(0,1000),
         ylab="a) Online Node Count")
+lines(cm$ts, cm$online, type='l', col='black' )
 
 #axis(1, labels=tg$month_str, at=tg$month_ts, cex.axis=0.7)
 #axis(1, tick=F, labels=tg$year_str, at=tg$year_ts, cex.axis=0.7, line=1)
 axis(1, labels=c("","","","","","",""), at=tg$year_ts, cex.axis=0.7, line=-0.5)
 axis(2, las=1)
-planetlab_releases(800)
+planetlab_releases(1000)
 
 
 par(mai=c(1, 1, 0.2, 0.1))
@@ -107,4 +108,4 @@ abline(h=log(7*60*60*24), col='grey80', lty=2)
 
 planetlab_releases(18)
 
-end_image()
+#end_image()
diff --git a/statistics/comon_summary_parser.py b/statistics/comon_summary_parser.py
new file mode 100755 (executable)
index 0000000..f7e7a0b
--- /dev/null
@@ -0,0 +1,165 @@
+#!/usr/bin/python
+
+import sys, os, time, bz2
+
+if len(sys.argv) != 2 :
+       print 'usage: comon_summary_parser.py <dump_comon_YYYYMMDD.bz2>'
+       sys.exit()
+
+filename = sys.argv[1]
+start_time = time.time()
+if not ('dump_comon_' in filename and filename.endswith('bz2')) :
+       print 'not a comon log file: %s' % filename
+       sys.exit()
+
+def str_to_ts(date_str, format="%Y-%m-%d"):
+   ts = time.mktime(time.strptime(date_str, format))
+   return ts
+
+
+# .../dump_comon_20080101 -> 2008-01-01
+indx = filename.rfind('dump_comon_') + len('dump_comon_')
+date = filename[indx:indx+8]
+date = date[0:4] + '-' + date[4:6] + '-' + date[6:8]
+ts = str_to_ts(date)
+
+# read in bz2 log file
+bzobj = bz2.BZ2File(filename, mode = 'r')
+lines = bzobj.readlines()
+
+last_time = 0
+entry = {}
+processed_tags = set()
+
+# keep track of malformed entries
+n_badTS = 0
+n_ambigTS = 0
+n_NA = {}
+
+important_tags = ['Start:', 'Name:', 'RespTime:']
+
+def get_field(table, *args):
+       pos = table
+       l = len(args)
+       for i,v in enumerate(args[:-1]):
+               if v not in pos:
+                       pos[v] = {}
+               pos = pos[v]
+       v = args[-1]
+       if args[-1] not in pos:
+               pos[v] = 0
+       return pos[v]
+
+def set_field(table, *args):
+       pos = table
+       l = len(args)
+       #get_field(table, *args[0:-1])
+       for i,v in enumerate(args[:-2]):
+               pos = pos[v]
+       pos[args[-2]] = args[-1]
+
+def isValid(entry):
+       # check important_tags
+       for t in important_tags:
+               if t not in entry: 
+                       #print "bad entry", entry
+                       return False
+
+       try:
+               if 'Uptime:' in entry:
+                       float(entry['Uptime:'])
+       except:
+               #print "uptime fault"
+               return False
+
+       return True
+
+hs = {} # HOST SUMMARY 
+
+# Process log
+for line in lines :
+       line = line.strip()
+       if line == '' :
+               #Process timestamp
+               try :
+                       this_time = int(float(entry['Start:'].split()[0]))
+                       fmtime = time.strftime('%D %T', time.localtime(this_time))
+                       ambigTS = this_time < last_time
+                       if ambigTS :
+                               n_ambigTS += 1
+                       else :
+                               last_time = this_time
+                       #outcsv.write('%d,%s' % (this_time, ambigTS))
+               except KeyError :
+                       continue
+               except :
+                       n_badTS += 1
+                       entry = {}
+                       processed_tags = set()
+                       continue
+               #Process other fields
+               #try : 
+
+
+               if True:
+
+                       if not isValid(entry):
+                               entry = {}
+                               processed_tags = set()
+                               continue
+
+                       h = entry['Name:']
+
+                       if h not in hs:
+                               get_field(hs,h,'offline') 
+                               get_field(hs,h,'online') 
+                               get_field(hs,h,'uptime') 
+
+                       try:
+                               if len(entry['RespTime:'].split()) > 1:
+                                       set_field(hs,h,'offline', get_field(hs,h,'offline') + 1)
+                               else:
+                                       set_field(hs,h,'online', get_field(hs,h,'online') + 1)
+                                       set_field(hs,h,'uptime', max(get_field(hs,h,'uptime'), float(entry['Uptime:'])) )  # compare numerically, not lexically
+                       except:
+                               #print "except resptime"
+                               continue
+
+
+               #except KeyError :
+               ##      print "key error! on hostname: %s" % h
+               #       continue
+
+               entry = {}
+               processed_tags = set()
+       else :
+               words = line.split()
+               tag = words[0]
+               if tag in processed_tags :      # start over, since a tag is repeating
+                       entry = {}
+                       processed_tags = set()
+               entry[tag] = " ".join(words[1:len(words)])
+               processed_tags.add(tag)
+
+# finished processing log file
+                       
+# clean up memory objs
+#outcsv.close()
+bzobj.close()
+lines = ''
+
+online = 0
+offline = 0
+uptimes = []
+#print "number of hosts:%s" % len(hs)
+for h in hs:
+       if hs[h]['uptime'] > 0: uptimes.append(float(hs[h]['uptime']))
+       if hs[h]['online'] > hs[h]['offline']:
+               online += 1
+       else:
+               offline += 1
+
+l = len(uptimes)
+uptimes.sort()
+end_time = time.time()
+print date, ts, online+offline, online, offline, uptimes[0], uptimes[l/4], uptimes[l/2], uptimes[l/2+l/4], uptimes[-1], end_time-start_time, filename
diff --git a/statistics/harvest_nodestatus.py b/statistics/harvest_nodestatus.py
new file mode 100755 (executable)
index 0000000..279cb16
--- /dev/null
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+
+# Collect statistics from myops db on node downtimes.
+# For every node that goes down we need:
+#    * node down time
+#    * node reboot time
+#    * node notice time
+#    * node up time
+
+# then for each node, order events by time
+#    for each event sequence extract sub-sequences like:
+#        down xx up         
+#    for each such sub-sequence extract
+#        time between down and up
+
+from monitor.database.info.model import *
+from math import *
+import sys
+from datetime import datetime, timedelta
+from monitor.common import *
+
+if len(sys.argv) > 1 and sys.argv[1] == "--pcu":
+    args = sys.argv[2:]
+    find_pcu = True
+else:
+    args = sys.argv[1:]
+    find_pcu = False
+
+
+for index,node in enumerate(FindbadNodeRecord.query.all()):
+
+    if find_pcu:
+        if not node.plc_pcuid: 
+            # NOTE: if we're looking for nodes with pcus, then skip the ones that don't have them.
+            continue
+    else:
+        if node.plc_pcuid:
+            # NOTE: if we're not looking for nodes with pcus, then skip those that have them
+            continue
+
+    print >>sys.stderr, index, node.hostname
+
+    for v in node.versions:
+        if find_pcu:
+            # we should be guaranteed that the node has pcuids here, but it could be deleted or added later
+            pcu = FindbadPCURecord.get_by(plc_pcuid=v.plc_pcuid)
+            if pcu:
+                v_pcu = pcu.get_as_of(v.timestamp)
+                if v_pcu:
+                    print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), v.hostname, v.observed_status, v_pcu.reboot_trial_status, v_pcu.plc_pcu_stats['model'])
+            else:
+                print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), v.hostname, v.observed_status, "NOPCU", "NONE")
+                
+        else:
+            print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), v.hostname, v.observed_status, "NOPCU", "NONE")
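
The header comments above describe the downstream analysis this dump feeds: order each node's events by time, pull out "down ... up" sub-sequences, and measure the time between the down event and the following up. A minimal sketch of that extraction step is given here for illustration; it is not part of this commit, and the function name and input shape (a time-ordered list of (timestamp, status) pairs per node, derivable from the CSV this script emits) are assumptions.

    def downtime_intervals(events):
        """events: (ts, status) pairs for one node, sorted by ts."""
        intervals = []
        down_ts = None
        for ts, status in events:
            if status == "DOWN" and down_ts is None:
                down_ts = ts                      # node goes down; remember when
            elif status == "BOOT" and down_ts is not None:
                intervals.append(ts - down_ts)    # time between down and up
                down_ts = None                    # reset for the next sub-sequence
        return intervals

    # e.g. downtime_intervals([(0, "BOOT"), (100, "DOWN"), (700, "BOOT")]) == [600]
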
diff --git a/statistics/harvest_site_history.py b/statistics/harvest_site_history.py
new file mode 100755 (executable)
index 0000000..8ec3335
--- /dev/null
@@ -0,0 +1,80 @@
+#!/usr/bin/python
+
+# Collect statistics from myops db on site failures and recovery times.
+# For every site that goes down we need:
+#    * site down time
+#    * site up time
+
+from monitor.database.info.model import *
+from math import *
+import sys
+from datetime import datetime, timedelta
+from monitor.common import *
+
+d_start = datetime(2009, 6, 10)
+d_end = datetime.now()
+
+#print "date_checked,date_changed,timestamp,last_changed,sitename,status"
+print "t,date,sitename,status,length"
+
+if len(sys.argv[1:]) > 0:
+    l = enumerate(sys.argv[1:])
+    l = [(i,HistorySiteRecord.get_by(loginbase=lb)) for i,lb in l ]
+else:
+    l = enumerate(HistorySiteRecord.query.all())
+
+for index,site in l:
+    d = d_start
+
+    time_at_level = 0
+    not_printed = True
+    start = False
+
+    last_level = site.versions[0].penalty_level
+    last_time = site.versions[0].timestamp
+
+    length = len(site.versions)
+    for i,v in enumerate(site.versions[1:]):
+
+        if v.nodes_up < 2 and not start:
+            last_time = v.timestamp
+            start = True
+
+        if v.nodes_up >= 2 and start and v.penalty_level == 0:
+            time_at_level = Time.dt_to_ts(v.timestamp) - Time.dt_to_ts(last_time)
+            print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), 
+                v.timestamp.strftime("%Y-%m-%d"), v.loginbase, last_level, time_at_level)
+            start = False
+
+        if last_level != v.penalty_level:
+            last_level = v.penalty_level
+
+        #print i,length-2, last_level != v.penalty_level, i == length-2
+        #if last_level != v.penalty_level or i == length-2:
+        #if last_level > v.penalty_level: # or i == length-2:
+            #print "last_time ", last_time
+            #print "diff times", v.timestamp
+            #print "timestamp ", v.timestamp 
+            #print "lastchange", v.last_changed
+        #    time_at_level = Time.dt_to_ts(v.timestamp) - Time.dt_to_ts(last_time)
+        #    print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), 
+        #        v.timestamp.strftime("%Y-%m-%d"), v.loginbase, last_level, time_at_level)
+
+        #if last_level != v.penalty_level:
+        #    #print "levels mis-match", last_level, v.penalty_level
+        #    last_level = v.penalty_level
+        #    if last_level != 2:
+        #    #    print "recording time", v.timestamp
+        #        last_time = v.timestamp
+
+
+        
+    #while d < d_end:
+#
+#        v = site.get_as_of(d)
+#        if v:
+#            print "%s,%s,%s,%s" % (Time.dt_to_ts(d),
+#                    d.strftime("%Y-%m-%d"), v.loginbase, v.penalty_level)
+##
+#        d += timedelta(1)
+
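
The loop above measures site recovery time: a failure window opens when a site has fewer than 2 nodes up, and closes when at least 2 nodes are up again and the penalty level is back to 0, at which point the elapsed time is printed. A standalone sketch of that rule follows for illustration; it is not part of this commit, and the (ts, nodes_up, penalty_level) record shape is an assumption standing in for the HistorySiteRecord versions used above.

    def site_recovery_times(versions):
        """versions: per-site (ts, nodes_up, penalty_level) tuples, sorted by ts."""
        times = []
        down_since = None
        for ts, nodes_up, penalty_level in versions:
            if nodes_up < 2 and down_since is None:
                down_since = ts                   # site drops below 2 working nodes
            elif nodes_up >= 2 and penalty_level == 0 and down_since is not None:
                times.append(ts - down_since)     # failure-to-recovery time
                down_since = None
        return times
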
diff --git a/statistics/iterate_days.py b/statistics/iterate_days.py
new file mode 100755 (executable)
index 0000000..13e16bd
--- /dev/null
@@ -0,0 +1,26 @@
+#!/usr/bin/python
+
+import time, datetime
+import os
+import sys
+
+start = datetime.datetime(int(sys.argv[1]),int(sys.argv[2]), int(sys.argv[3]))
+#end = datetime.datetime(2010,06,10)
+end = datetime.datetime(2004, 8, 31)
+
+while start < end:
+       file = "dump_comon_%s.bz2" % ( start.strftime("%Y%m%d") )
+       cmd = "scp -q soltesz@opus:/n/fs/codeenlogs/comon_dumps/%s/%s ./" % (start.year, file)
+       os.system(cmd)
+
+       start += datetime.timedelta(1)
+
+       try:
+               os.stat(file)
+               cmd = "./comon_summary_parser.py %s" % file
+               os.system(cmd)
+               os.remove(file)
+       except OSError:
+               continue
+               
+
diff --git a/statistics/myops_restoration.r b/statistics/myops_restoration.r
index 838e0b1..d953004 100644 (file)
@@ -1,5 +1,6 @@
 source("functions.r");
 
+print ("fuckyou 0");
 
 nsh <- read.csv('node_status_history.csv', sep=',', header=TRUE)
 
@@ -11,7 +12,9 @@ nsh_m1 <- read.csv('node_status_history_m1.csv', sep=',', header=TRUE)
 nsh_m1_nopcu_total <- read.csv('node_status_history_m1_nopcu_total.csv', sep=',', header=TRUE)
 nsh_m1_nopcu_notice <- read.csv('node_status_history_m1_nopcu.csv', sep=',', header=TRUE)
 nsh_m1_nopcu_kernel <- read.csv('node_status_history_m1_nopcu_may08sep08.csv', sep=',', header=TRUE)
+nsh_m1_pcu <- read.csv('node_status_history_m1_pcu.csv', sep=',', header=TRUE)
 
+print ("fuckyou 0a");
 node_hist_image <- function (t, year, from, to, max=0, type="week", title="")
 {
     dates <-seq(as.Date(from), as.Date(to), type)
@@ -131,6 +134,7 @@ hbreaks<-unclass(as.POSIXct(dates))
 x_start<-unclass(as.POSIXct("2008-05-01", origin="1970-01-01"))[1]
 x_end  <-unclass(as.POSIXct("2009-06-1", origin="1970-01-01"))[1]
 
+print ("fuckyou 0b");
 
 tstamp_0510 <-abline_at_date("2008-05-10", col='grey20', lty=0, height=570)
 # dates takes from reboot_image() output for API events.
@@ -149,20 +153,25 @@ tstamp_0223 <-abline_at_date("2009-02-23", col='grey70', height=570)
 # red
 #tstamp_0313 <-abline_at_date("2009-03-13", col='grey70', height=570)
 
-start_image("myops_restore_nopcu.png")
+print ("fuckyou 0c");
+start_image("myops_restore_nopcu.eps")
 par(mfrow=c(2,1))
 par(mai=c(.9,.8,.1,.1))
+print ("fuckyou 1");
 plot(dnc2$start[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0815 & dnc2$start <= tstamp_1015) )], 
     dnc2$available[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0815 & dnc2$start <= tstamp_1015) )], 
     type='l', col='red', ylim=c(0,600), xlim=c(x_start, x_end),
-    xlab="", ylab="Online Node Count", axes=F)
+    xlab="", ylab="a) Online Node Count", axes=F)
 
+print ("fuckyou 2");
 lines(dnc2$start[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0223) )], 
     dnc2$available[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0223) )], 
     type='l', col='red')
 
 lines(dnc2$start[which(!is.na(dnc2$available) & dnc2$start > tstamp_1015 & dnc2$start <= tstamp_0223)], dnc2$available[which(!is.na(dnc2$available)& dnc2$start > tstamp_1015 & dnc2$start <= tstamp_0223)], lty=2, type='l', col='blue')
 
+print ("fuckyou 4");
+
 lines(dnc2$start[which(!is.na(dnc2$available) & dnc2$start > tstamp_0510 & dnc2$start <= tstamp_0815)], dnc2$available[which(!is.na(dnc2$available)& dnc2$start > tstamp_0510 & dnc2$start <= tstamp_0815)], lty=3, type='l', col='darkgreen')
 
 #lines(dnc2$start[which(!is.na(dnc2$available))], dnc2$available[which(!is.na(dnc2$available))], 
@@ -259,6 +268,12 @@ nsh_dist_m1 <- node_hist_dist(nsh_m1_short, '2008', '2008-05-10', '2008-08-15',
 d_m1_kernel_bug <- ecdf(nsh_dist_m1/(60*60*24))
 
 
+nsh_m1_short <- nsh_m1_pcu[which(nsh_m1_pcu$start > t_0815 & nsh_m1_pcu$start <= t_0224),]
+nsh_dist_m1 <- node_hist_dist(nsh_m1_short, '2008', '2008-05-10', '2009-03-22', 0, 'day')
+d_m1_pcu <- ecdf(nsh_dist_m1/(60*60*24))
+
+
+
 # d<-ecdf(nsh_dist[which(nsh_dist/(60*60*24) < 90 )]/(60*60*24)), 
 # 180 ~= 6 months.
 par(mai=c(.9,.9,.1,.3))
@@ -267,18 +282,25 @@ par(mai=c(.9,.9,.1,.3))
 
 x_lim_max <- 150
 
+print ("fuckyou 4a");
 plot(d_m1_total, xlim=c(0,x_lim_max), ylim=c(0,1), axes=F, xlab="Days to Resolve", 
-    ylab="Fraction of Offline Nodes Restored", col.hor='red', col.vert='red', pch='.', 
+    ylab="b) Fraction of Offline Nodes Restored", col.hor='red', col.vert='red', pch='.', 
     col.points='red', main="")
+print ("fuckyou 5");
 
 plot(d_m1_notice_bug, xlim=c(0,x_lim_max), ylim=c(0,1), xlab="Days to Resolve", 
     col.hor='blue', col.vert='blue', pch='.', 
     col.points='blue', lty=2, add=TRUE)
+print ("fuckyou 6");
 
 plot(d_m1_kernel_bug, xlim=c(0,x_lim_max), ylim=c(0,1), xlab="Days to Resolve", 
     col.hor='darkgreen', col.vert='darkgreen', pch='.', 
     col.points='darkgreen', lty=3, add=TRUE)
 
+#plot(d_m1_pcu, xlim=c(0,x_lim_max), ylim=c(0,1), xlab="Days to Resolve", 
+#    col.hor='purple', col.vert='purple', pch='.', 
+#    col.points='purple', lty=4, add=TRUE)
+
 weeks <- c(0,7,14,21,28,60,90,120,150,180)
 axis(1, labels=weeks, at=weeks)
 percentages <- c(0,0.25, 0.5, 0.75, 0.85, 0.95, 1)
diff --git a/statistics/parse_nodestatus.py b/statistics/parse_nodestatus.py
new file mode 100755 (executable)
index 0000000..d0072f3
--- /dev/null
@@ -0,0 +1,71 @@
+#!/usr/bin/python
+
+import sys
+import os
+
+def recent(q, val, length):
+    return [val] + q[:length-1]
+
+def add(d, path, v):
+    if path not in d:
+        d[path] = []
+    d[path] = recent(d[path], v, 6)
+    #d[path].append(v)
+
+def stateof(l):
+    if len(set(l)) == 1:
+        return l[0]
+    else:
+        return "BOOT"
+
+def in_hard_state(l):
+    if len(set(l)) == 1:
+        return True
+    else:
+        return False
+
+recent_node_ts = {}
+recent_node_states = {}
+last_node_state = {}
+last_node_ts = {}
+print "ts,host,status,pcustatus,model,length"
+
+while True:
+    line = sys.stdin.readline()
+    if not line:
+        break
+    line = line.strip()
+    # NOTE assumes ts are ordered.
+    try:
+        (ts, host, state, pcu, pcumodel) = line.split(',')
+    except:
+        print >>sys.stderr, "EXCEPTION:", line
+        while True:
+            line = sys.stdin.readline()
+            if not line:
+                break
+            line = line.strip()
+            if line.startswith(','):
+                pcumodel = line[1:]
+                break
+        #sys.exit(1)
+
+    ts = int(float(ts))
+    add(recent_node_states, host, state)
+    add(recent_node_ts, host, ts)
+
+    if host not in last_node_state:
+        last_node_state[host] = "UNKNOWN"
+        last_node_ts[host] = ts
+        
+    if ( (in_hard_state(recent_node_states[host]) and stateof(recent_node_states[host]) == "DOWN") or \
+        not in_hard_state(recent_node_states[host]) ) and \
+       last_node_state[host] != stateof(recent_node_states[host]):
+
+        if stateof(recent_node_states[host]) == "DOWN":
+            ts = recent_node_ts[host][-1]   # get earliest time down
+        print "%s,%s,%s,%s,%s,%s" % (ts, host, stateof(recent_node_states[host]), pcu, pcumodel, ts-last_node_ts[host] )
+        last_node_state[host] = stateof(recent_node_states[host])
+        last_node_ts[host] = ts
+
+