From 145829c491624619f34509d22b0cd1788f796f92 Mon Sep 17 00:00:00 2001
From: Stephen Soltesz
Date: Mon, 26 Jul 2010 16:49:17 +0000
Subject: [PATCH] add new scripts for dumping and parsing CoMon and MyOps logs

---
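Notes:

harvest_nodestatus.py and parse_nodestatus.py below extract, per node, the
down -> up sub-sequences and the time between them. This is a minimal sketch of
that interval extraction, assuming time-ordered (timestamp, status) pairs with
the "DOWN"/"UP" labels the scripts use; the down_intervals() helper and the
sample events are illustrative only and not part of this patch.

#!/usr/bin/python
# Sketch only: measure down->up durations from a time-ordered event list,
# as described in the harvest_nodestatus.py header comments.

def down_intervals(events):
    """events: list of (ts, status) pairs, ordered by ts."""
    intervals = []
    down_ts = None
    for ts, status in events:
        if status == "DOWN" and down_ts is None:
            down_ts = ts                        # first observation of the outage
        elif status == "UP" and down_ts is not None:
            intervals.append(ts - down_ts)      # time between down and up
            down_ts = None
    return intervals

if __name__ == "__main__":
    # two outages: 100s and 50s
    print down_intervals([(0, "UP"), (10, "DOWN"), (110, "UP"),
                          (200, "DOWN"), (250, "UP")])

parse_nodestatus.py refines this idea with a sliding window of recent
observations so that brief flapping is not counted as a recovery.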
 statistics/comon_analysis.r        |  13 +--
 statistics/comon_summary_parser.py | 165 +++++++++++++++++++++++++++++
 statistics/harvest_nodestatus.py   |  55 ++++++++++
 statistics/harvest_site_history.py |  80 ++++++++++++++
 statistics/iterate_days.py         |  26 +++++
 statistics/myops_restoration.r     |  28 ++++-
 statistics/parse_nodestatus.py     |  71 +++++++++++++
 7 files changed, 429 insertions(+), 9 deletions(-)
 create mode 100755 statistics/comon_summary_parser.py
 create mode 100755 statistics/harvest_nodestatus.py
 create mode 100755 statistics/harvest_site_history.py
 create mode 100755 statistics/iterate_days.py
 create mode 100755 statistics/parse_nodestatus.py

diff --git a/statistics/comon_analysis.r b/statistics/comon_analysis.r
index b0c0747..76445d2 100644
--- a/statistics/comon_analysis.r
+++ b/statistics/comon_analysis.r
@@ -59,25 +59,26 @@ time_graph_setup <- function (from, to)
 
 tg <- time_graph_setup('2004/1/1', '2010/6/28')
 
-start_image("platform_availability.png")
-par(mfrow=c(2,1))
+#start_image("platform_availability.png")
+par(mfrow=c(3,1))
 
 # bottom, left, top, right margins
 par(mai=c(0.2, 1, 0.3, 0.1))
 
-plot(cm$ts, cm$online, type='l', col='black',
+plot(cm$ts, cm$total, type='l', col='grey60',
 	axes=F,
 	xlab="",
 	xlim=tg$xlim,
-	ylim=c(0,900),
+	ylim=c(0,1000),
 	ylab="a) Online Node Count")
+lines(cm$ts, cm$online, type='l', col='black' )
 
 #axis(1, labels=tg$month_str, at=tg$month_ts, cex.axis=0.7)
 #axis(1, tick=F, labels=tg$year_str, at=tg$year_ts, cex.axis=0.7, line=1)
 axis(1, labels=c("","","","","","",""), at=tg$year_ts, cex.axis=0.7, line=-0.5)
 axis(2, las=1)
 
-planetlab_releases(800)
+planetlab_releases(1000)
 
 par(mai=c(1, 1, 0.2, 0.1))
 
@@ -107,4 +108,4 @@ abline(h=log(7*60*60*24), col='grey80', lty=2)
 
 planetlab_releases(18)
 
-end_image()
+#end_image()
diff --git a/statistics/comon_summary_parser.py b/statistics/comon_summary_parser.py
new file mode 100755
index 0000000..f7e7a0b
--- /dev/null
+++ b/statistics/comon_summary_parser.py
@@ -0,0 +1,165 @@
+#!/usr/bin/python
+
+import sys, os, time, bz2
+
+if len(sys.argv) != 2 :
+    print 'usage: comon_summary_parser.py <dump_comon_YYYYMMDD.bz2>'
+    sys.exit()
+
+filename = sys.argv[1]
+start_time = time.time()
+if not ('dump_comon_' in filename and filename.endswith('bz2')) :
+    print 'not a comon log file:', filename
+    sys.exit()
+
+def str_to_ts(date_str, format="%Y-%m-%d"):
+    ts = time.mktime(time.strptime(date_str, format))
+    return ts
+
+
+# .../dump_comon_20080101 -> 2008-01-01
+indx = filename.rfind('dump_comon_') + len('dump_comon') + 1
+date = filename[indx:indx+8]
+date = date[0:4] + '-' + date[4:6] + '-' + date[6:8]
+ts = str_to_ts(date)
+
+# read in bz2 log file
+bzobj = bz2.BZ2File(filename, mode = 'r')
+lines = bzobj.readlines()
+
+last_time = 0
+entry = {}
+processed_tags = set()
+
+# keep track of malformed entries
+n_badTS = 0
+n_ambigTS = 0
+n_NA = {}
+
+important_tags = ['Start:', 'Name:', 'RespTime:']
+
+def get_field(table, *args):
+    pos = table
+    for i,v in enumerate(args[:-1]):
+        if v not in pos:
+            pos[v] = {}
+        pos = pos[v]
+    v = args[-1]
+    if args[-1] not in pos:
+        pos[v] = 0
+    return pos[v]
+
+def set_field(table, *args):
+    pos = table
+    #get_field(table, *args[0:-1])
+    for i,v in enumerate(args[:-2]):
+        pos = pos[v]
+    pos[args[-2]] = args[-1]
+
+def isValid(entry):
+    # check important_tags
+    for t in important_tags:
+        if t not in entry:
+            #print "bad entry", entry
+            return False
+
+    try:
+        if 'Uptime:' in entry:
+            float(entry['Uptime:'])
+    except:
+        #print "uptime fault"
+        return False
+
+    return True
+
+hs = {}  # HOST SUMMARY
+
+# Process log
+for line in lines :
+    line = line.strip()
+    if line == '' :
+        # a blank line ends one record; process the accumulated entry
+        # Process timestamp
+        try :
+            this_time = int(entry['Start:'].split()[0])
+            fmtime = time.strftime('%D %T', time.localtime(this_time))
+            ambigTS = this_time < last_time
+            if ambigTS :
+                n_ambigTS += 1
+            else :
+                last_time = this_time
+            #outcsv.write('%d,%s' % (this_time, ambigTS))
+        except KeyError :
+            continue
+        except :
+            n_badTS += 1
+            entry = {}
+            processed_tags = set()
+            continue
+
+        # Process other fields
+        if not isValid(entry):
+            entry = {}
+            processed_tags = set()
+            continue
+
+        h = entry['Name:']
+
+        if h not in hs:
+            get_field(hs,h,'offline')
+            get_field(hs,h,'online')
+            get_field(hs,h,'uptime')
+
+        try:
+            # treat a multi-word RespTime: value as an offline observation
+            if len(entry['RespTime:'].split()) > 1:
+                set_field(hs,h,'offline', get_field(hs,h,'offline') + 1)
+            else:
+                set_field(hs,h,'online', get_field(hs,h,'online') + 1)
+                set_field(hs,h,'uptime', max(get_field(hs,h,'uptime'), entry['Uptime:']) )
+        except:
+            #print "except resptime"
+            continue
+
+        entry = {}
+        processed_tags = set()
+    else :
+        words = line.split()
+        tag = words[0]
+        if tag in processed_tags : # start over, since a tag is repeating
+            entry = {}
+            processed_tags = set()
+        entry[tag] = " ".join(words[1:len(words)])
+        processed_tags.add(tag)
+
+# finished processing log file
+
+# clean up memory objs
+#outcsv.close()
+bzobj.close()
+lines = ''
+
+online = 0
+offline = 0
+uptimes = []
+#print "number of hosts:%s" % len(hs)
+for h in hs:
+    if hs[h]['uptime'] > 0: uptimes.append(float(hs[h]['uptime']))
+    if hs[h]['online'] > hs[h]['offline']:
+        online += 1
+    else:
+        offline += 1
+
+l = len(uptimes)
+uptimes.sort()
+end_time = time.time()
+print date, ts, online+offline, online, offline, uptimes[0], uptimes[l/4], uptimes[l/2], uptimes[l/2+l/4], uptimes[-1], end_time-start_time, filename
diff --git a/statistics/harvest_nodestatus.py b/statistics/harvest_nodestatus.py
new file mode 100755
index 0000000..279cb16
--- /dev/null
+++ b/statistics/harvest_nodestatus.py
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+
+# Collect statistics from the myops db on node downtimes.
+# For every node that goes down we need:
+#  * node down time
+#  * node reboot time
+#  * node notice time
+#  * node up time
+#
+# Then, for each node, order events by time.
+# For each event sequence, extract sub-sequences like:
+#    down xx up
+# and for each such sub-sequence extract the
+# time between down and up.
+
+from monitor.database.info.model import *
+from math import *
+import sys
+from datetime import datetime, timedelta
+from monitor.common import *
+
+if len(sys.argv) > 1 and sys.argv[1] == "--pcu":
+    args = sys.argv[2:]
+    find_pcu = True
+else:
+    args = sys.argv[1:]
+    find_pcu = False
+
+
+for index,node in enumerate(FindbadNodeRecord.query.all()):
+
+    if find_pcu:
+        if not node.plc_pcuid:
+            # NOTE: we're looking for nodes with PCUs, so skip the ones that don't have them.
+            continue
+    else:
+        if node.plc_pcuid:
+            # NOTE: we're not looking for nodes with PCUs, so skip those that have them.
+            continue
+
+    print >>sys.stderr, index, node.hostname
+
+    for v in node.versions:
+        if find_pcu:
+            # the node should be guaranteed to have a pcuid here, but it could be deleted or added later
+            pcu = FindbadPCURecord.get_by(plc_pcuid=v.plc_pcuid)
+            if pcu:
+                v_pcu = pcu.get_as_of(v.timestamp)
+                if v_pcu:
+                    print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), v.hostname, v.observed_status, v_pcu.reboot_trial_status, v_pcu.plc_pcu_stats['model'])
+                else:
+                    print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), v.hostname, v.observed_status, "NOPCU", "NONE")
+        else:
+            print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp), v.hostname, v.observed_status, "NOPCU", "NONE")
diff --git a/statistics/harvest_site_history.py b/statistics/harvest_site_history.py
new file mode 100755
index 0000000..8ec3335
--- /dev/null
+++ b/statistics/harvest_site_history.py
@@ -0,0 +1,80 @@
+#!/usr/bin/python
+
+# Collect statistics from the myops db on site failures and recovery times.
+# For every site that goes down we need:
+#  * site down time
+#  * site up time
+
+from monitor.database.info.model import *
+from math import *
+import sys
+from datetime import datetime, timedelta
+from monitor.common import *
+
+d_start = datetime(2009, 6, 10)
+d_end = datetime.now()
+
+#print "date_checked,date_changed,timestamp,last_changed,sitename,status"
+print "date,t,sitename,status,length"
+
+if len(sys.argv[1:]) > 0:
+    l = enumerate(sys.argv[1:])
+    l = [ (i,HistorySiteRecord.get_by(loginbase=lb)) for i,lb in l ]
+else:
+    l = enumerate(HistorySiteRecord.query.all())
+
+for index,site in l:
+    d = d_start
+
+    time_at_level = 0
+    not_printed = True
+    start = False
+
+    last_level = site.versions[0].penalty_level
+    last_time = site.versions[0].timestamp
+
+    length = len(site.versions)
+    for i,v in enumerate(site.versions[1:]):
+
+        # a site "goes down" when fewer than two of its nodes are up
+        if v.nodes_up < 2 and not start:
+            last_time = v.timestamp
+            start = True
+
+        # ...and recovers when two or more nodes are up and no penalty applies
+        if v.nodes_up >= 2 and start and v.penalty_level == 0:
+            time_at_level = Time.dt_to_ts(v.timestamp) - Time.dt_to_ts(last_time)
+            print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp),
+                v.timestamp.strftime("%Y-%m-%d"), v.loginbase, last_level, time_at_level)
+            start = False
+
+        if last_level != v.penalty_level:
+            last_level = v.penalty_level
+
+        #print i,length-2, last_level != v.penalty_level, i == length-2
+        #if last_level != v.penalty_level or i == length-2:
+        #if last_level > v.penalty_level: # or i == length-2:
+        #    #print "last_time ", last_time
+        #    #print "diff times", v.timestamp
+        #    #print "timestamp ", v.timestamp
+        #    #print "lastchange", v.last_changed
+        #    time_at_level = Time.dt_to_ts(v.timestamp) - Time.dt_to_ts(last_time)
+        #    print "%s,%s,%s,%s,%s" % (Time.dt_to_ts(v.timestamp),
+        #        v.timestamp.strftime("%Y-%m-%d"), v.loginbase, last_level, time_at_level)
+
+        #if last_level != v.penalty_level:
+        #    #print "levels mis-match", last_level, v.penalty_level
+        #    last_level = v.penalty_level
+        #    if last_level != 2:
+        #        #print "recording time", v.timestamp
+        #        last_time = v.timestamp
+
+    #while d < d_end:
+    #    v = site.get_as_of(d)
+    #    if v:
+    #        print "%s,%s,%s,%s" % (Time.dt_to_ts(d),
+    #            d.strftime("%Y-%m-%d"), v.loginbase, v.penalty_level)
+    #
+    #    d += timedelta(1)
diff --git a/statistics/iterate_days.py b/statistics/iterate_days.py
new file mode 100755
index 0000000..13e16bd
--- /dev/null
+++ b/statistics/iterate_days.py
@@ -0,0 +1,26 @@
+#!/usr/bin/python
+
+import time, datetime
+import os
+import sys
+
+start = datetime.datetime(int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]))
+#end = datetime.datetime(2010, 6, 10)
+end = datetime.datetime(2004, 8, 31)
+
+while start < end:
+    # fetch one day's CoMon dump, then summarize and delete it
+    file = "dump_comon_%s.bz2" % ( start.strftime("%Y%m%d") )
+    cmd = "scp -q soltesz@opus:/n/fs/codeenlogs/comon_dumps/%s/%s ./" % (start.year, file)
+    os.system(cmd)
+
+    start += datetime.timedelta(1)
+
+    try:
+        os.stat(file)
+        cmd = "./comon_summary_parser.py %s" % file
+        os.system(cmd)
+        os.remove(file)
+    except OSError:
+        # the dump for this day was not available; move on
+        continue
diff --git a/statistics/myops_restoration.r b/statistics/myops_restoration.r
index 838e0b1..d953004 100644
--- a/statistics/myops_restoration.r
+++ b/statistics/myops_restoration.r
@@ -1,5 +1,6 @@
 source("functions.r");
 
+print ("checkpoint 0");
 
 nsh <- read.csv('node_status_history.csv', sep=',', header=TRUE)
 
@@ -11,7 +12,9 @@ nsh_m1 <- read.csv('node_status_history_m1.csv', sep=',', header=TRUE)
 nsh_m1_nopcu_total <- read.csv('node_status_history_m1_nopcu_total.csv', sep=',', header=TRUE)
 nsh_m1_nopcu_notice <- read.csv('node_status_history_m1_nopcu.csv', sep=',', header=TRUE)
 nsh_m1_nopcu_kernel <- read.csv('node_status_history_m1_nopcu_may08sep08.csv', sep=',', header=TRUE)
+nsh_m1_pcu <- read.csv('node_status_history_m1_pcu.csv', sep=',', header=TRUE)
 
+print ("checkpoint 0a");
 node_hist_image <- function (t, year, from, to, max=0, type="week", title="")
 {
 	dates <-seq(as.Date(from), as.Date(to), type)
@@ -131,6 +134,7 @@ hbreaks<-unclass(as.POSIXct(dates))
 x_start<-unclass(as.POSIXct("2008-05-01", origin="1970-01-01"))[1]
 x_end  <-unclass(as.POSIXct("2009-06-1", origin="1970-01-01"))[1]
 
+print ("checkpoint 0b");
 tstamp_0510 <-abline_at_date("2008-05-10", col='grey20', lty=0, height=570)
 
 # dates takes from reboot_image() output for API events.
@@ -149,20 +153,25 @@ tstamp_0223 <-abline_at_date("2009-02-23", col='grey70', height=570) # red
 
 #tstamp_0313 <-abline_at_date("2009-03-13", col='grey70', height=570)
 
-start_image("myops_restore_nopcu.png")
+print ("checkpoint 0c");
+start_image("myops_restore_nopcu.eps")
 par(mfrow=c(2,1))
 par(mai=c(.9,.8,.1,.1))
 
+print ("checkpoint 1");
 plot(dnc2$start[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0815 & dnc2$start <= tstamp_1015) )],
 	dnc2$available[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0815 & dnc2$start <= tstamp_1015) )],
 	type='l', col='red', ylim=c(0,600), xlim=c(x_start, x_end),
-	xlab="", ylab="Online Node Count", axes=F)
+	xlab="", ylab="a) Online Node Count", axes=F)
 
+print ("checkpoint 2");
 lines(dnc2$start[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0223) )],
 	dnc2$available[which(!is.na(dnc2$available) & (dnc2$start > tstamp_0223) )],
 	type='l', col='red')
 lines(dnc2$start[which(!is.na(dnc2$available) & dnc2$start > tstamp_1015 & dnc2$start <= tstamp_0223)],
 	dnc2$available[which(!is.na(dnc2$available)& dnc2$start > tstamp_1015 & dnc2$start <= tstamp_0223)],
 	lty=2, type='l', col='blue')
+print ("checkpoint 4");
+
 lines(dnc2$start[which(!is.na(dnc2$available) & dnc2$start > tstamp_0510 & dnc2$start <= tstamp_0815)],
 	dnc2$available[which(!is.na(dnc2$available)& dnc2$start > tstamp_0510 & dnc2$start <= tstamp_0815)],
 	lty=3, type='l', col='darkgreen')
 #lines(dnc2$start[which(!is.na(dnc2$available))], dnc2$available[which(!is.na(dnc2$available))],
@@ -259,6 +268,12 @@ nsh_dist_m1 <- node_hist_dist(nsh_m1_short, '2008', '2008-05-10', '2008-08-15',
 
 d_m1_kernel_bug <- ecdf(nsh_dist_m1/(60*60*24))
 
+nsh_m1_short <- nsh_m1_pcu[which(nsh_m1_pcu$start > t_0815 & nsh_m1_pcu$start <= t_0224),]
+nsh_dist_m1 <- node_hist_dist(nsh_m1_short, '2008', '2008-05-10', '2009-03-22', 0, 'day')
+d_m1_pcu <- ecdf(nsh_dist_m1/(60*60*24))
+
+
+
 # d<-ecdf(nsh_dist[which(nsh_dist/(60*60*24) < 90 )]/(60*60*24)),
 # 180 ~= 6 months.
 par(mai=c(.9,.9,.1,.3))
@@ -267,18 +282,25 @@ par(mai=c(.9,.9,.1,.3))
 
 x_lim_max <- 150
 
+print ("checkpoint 4a");
 plot(d_m1_total, xlim=c(0,x_lim_max), ylim=c(0,1), axes=F, xlab="Days to Resolve",
-	ylab="Fraction of Offline Nodes Restored", col.hor='red', col.vert='red', pch='.',
+	ylab="b) Fraction of Offline Nodes Restored", col.hor='red', col.vert='red', pch='.',
 	col.points='red', main="")
+print ("checkpoint 5");
 plot(d_m1_notice_bug, xlim=c(0,x_lim_max), ylim=c(0,1), xlab="Days to Resolve",
 	col.hor='blue', col.vert='blue', pch='.',
 	col.points='blue', lty=2, add=TRUE)
+print ("checkpoint 6");
 plot(d_m1_kernel_bug, xlim=c(0,x_lim_max), ylim=c(0,1), xlab="Days to Resolve",
 	col.hor='darkgreen', col.vert='darkgreen', pch='.',
 	col.points='darkgreen', lty=3, add=TRUE)
+#plot(d_m1_pcu, xlim=c(0,x_lim_max), ylim=c(0,1), xlab="Days to Resolve",
+#	col.hor='purple', col.vert='purple', pch='.',
+#	col.points='purple', lty=4, add=TRUE)
+
 weeks <- c(0,7,14,21,28,60,90,120,150,180)
 axis(1, labels=weeks, at=weeks)
 percentages <- c(0,0.25, 0.5, 0.75, 0.85, 0.95, 1)
diff --git a/statistics/parse_nodestatus.py b/statistics/parse_nodestatus.py
new file mode 100755
index 0000000..d0072f3
--- /dev/null
+++ b/statistics/parse_nodestatus.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python
+
+import sys
+import os
+
+def recent(q, val, length):
+    # prepend the newest value; keep at most `length` entries
+    return [val] + q[:length-1]
+
+def add(d, path, v):
+    if path not in d:
+        d[path] = []
+    d[path] = recent(d[path], v, 6)
+    #d[path].append(v)
+
+def stateof(l):
+    # a unanimous window yields that state; a mixed window yields "BOOT"
+    if len(set(l)) == 1:
+        return l[0]
+    else:
+        return "BOOT"
+
+def in_hard_state(l):
+    if len(set(l)) == 1:
+        return True
+    else:
+        return False
+
+recent_node_ts = {}
+recent_node_states = {}
+last_node_state = {}
+last_node_ts = {}
+print "ts,host,status,pcustatus,model,length"
+
+while True:
+    line = sys.stdin.readline()
+    if not line:
+        break
+    line = line.strip()
+    # NOTE: assumes timestamps arrive in order.
+    try:
+        (ts, host, state, pcu, pcumodel) = line.split(',')
+    except:
+        # malformed record: scan ahead for a line beginning with ',' and
+        # take its remainder as the model field
+        print >>sys.stderr, "EXCEPTION:", line
+        while True:
+            line = sys.stdin.readline()
+            if not line:
+                break
+            line = line.strip()
+            if line.startswith(','):
+                pcumodel = line[1:]
+                break
+        #sys.exit(1)
+
+    ts = int(float(ts))
+    add(recent_node_states, host, state)
+    add(recent_node_ts, host, ts)
+
+    if host not in last_node_state:
+        last_node_state[host] = "UNKNOWN"
+        last_node_ts[host] = ts
+
+    # report a transition when the window settles on DOWN, or whenever the
+    # window is mixed and the reported state would change
+    if ( (in_hard_state(recent_node_states[host]) and stateof(recent_node_states[host]) == "DOWN") or \
+         not in_hard_state(recent_node_states[host]) ) and \
+         last_node_state[host] != stateof(recent_node_states[host]):
+
+        if stateof(recent_node_states[host]) == "DOWN":
+            ts = recent_node_ts[host][-1]  # use the earliest time in the down window
+        print "%s,%s,%s,%s,%s,%s" % (ts, host, stateof(recent_node_states[host]), pcu, pcumodel, ts - last_node_ts[host])
+        last_node_state[host] = stateof(recent_node_states[host])
+        last_node_ts[host] = ts
-- 
2.43.0
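Post-patch note: parse_nodestatus.py only trusts a state once the last six
observations for a host agree; mixed windows are reported as "BOOT". Below is
a self-contained sketch of that window rule, reusing the script's recent() and
stateof() helpers on made-up observations (the sample data is illustrative
only, not output from the scripts above).

#!/usr/bin/python
# Sketch: a node is in a "hard" state only when all recent observations agree.

def recent(q, val, length):
    # prepend the newest observation and keep at most `length` entries
    return [val] + q[:length-1]

def stateof(l):
    # unanimous window -> that state; mixed window -> "BOOT"
    if len(set(l)) == 1:
        return l[0]
    else:
        return "BOOT"

window = []
for obs in ["UP", "UP", "DOWN", "DOWN", "DOWN", "DOWN", "DOWN", "DOWN"]:
    window = recent(window, obs, 6)
    print obs, stateof(window)
# Prints "UP" for the first two samples, "BOOT" while UP and DOWN mix,
# and "DOWN" once six consecutive DOWN samples fill the window.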