import string
from www.printbadnodes import cmpCategoryVal
from config import config
-print "policy"
+#print "policy"
config = config()
DAT="./monitor.dat"
def print_stats(key, stats):
if key in stats: print "%20s : %d" % (key, stats[key])
+def get_ticket_id(record):
+    """Return the RT ticket id recorded in `record`, or None if there is none.
+
+    `record` is a dict-like node record.  The 'ticket_id' entry is preferred;
+    when it is missing, None or an empty string, 'found_rt_ticket' is tried
+    under the same conditions.
+    """
+    # Compare against "" by value, not identity: an empty string that is not
+    # the very same object as the literal (e.g. a unicode u"") must still
+    # count as "no ticket".
+    for key in ('ticket_id', 'found_rt_ticket'):
+        if key in record and record[key] is not None and record[key] != "":
+            return record[key]
+    return None
+
class Merge(Thread):
def __init__(self, l_merge, toRT):
self.toRT = toRT
if loginbase not in self.mergedb:
self.mergedb[loginbase] = {}
+ # take the info either from act_all or fb-record.
+ # if node not in act_all
+ # then take it from fbrecord, obviously.
+ # else node in act_all
+ # if act_all == 0 length (no previous records)
+ # then take it from fbrecord.
+ # else
+ # take it from act_all.
+ #
+
# We must compare findbad state with act_all state
if nodename not in self.act_all:
# 1) ok, b/c it's a new problem. set ticket_id to null
self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
else:
if len(self.act_all[nodename]) == 0:
- print "len(act_all[%s]) == 0, skipping %s %s" % (nodename, loginbase, nodename)
- continue
-
- y = self.act_all[nodename][0]
-
- ## skip if end-stage
- #if 'stage' in y and "monitor-end-record" in y['stage']:
- # # 1) ok, b/c it's a new problem. set ticket_id to null
- ## self.mergedb[loginbase][nodename] = {}
- # self.mergedb[loginbase][nodename].update(x)
- # self.mergedb[loginbase][nodename]['ticket_id'] = ""
- # self.mergedb[loginbase][nodename]['prev_category'] = None
- # continue
-
- ## for legacy actions
- #if 'bucket' in y and y['bucket'][0] == 'dbg':
- # # Only bootcd debugs made it to the act_all db.
- # y['prev_category'] = "OLDBOOTCD"
- #elif 'bucket' in y and y['bucket'][0] == 'down':
- # y['prev_category'] = "ERROR"
- #elif 'bucket' not in y:
- # # for all other actions, just carry over the
- # # previous category
- # y['prev_category'] = y['category']
- #else:
- # print "UNKNOWN state for record: %s" % y
- # sys.exit(1)
-
- # determine through translation, if the buckets match
- #if 'category' in y and x['category'] == y['category']:
- # b_match = True
- #elif x['category'] == "OLDBOOTCD" and y['bucket'][0] == 'dbg':
- # b_match = True
- #elif x['category'] == "ERROR" and y['bucket'][0] == 'down':
- # b_match = True
- #else:
- # b_match = False
-
- #if b_match:
- # # 2b) ok, b/c they agree that there's still a problem..
- # # 2b) Comon & Monitor still agree; RT ticket?
- #else:
- # # 2a) mismatch, need a policy for how to resolve
- # # resolution will be handled in __diagnoseNode()
- # # for now just record the two categories.
- # #if x['category'] == "PROD" and x['state'] == "BOOT" and \
- # # ( y['bucket'][0] == 'down' or y['bucket'][0] == 'dbg'):
- # print "FINDBAD and MONITOR have a mismatch: %s vs %s" % \
- # (x['category'], y['bucket'])
-
- y['prev_category'] = y['category']
- self.mergedb[loginbase][nodename] = {}
- self.mergedb[loginbase][nodename].update(y)
- self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
- self.mergedb[loginbase][nodename]['category'] = x['category']
- self.mergedb[loginbase][nodename]['state'] = x['state']
- self.mergedb[loginbase][nodename]['kernel']=x['kernel']
- self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
- self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
+ self.mergedb[loginbase][nodename] = {}
+ self.mergedb[loginbase][nodename].update(x)
+ self.mergedb[loginbase][nodename]['ticket_id'] = ""
+ self.mergedb[loginbase][nodename]['prev_category'] = "NORECORD"
+ else:
+ y = self.act_all[nodename][0]
+ y['prev_category'] = y['category']
+
+ self.mergedb[loginbase][nodename] = {}
+ self.mergedb[loginbase][nodename].update(y)
+ self.mergedb[loginbase][nodename]['comonstats'] = x['comonstats']
+ self.mergedb[loginbase][nodename]['category'] = x['category']
+ self.mergedb[loginbase][nodename]['state'] = x['state']
+ self.mergedb[loginbase][nodename]['kernel']=x['kernel']
+ self.mergedb[loginbase][nodename]['bootcd']=x['bootcd']
+ self.mergedb[loginbase][nodename]['plcnode']=x['plcnode']
+ ticket = get_ticket_id(self.mergedb[loginbase][nodename])
+ self.mergedb[loginbase][nodename]['rt'] = mailer.getTicketStatus(ticket)
+
# delete the entry from cache_all to keep it out of case 3)
del self.cache_all[nodename]
pass
- def __getDaysDown(self, diag_record, nodename):
+ def getDaysDown(cls, diag_record):
+ """Return a signed day count describing how long a node has been down.
+
+ Reads diag_record['comonstats']['uptime'] and, from
+ diag_record['plcnode'], 'last_contact' and 'date_created'.
+
+ Returns the negated uptime in days (zero or negative) when comon
+ reports an uptime; otherwise the days elapsed since last_contact, or
+ since date_created when last_contact is None; and -1 when both
+ timestamps are None.  Bound as a classmethod by the assignment that
+ follows the body.
+ """
daysdown = -1
- if diag_record['comonstats']['sshstatus'] != "null":
- daysdown = int(diag_record['comonstats']['sshstatus']) // (60*60*24)
- elif diag_record['comonstats']['lastcotop'] != "null":
- daysdown = int(diag_record['comonstats']['lastcotop']) // (60*60*24)
+ last_contact = diag_record['plcnode']['last_contact']
+ date_created = diag_record['plcnode']['date_created']
+
+ # "null" and "-1" are treated as "no uptime reported".
+ # uptime is presumably in seconds (it is divided by 60*60*24) -- TODO confirm.
+ if diag_record['comonstats']['uptime'] != "null" and diag_record['comonstats']['uptime'] != "-1":
+ # Unary minus binds tighter than //, so the negated uptime is floored:
+ # any uptime between 1 second and one day yields -1.
+ # NOTE(review): that is the same value as the "no data" default above;
+ # confirm the collision is intended.
+ daysdown = - int(float(diag_record['comonstats']['uptime'])) // (60*60*24)
+ elif last_contact is None:
+ if date_created is not None:
+ now = time.time()
+ diff = now - date_created
+ # time.time() is a float, so this floor division yields a float (e.g. 3.0).
+ daysdown = diff // (60*60*24)
+ else:
+ daysdown = -1
else:
now = time.time()
- last_contact = diag_record['plcnode']['last_contact']
- if last_contact == None:
- # the node has never been up, so give it a break
- daysdown = -1
- else:
- diff = now - last_contact
- daysdown = diff // (60*60*24)
+ diff = now - last_contact
+ # Float result here as well, unlike the int produced by the uptime branch.
+ daysdown = diff // (60*60*24)
return daysdown
+ getDaysDown = classmethod(getDaysDown)
+
+ def getStrDaysDown(cls, diag_record):
+     """Return a human-readable description of a node's up/down time.
+
+     Reads diag_record['comonstats']['uptime'] and, from
+     diag_record['plcnode'], 'last_contact' and 'date_created'.
+
+     Returns one of:
+       "<n> days up"                              comon reports an uptime
+       "Never contacted PLC, created <n> days ago"  no last_contact, but a
+                                                  date_created is known
+       "Never contacted PLC"                      neither timestamp is known
+       "<n> days down"                            days since last_contact
+     Bound as a classmethod by the assignment that follows the body.
+     """
+     seconds_per_day = 60*60*24
+     last_contact = diag_record['plcnode']['last_contact']
+     date_created = diag_record['plcnode']['date_created']
+     uptime = diag_record['comonstats']['uptime']
+
+     # "null" and "-1" are treated as "no uptime reported".
+     if uptime != "null" and uptime != "-1":
+         daysdown = "%d days up" % (int(float(uptime)) // seconds_per_day)
- def __getStrDaysDown(self, diag_record, nodename):
- daysdown = self.__getDaysDown(diag_record, nodename)
- if daysdown > 0:
- return "(%d days down)"%daysdown
+     elif last_contact is None:
+         if date_created is not None:
+             # time.time() is a float, so the floor division is a float too;
+             # truncate to int so the text reads "3 days", not "3.0 days".
+             days = int((time.time() - date_created) // seconds_per_day)
+             daysdown = "Never contacted PLC, created %d days ago" % days
+         else:
+             daysdown = "Never contacted PLC"
else:
- return "Unknown number of days"
+         # Same float-to-int truncation as above.
+         days = int((time.time() - last_contact) // seconds_per_day)
+         daysdown = "%d days down" % days
+     return daysdown
+ getStrDaysDown = classmethod(getStrDaysDown)
+ #def getStrDaysDown(cls, diag_record):
+ # daysdown = cls.getDaysDown(diag_record)
+ # if daysdown > -1:
+ # return "%d days down"%daysdown
+ # elif daysdown == -1:
+ # return "Has never contacted PLC"
+ # else:
+ # return "%d days up"% -daysdown
+ #getStrDaysDown = classmethod(getStrDaysDown)
def __getCDVersion(self, diag_record, nodename):
cdversion = ""
if "ERROR" in category: # i.e. "DOWN"
diag_record = {}
diag_record.update(node_record)
- daysdown = self.__getDaysDown(diag_record, nodename)
+ daysdown = self.getDaysDown(diag_record)
if daysdown < 7:
format = "DIAG: %20s : %-40s Down only %s days NOTHING DONE"
print format % (loginbase, nodename, daysdown)
return None
- s_daysdown = self.__getStrDaysDown(diag_record, nodename)
+ s_daysdown = self.getStrDaysDown(diag_record)
diag_record['message'] = emailTxt.mailtxt.newdown
diag_record['args'] = {'nodename': nodename}
diag_record['info'] = (nodename, s_daysdown, "")
diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
diag_record['email_pcu'] = True
- if diag_record['ticket_id'] == "":
- diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
- (loginbase, nodename, diag_record['info'][1:], diag_record['found_rt_ticket'])
+ if 'ticket_id' in diag_record:
+ if diag_record['ticket_id'] == "":
+ if 'found_rt_ticket' in diag_record:
+ ticket_id = diag_record['found_rt_ticket']
+ else:
+ ticket_id = "None"
+ else:
+ ticket_id = diag_record['ticket_id']
else:
- diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
- (loginbase, nodename, diag_record['info'][1:], diag_record['ticket_id'])
+ ticket_id = "None"
+
+ diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
+ (loginbase, nodename, diag_record['info'][1:], ticket_id)
elif "OLDBOOTCD" in category:
# V2 boot cds as determined by findbad
- s_daysdown = self.__getStrDaysDown(node_record, nodename)
+ s_daysdown = self.getStrDaysDown(node_record)
s_cdversion = self.__getCDVersion(node_record, nodename)
diag_record = {}
diag_record.update(node_record)
(loginbase, nodename, diag_record['stage'],
state, category, diag_record['ticket_id'])
return diag_record
- elif time_diff >= 6*SPERHOUR:
- # heartbeat is older than 30 min.
- # then reset NM.
- #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
- diag_record = {}
- diag_record.update(node_record)
- diag_record['message'] = emailTxt.mailtxt.NMReset
- diag_record['args'] = {'nodename': nodename}
- diag_record['stage'] = "nmreset"
- diag_record['info'] = (nodename,
- node_record['prev_category'],
- node_record['category'])
- if diag_record['ticket_id'] == "":
- diag_record['log'] = "NM : %20s : %-40s == %20s %20s %s %s" % \
- (loginbase, nodename, diag_record['stage'],
- state, category, diag_record['found_rt_ticket'])
- else:
- diag_record['log'] = "NM : %20s : %-40s == %20s" % \
- (loginbase, nodename, diag_record['stage'])
-
- return diag_record
+ #elif time_diff >= 6*SPERHOUR:
+ # # heartbeat is older than 6*SPERHOUR (six hours).
+ # # then reset NM.
+ # #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
+ # diag_record = {}
+ # diag_record.update(node_record)
+ # diag_record['message'] = emailTxt.mailtxt.NMReset
+ # diag_record['args'] = {'nodename': nodename}
+ # diag_record['stage'] = "nmreset"
+ # diag_record['info'] = (nodename,
+ # node_record['prev_category'],
+ # node_record['category'])
+ # if diag_record['ticket_id'] == "":
+ # diag_record['log'] = "NM : %20s : %-40s == %20s %20s %s %s" % \
+ # (loginbase, nodename, diag_record['stage'],
+ # state, category, diag_record['found_rt_ticket'])
+ # else:
+ # diag_record['log'] = "NM : %20s : %-40s == %20s" % \
+ # (loginbase, nodename, diag_record['stage'])
+#
+# return diag_record
else:
return None
else:
#values are equal, carry on.
#print "why are we here?"
pass
+
+ if 'rt' in node_record and 'Status' in node_record['rt']:
+ if node_record['stage'] == 'ticket_waitforever':
+ if 'resolved' in node_record['rt']['Status']:
+ print "ending waitforever record for: ", node_record['nodename']
+ node_record['action'] = ['noop']
+ node_record['message'] = None
+ node_record['stage'] = 'monitor-end-record'
+ print "oldlog: %s" % node_record['log'],
+ print "%15s" % node_record['action']
+ return node_record
+ if 'new' in node_record['rt']['Status'] and \
+ 'Queue' in node_record['rt'] and \
+ 'Monitor' in node_record['rt']['Queue']:
+
+ print "RESETTING stage to findbad"
+ node_record['stage'] = 'findbad'
#### COMPARE category and prev_category
# if not_equal
#### found_RT_ticket
# TODO: need to record time found, and maybe add a stage for acting on it...
+ # NOTE: after found, if the support ticket is resolved, the block is
+ # not removed. How to remove the block on this?
if 'found_rt_ticket' in diag_record and \
diag_record['found_rt_ticket'] is not None:
if diag_record['stage'] is not 'improvement':
elif 'improvement' in diag_record['stage']:
# - backoff previous squeeze actions (slice suspend, nocreate)
# TODO: add a backoff_squeeze section... Needs to runthrough
+ print "backing off of %s" % nodename
act_record['action'] = ['close_rt']
act_record['message'] = message[0]
act_record['stage'] = 'monitor-end-record'
act_record['first-found'] = True
act_record['log'] += " firstfound"
act_record['action'] = ['ticket_waitforever']
- act_record['message'] = None
+ act_record['message'] = message[0]
act_record['time'] = current_time
else:
if delta >= 7*SPERDAY:
act_record['action'] = ['ticket_waitforever']
- act_record['message'] = None
+ if 'rt' in act_record and 'Status' in act_record['rt'] and \
+ act_record['rt']['Status'] == 'new':
+ act_record['message'] = message[0]
+ else:
+ act_record['message'] = None
+
act_record['time'] = current_time # reset clock
else:
act_record['action'] = ['ticket_waitforever']
if site_stats == None:
raise Exception, "loginbase with no nodes in findbad"
else:
- return site_stats['num_nodes']
+ if 'num_nodes' in site_stats:
+ return site_stats['num_nodes']
+ else:
+ return 0
"""
Returns number of up nodes as the total number *NOT* in act_all with a
# update node record with RT ticket_id
if nodename in self.act_all:
self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
+ # if the ticket was previously resolved, reset it to new.
+ if 'rt' in act_record and \
+ 'Status' in act_record['rt'] and \
+ act_record['rt']['Status'] == 'resolved':
+ mailer.setTicketStatus(ticket_id, "new")
+ status = mailer.getTicketStatus(ticket_id)
+ self.act_all[nodename][0]['rt'] = status
if config.mail: i_nodes_emailed += 1
print "\t\tconfig.squeeze: %s and %s" % (config.squeeze,