@classmethod
def getDaysDown(cls, diag_record):
    """Return how many whole days the node in ``diag_record`` has been down.

    ``diag_record`` must provide ``['plcnode']['last_contact']``,
    ``['plcnode']['date_created']`` and ``['comonstats']['uptime']``.
    The two plcnode values are subtracted from ``time.time()``, so they
    are expected to be epoch seconds (or None).

    The result is an int chosen as follows:

    * uptime is neither ``"null"`` nor ``"-1"``: the node is up; return
      the negated number of whole days of uptime (zero or negative).
    * otherwise, ``last_contact`` is set: whole days since last contact.
    * otherwise, ``date_created`` is set: whole days since creation.
    * otherwise: -1.

    NOTE: between one and two days of uptime also yields -1, which is
    indistinguishable from the "nothing known" result.
    """
    seconds_per_day = 60 * 60 * 24
    last_contact = diag_record['plcnode']['last_contact']
    date_created = diag_record['plcnode']['date_created']
    uptime = diag_record['comonstats']['uptime']

    if uptime != "null" and uptime != "-1":
        # Negate *after* the floor division.  Unary minus binds tighter
        # than //, so "-x // d" would round away from zero and disagree
        # with the day count reported by getStrDaysDown.
        return -(int(float(uptime)) // seconds_per_day)

    if last_contact is not None:
        reference = last_contact
    elif date_created is not None:
        reference = date_created
    else:
        return -1

    # int() keeps the return type uniform across all branches.
    return int((time.time() - reference) // seconds_per_day)
+
@classmethod
def getStrDaysDown(cls, diag_record):
    """Return a human-readable up/down summary for ``diag_record``.

    ``diag_record`` must provide ``['plcnode']['last_contact']``,
    ``['plcnode']['date_created']`` and ``['comonstats']['uptime']``.
    The two plcnode values are subtracted from ``time.time()``, so they
    are expected to be epoch seconds (or None).

    Possible results:

    * ``"<n> days up"`` when uptime is neither ``"null"`` nor ``"-1"``.
    * ``"<n> days down"`` when ``last_contact`` is set.
    * ``"Never contacted PLC, created <n> days ago"`` when only
      ``date_created`` is set.
    * ``"Never contacted PLC"`` when neither timestamp is set.

    ``<n>`` is always a whole number of days.
    """
    seconds_per_day = 60 * 60 * 24
    last_contact = diag_record['plcnode']['last_contact']
    date_created = diag_record['plcnode']['date_created']
    uptime = diag_record['comonstats']['uptime']

    if uptime != "null" and uptime != "-1":
        return "%d days up" % (int(float(uptime)) // seconds_per_day)

    if last_contact is not None:
        # int() so a float timestamp difference renders as "3", not "3.0".
        days = int((time.time() - last_contact) // seconds_per_day)
        return "%s days down" % days

    if date_created is not None:
        days = int((time.time() - date_created) // seconds_per_day)
        return "Never contacted PLC, created %s days ago" % days

    return "Never contacted PLC"
+ #def getStrDaysDown(cls, diag_record):
+ # daysdown = cls.getDaysDown(diag_record)
+ # if daysdown > -1:
+ # return "%d days down"%daysdown
+ # elif daysdown == -1:
+ # return "Has never contacted PLC"
+ # else:
+ # return "%d days up"% -daysdown
+ #getStrDaysDown = classmethod(getStrDaysDown)
+
def __getCDVersion(self, diag_record, nodename):
    """Return the value stored under ``'kernel'`` in ``diag_record``.

    ``nodename`` is accepted for the caller's convenience but is not
    used.  A missing ``'kernel'`` key raises KeyError.
    """
    return diag_record['kernel']
+
def __diagnoseSite(self, loginbase, d_diag_nodes):
    """Diagnose every node of one site and derive the site-level config.

    d_diag_nodes are diagnose_in entries, keyed by node name.  Nodes are
    processed in sorted name order; nodes for which __diagnoseNode
    returns None are left out of the result.

    Returns a dict of the form::

        {loginbase: {'config': {'squeeze': bool, 'email': bool},
                     'nodes': {nodename: diag_record, ...}}}
    """
    site_config = {'squeeze': False, 'email': False}
    site_nodes = {}
    d_diag_site = {loginbase: {'config': site_config, 'nodes': site_nodes}}

    for nodename in sorted(d_diag_nodes):
        diag_record = self.__diagnoseNode(loginbase, d_diag_nodes[nodename])
        if diag_record is None:
            # There is nothing to do for this node.
            continue

        site_nodes[nodename] = diag_record

        # NOTE: improvement means, we need to act/squeeze and email.
        stage = diag_record['stage']
        if 'monitor-end-record' in stage or 'nmreset' in stage:
            site_config['squeeze'] = True
            site_config['email'] = True

    # NOTE: these settings can be overridden by command line arguments,
    #       or the state of a record, i.e. if already in RT's Support Queue.
    nodes_up = self.getUpAtSite(loginbase, d_diag_site)
    if nodes_up < MINUP:
        site_config['squeeze'] = True

    max_slices = self.getMaxSlices(loginbase)
    num_nodes = self.getNumNodes(loginbase)
    # NOTE: when max_slices == 0, this is either a new site (the old way)
    #       or an old disabled site from previous monitor (before
    #       site['enabled'])
    if nodes_up < num_nodes and max_slices != 0:
        site_config['email'] = True

    if site_nodes:
        # Single-argument print(...) behaves the same as a statement
        # (Python 2) and as a function call (Python 3).
        print("SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up))

    return d_diag_site
+
def diagRecordByCategory(self, node_record):
    """Build a diagnosis record for ``node_record`` from its category/state.

    A returned record is a shallow copy of ``node_record`` extended with
    ``'message'``, ``'args'``, ``'info'`` and ``'log'`` entries.

    * category contains "ERROR": nodes down fewer than 7 days (per
      getDaysDown) are reported and skipped (None); otherwise a
      "new down" record is built.  If a previous PCU reboot is recorded
      as not failed, the PCU-to-node-mapping message is used instead and
      ``'email_pcu'`` is set.
    * category contains "OLDBOOTCD": a "new boot CD" record is built.
    * category contains "PROD": only state "BOOT" with ``'improvement'``
      in the record's stage yields a record (a thank-you message);
      every other PROD case returns None.
    * category contains "ALPHA", "clock_drift", "dns" or "filerw":
      returns None.
    * anything else: prints a message and terminates the process via
      ``sys.exit(1)``.

    Returns the diagnosis record, or None when there is nothing to do.
    """

    def ticket_label(record):
        # One rule for every log line: prefer the record's own ticket id,
        # fall back to a ticket found in RT, else the text "None".  Keys
        # that are absent never raise.
        if 'ticket_id' not in record:
            return "None"
        if record['ticket_id'] == "":
            return record.get('found_rt_ticket', "None")
        return record['ticket_id']

    nodename = node_record['nodename']
    category = node_record['category']
    state = node_record['state']
    loginbase = self.plcdb_hn2lb[nodename]
    diag_record = None

    if "ERROR" in category:  # i.e. "DOWN"
        diag_record = dict(node_record)
        daysdown = self.getDaysDown(diag_record)
        if daysdown < 7:
            print("DIAG: %20s : %-40s Down only %s days NOTHING DONE"
                  % (loginbase, nodename, daysdown))
            return None

        s_daysdown = self.getStrDaysDown(diag_record)
        diag_record['message'] = emailTxt.mailtxt.newdown
        diag_record['args'] = {'nodename': nodename}
        diag_record['info'] = (nodename, s_daysdown, "")

        # A recorded PCU reboot attempt that did not fail, on a node that
        # is still in an ERROR category, means the PCU-to-node mapping is
        # broken.  "== False" is kept on purpose: it also matches 0.
        if ('reboot_node_failed' in node_record
                and node_record['reboot_node_failed'] == False):
            diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
            diag_record['email_pcu'] = True

        # info[1:] is a tuple; it is logged in its tuple form.
        diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
            (loginbase, nodename, diag_record['info'][1:],
             ticket_label(diag_record))

    elif "OLDBOOTCD" in category:
        # V2 boot cds as determined by findbad
        s_daysdown = self.getStrDaysDown(node_record)
        s_cdversion = self.__getCDVersion(node_record, nodename)
        diag_record = dict(node_record)
        diag_record['message'] = emailTxt.mailtxt.newbootcd
        diag_record['args'] = {'nodename': nodename}
        diag_record['info'] = (nodename, s_daysdown, s_cdversion)
        diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
            (loginbase, nodename, diag_record['kernel'],
             diag_record['bootcd'], ticket_label(diag_record))

    elif "PROD" in category:
        if "DEBUG" in state:
            # Not sure what to do with these yet.  Probably need to
            # reboot, and email.
            print("DEBG: %20s : %-40s NOTHING DONE" % (loginbase, nodename))
            return None

        if "BOOT" in state:
            # TODO: remove penalties, if any are applied.
            # NOTE: a disabled NodeManager-reset path for stale
            #       heartbeats used to sit here; it produced an
            #       'nmreset' stage record.
            if 'improvement' not in node_record['stage']:
                # no action needed.
                return None

            # The node improved; pass this on to 'action'.
            diag_record = dict(node_record)
            diag_record['message'] = emailTxt.mailtxt.newthankyou
            diag_record['args'] = {'nodename': nodename}
            diag_record['info'] = (nodename, node_record['prev_category'],
                                   node_record['category'])
            if diag_record.get('email_pcu'):
                # Previously the PCU failed to reboot, so email was
                # sent.  Reset these values to try the reboot again.
                diag_record['email_pcu'] = False
                diag_record.pop('reboot_node_failed', None)

            diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
                (loginbase, nodename, diag_record['stage'],
                 state, category, ticket_label(diag_record))
            return diag_record

        # Any other PROD state is unknown: fall through and return None.

    elif ("ALPHA" in category or "clock_drift" in category
          or "dns" in category or "filerw" in category):
        # Recognised categories that currently need no diagnosis.
        pass

    else:
        print("Unknown category!!!! %s" % category)
        sys.exit(1)

    return diag_record
+
def __diagnoseNode(self, loginbase, node_record):
    """Decide the next action for one node and return its action record.

    Steps, in order:

    1. Unless ``prev_category`` is "NORECORD", compare the current and
       previous category with cmpCategoryVal.  On improvement (result
       1) a record without a ticket is closed immediately and returned;
       a record with a ticket moves to stage 'improvement'.
    2. For records in stage 'ticket_waitforever' that carry an RT
       status: a resolved ticket ends the record (returned
       immediately); a new ticket in a Monitor queue resets the stage
       to 'findbad'.
    3. diagRecordByCategory builds the diagnosis record; None ends the
       diagnosis.
    4. A found RT ticket forces stage 'ticket_waitforever' unless the
       stage is 'improvement'.
    5. The stage (matched by substring, in the order written below)
       and the time elapsed since the record's ``'time'`` select the
       email recipients, action, message and next stage.

    ``node_record`` is modified in place by steps 1 and 2.  Returns the
    action record (a shallow copy of the diagnosis record with updated
    fields), ``node_record`` itself on the two early exits, or None
    when no action is due.

    TODO: change the format of the hostname in this record to something
    more natural.
    """
    nodename = node_record['nodename']
    category = node_record['category']
    prev_category = node_record['prev_category']

    if prev_category != "NORECORD":
        val = cmpCategoryVal(category, prev_category)
        print("%s went from %s -> %s" % (nodename, prev_category, category))
        if val == 1:
            # improved
            if node_record['ticket_id'] in ("", None):
                print("closing record with no ticket:  %s" % nodename)
                node_record['action'] = ['close_rt']
                node_record['message'] = None
                node_record['stage'] = 'monitor-end-record'
                return node_record
            node_record['stage'] = 'improvement'
        # val == -1: current category is worse than previous, carry on.
        # otherwise: values are equal, carry on.

    # NOTE(review): the source this was taken from had lost its
    # indentation.  This RT check is assumed to run for every record
    # (not only those with a previous category), and the 'new' ticket
    # reset is assumed to apply only inside 'ticket_waitforever'.
    # Confirm against the original file.
    if 'rt' in node_record and 'Status' in node_record['rt']:
        if node_record['stage'] == 'ticket_waitforever':
            rt = node_record['rt']
            if 'resolved' in rt['Status']:
                print("ending waitforever record for:  %s" % nodename)
                node_record['action'] = ['noop']
                node_record['message'] = None
                node_record['stage'] = 'monitor-end-record'
                print("oldlog: %s %15s"
                      % (node_record['log'], node_record['action']))
                return node_record
            if ('new' in rt['Status'] and 'Queue' in rt
                    and 'Monitor' in rt['Queue']):
                print("RESETTING stage to findbad")
                node_record['stage'] = 'findbad'

    diag_record = self.diagRecordByCategory(node_record)
    if diag_record is None:
        return None

    #### found_RT_ticket
    # TODO: need to record time found, and maybe add a stage for acting
    #       on it...
    # NOTE: after found, if the support ticket is resolved, the block is
    #       not removed.  How to remove the block on this?
    if diag_record.get('found_rt_ticket') is not None:
        # Compare by value: identity against a string literal only
        # works when the interpreter happens to intern both strings.
        if diag_record['stage'] != 'improvement':
            diag_record['stage'] = 'ticket_waitforever'

    current_time = time.time()
    # TODO: generalize delays at PLC, and prevent enforcement when there
    #       have been no emails.
    delta = current_time - diag_record['time']

    message = diag_record['message']
    act_record = dict(diag_record)
    stage = diag_record['stage']

    #### DIAGNOSE STAGES
    # Stages are matched by substring, so the order of these tests
    # matters ('ticket_waitforever' must precede 'waitforever').
    if 'findbad' in stage or 'reboot_node' in stage:
        # 'findbad': the node is bad, and there's no previous record of
        # it.  'reboot_node' is handled the same way.  Neither name
        # contains 'nmreset', so merging the two tests keeps the order.
        act_record['email'] = TECH
        act_record['action'] = ['noop']
        act_record['message'] = message[0]
        act_record['stage'] = 'stage_actinoneweek'

    elif 'nmreset' in stage:
        # NodeManager resets are not acted upon.
        return None

    elif 'improvement' in stage:
        # - backoff previous squeeze actions (slice suspend, nocreate)
        # TODO: add a backoff_squeeze section... Needs to runthrough
        print("backing off of %s" % nodename)
        act_record['action'] = ['close_rt']
        act_record['message'] = message[0]
        act_record['stage'] = 'monitor-end-record'

    elif 'actinoneweek' in stage:
        if delta >= 7 * SPERDAY:
            act_record['email'] = TECH | PI
            act_record['stage'] = 'stage_actintwoweeks'
            act_record['message'] = message[1]
            act_record['action'] = ['nocreate']
            act_record['time'] = current_time  # reset clock for waitforever
        elif delta >= 3 * SPERDAY and 'second-mail-at-oneweek' not in act_record:
            act_record['email'] = TECH
            act_record['message'] = message[0]
            act_record['action'] = ['sendmailagain-waitforoneweekaction']
            act_record['second-mail-at-oneweek'] = True
        else:
            print("ignoring this record for: %s" % act_record['nodename'])
            return None  # don't send if there's no action

    elif 'actintwoweeks' in stage:
        if delta >= 7 * SPERDAY:
            act_record['email'] = TECH | PI | USER
            act_record['stage'] = 'stage_waitforever'
            act_record['message'] = message[2]
            act_record['action'] = ['suspendslices']
            act_record['time'] = current_time  # reset clock for waitforever
        elif delta >= 3 * SPERDAY and 'second-mail-at-twoweeks' not in act_record:
            act_record['email'] = TECH | PI
            act_record['message'] = message[1]
            act_record['action'] = ['sendmailagain-waitfortwoweeksaction']
            act_record['second-mail-at-twoweeks'] = True
        else:
            return None  # don't send if there's no action

    elif 'ticket_waitforever' in stage:
        act_record['email'] = TECH
        if 'first-found' not in act_record:
            act_record['first-found'] = True
            act_record['log'] += " firstfound"
            act_record['action'] = ['ticket_waitforever']
            act_record['message'] = message[0]
            act_record['time'] = current_time
        elif delta >= 7 * SPERDAY:
            act_record['action'] = ['ticket_waitforever']
            # Only re-send while the RT ticket is still 'new'.
            if ('rt' in act_record and 'Status' in act_record['rt']
                    and act_record['rt']['Status'] == 'new'):
                act_record['message'] = message[0]
            else:
                act_record['message'] = None
            act_record['time'] = current_time  # reset clock
        else:
            return None

    elif 'waitforever' in stage:
        # more than 3 days since last action
        # TODO: send only on weekdays.
        # NOTE: expects that 'time' has been reset before entering the
        #       waitforever stage
        if delta >= 3 * SPERDAY:
            act_record['action'] = ['email-againwaitforever']
            act_record['message'] = message[2]
            act_record['time'] = current_time  # reset clock
        else:
            return None  # don't send if there's no action

    else:
        # There is no action to be taken, possibly b/c the stage has
        # already been performed, but diagnose picked it up again.
        # two cases,
        #   1. stage is unknown, or
        #   2. delta is not big enough to bump it to the next stage.
        # TODO: figure out which.  for now assume 2, and restart the
        #       record at the one-week stage.
        print("UNKNOWN stage for %s; nothing done" % nodename)
        act_record['email'] = TECH
        act_record['action'] = ['noop']
        act_record['message'] = message[0]
        act_record['stage'] = 'stage_actinoneweek'
        act_record['time'] = current_time  # reset clock

    print("%s %15s" % (act_record['log'], act_record['action']))
    return act_record
+
+ def getMaxSlices(self, loginbase):
+ # if sickdb has a loginbase, then it will have at least one node.
+ site_stats = None