+ d_diag_site = {loginbase : { 'config' :
+ {'squeeze': False,
+ 'email': False
+ },
+ 'nodes': {}
+ }
+ }
+ sorted_nodes = d_diag_nodes.keys()
+ sorted_nodes.sort()
+ for nodename in sorted_nodes:
+ node_record = d_diag_nodes[nodename]
+ diag_record = self.__diagnoseNode(loginbase, node_record)
+
+ if diag_record != None:
+ d_diag_site[loginbase]['nodes'][nodename] = diag_record
+
+ # NOTE: improvement means, we need to act/squeeze and email.
+ #print "DIAG_RECORD", diag_record
+ if 'monitor-end-record' in diag_record['stage'] or \
+ 'nmreset' in diag_record['stage']:
+ # print "resetting loginbase!"
+ d_diag_site[loginbase]['config']['squeeze'] = True
+ d_diag_site[loginbase]['config']['email'] = True
+ #else:
+ # print "NO IMPROVEMENT!!!!"
+ else:
+ pass # there is nothing to do for this node.
+
+ # NOTE: these settings can be overridden by command line arguments,
+ # or the state of a record, i.e. if already in RT's Support Queue.
+ nodes_up = self.getUpAtSite(loginbase, d_diag_site)
+ if nodes_up < MINUP:
+ d_diag_site[loginbase]['config']['squeeze'] = True
+
+ max_slices = self.getMaxSlices(loginbase)
+ num_nodes = self.getNumNodes(loginbase)
+ # NOTE: when max_slices == 0, this is either a new site (the old way)
+ # or an old disabled site from previous monitor (before site['enabled'])
+ if nodes_up < num_nodes and max_slices != 0:
+ d_diag_site[loginbase]['config']['email'] = True
+
+ if len(d_diag_site[loginbase]['nodes'].keys()) > 0:
+ print "SITE: %20s : %d nodes up, at most" % (loginbase, nodes_up)
+
+ return d_diag_site
+
+ def diagRecordByCategory(self, node_record):
+ nodename = node_record['nodename']
+ category = node_record['category']
+ state = node_record['state']
+ loginbase = self.plcdb_hn2lb[nodename]
+ diag_record = None
+
+ if "ERROR" in category: # i.e. "DOWN"
+ diag_record = {}
+ diag_record.update(node_record)
+ daysdown = self.__getDaysDown(diag_record, nodename)
+ if daysdown < 7:
+ format = "DIAG: %20s : %-40s Down only %s days NOTHING DONE"
+ print format % (loginbase, nodename, daysdown)
+ return None
+
+ s_daysdown = self.__getStrDaysDown(diag_record, nodename)
+ diag_record['message'] = emailTxt.mailtxt.newdown
+ diag_record['args'] = {'nodename': nodename}
+ diag_record['info'] = (nodename, s_daysdown, "")
+
+ if 'reboot_node_failed' in node_record:
+ # there was a previous attempt to use the PCU.
+ if node_record['reboot_node_failed'] == False:
+ # then the last attempt apparently succeeded.
+ # But, the category is still 'ERROR'. Therefore, the
+ # PCU-to-Node mapping is broken.
+ #print "Setting message for ERROR node to PCU2NodeMapping: %s" % nodename
+ diag_record['message'] = emailTxt.mailtxt.pcutonodemapping
+ diag_record['email_pcu'] = True
+
+ if diag_record['ticket_id'] == "":
+ diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
+ (loginbase, nodename, diag_record['info'][1:], diag_record['found_rt_ticket'])
+ else:
+ diag_record['log'] = "DOWN: %20s : %-40s == %20s %s" % \
+ (loginbase, nodename, diag_record['info'][1:], diag_record['ticket_id'])
+
+ elif "OLDBOOTCD" in category:
+ # V2 boot cds as determined by findbad
+ s_daysdown = self.__getStrDaysDown(node_record, nodename)
+ s_cdversion = self.__getCDVersion(node_record, nodename)
+ diag_record = {}
+ diag_record.update(node_record)
+ #if "2.4" in diag_record['kernel'] or "v2" in diag_record['bootcd']:
+ diag_record['message'] = emailTxt.mailtxt.newbootcd
+ diag_record['args'] = {'nodename': nodename}
+ diag_record['info'] = (nodename, s_daysdown, s_cdversion)
+ if diag_record['ticket_id'] == "":
+ diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
+ (loginbase, nodename, diag_record['kernel'],
+ diag_record['bootcd'], diag_record['found_rt_ticket'])
+ else:
+ diag_record['log'] = "BTCD: %20s : %-40s == %20s %20s %s" % \
+ (loginbase, nodename, diag_record['kernel'],
+ diag_record['bootcd'], diag_record['ticket_id'])
+
+ elif "PROD" in category:
+ if "DEBUG" in state:
+ # Not sure what to do with these yet. Probably need to
+ # reboot, and email.
+ print "DEBG: %20s : %-40s NOTHING DONE" % (loginbase, nodename)
+ return None
+ elif "BOOT" in state:
+ # no action needed.
+ # TODO: remove penalties, if any are applied.
+ now = time.time()
+ last_contact = node_record['plcnode']['last_contact']
+ if last_contact == None:
+ time_diff = 0
+ else:
+ time_diff = now - last_contact;
+
+ if 'improvement' in node_record['stage']:
+ # then we need to pass this on to 'action'
+ diag_record = {}
+ diag_record.update(node_record)
+ diag_record['message'] = emailTxt.mailtxt.newthankyou
+ diag_record['args'] = {'nodename': nodename}
+ diag_record['info'] = (nodename, node_record['prev_category'],
+ node_record['category'])
+ if diag_record['ticket_id'] == "":
+ diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+ (loginbase, nodename, diag_record['stage'],
+ state, category, diag_record['found_rt_ticket'])
+ else:
+ diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
+ (loginbase, nodename, diag_record['stage'],
+ state, category, diag_record['ticket_id'])
+ return diag_record
+ elif time_diff >= 6*SPERHOUR:
+ # last_contact is at least 6*SPERHOUR old (presumably six hours,
+ # not 30 min -- confirm SPERHOUR), so reset NM.
+ #print "Possible NM problem!! %s - %s = %s" % (now, last_contact, time_diff)
+ diag_record = {}
+ diag_record.update(node_record)
+ diag_record['message'] = emailTxt.mailtxt.NMReset
+ diag_record['args'] = {'nodename': nodename}
+ diag_record['stage'] = "nmreset"
+ diag_record['info'] = (nodename,
+ node_record['prev_category'],
+ node_record['category'])
+ if diag_record['ticket_id'] == "":
+ diag_record['log'] = "NM : %20s : %-40s == %20s %20s %s %s" % \
+ (loginbase, nodename, diag_record['stage'],
+ state, category, diag_record['found_rt_ticket'])
+ else:
+ diag_record['log'] = "NM : %20s : %-40s == %20s" % \
+ (loginbase, nodename, diag_record['stage'])
+
+ return diag_record
+ else:
+ return None
+ else:
+ # unknown
+ pass
+ elif "ALPHA" in category:
+ pass
+ elif "clock_drift" in category:
+ pass
+ elif "dns" in category:
+ pass
+ elif "filerw" in category:
+ pass