+ """
+ If in debug, set the node to rins, reboot via PCU/POD
+ """
+ daysdown = self.comon.codata[node]['sshstatus'] // (60*60*24)
+ logger.info("POLICY: Node %s in dbg. down for %s" %(node,daysdown))
+ plc.nodeBootState(node, "rins")
+ # TODO: only reboot if BootCD > 3.0
+ # if bootcd[node] > 3.0:
+ # if NODE_KEY in planet.cnf:
+ # plc.nodeBootState(node, "rins")
+ # reboot.reboot(node)
+ # else:
+ # email to update planet.cnf file
+
+ # If it has a PCU
+ reboot.reboot(node)
+ # else:
+ # email upgrade bootcd message, and treat as down.
+ # Log it
+ self.actionlogdb[node] = ['rins', daysdown, time.time()]
+
+ def __emailSite(self, loginbase, roles, message, args):
+ """
+ loginbase is the unique site abbreviation, prepended to slice names.
+ roles contains TECH, PI, USER roles, and derive email aliases.
+ record contains {'message': [<subj>,<body>], 'args': {...}}
+ """
+ args.update({'loginbase':loginbase})
+ # build targets
+ contacts = []
+ if TECH & roles:
+ contacts += [TECHEMAIL % loginbase]
+ elif PI & roles:
+ contacts += [PIEMAIL % loginbase]
+ elif USER & roles:
+ slices = plc.slices(loginbase)
+ if len(slices) >= 1:
+ for slice in slices:
+ contacts += [SLICEMAIL % slice]
+ else:
+ print "Received no slices for site: %s" % loginbase
+
+ try:
+ subject = message[0] % args
+ body = message[1] % args
+ mailer.emailViaRT(subject, body, contacts)
+ except Exception, err:
+ print "exception on message:"
+ print message
+
+ return
+
+ def format_diaginfo(self, diag_node):
+ info = diag_node['info']
+ hlist = " %s %s %s\n" % (info[0], info[2], info[1]) # (node, version, daysdown)
+ return hlist
+
+ def __actOnSite(self, loginbase, rec_diaglist):
+ """
+ Run __actOnNode on every diagnosis record of one site, then perform at
+ most one node action and send at most one email for the whole site.
+
+ loginbase is the site abbreviation; rec_diaglist is the site's list of
+ diagnosis records.
+ Returns the tuple (i_nodes_actedon, i_nodes_emailed).
+ """
+ i_nodes_actedon = 0
+ i_nodes_emailed = 0
+ # start from the configured setting; cleared below when enough of the
+ # site's nodes are outside the action list
+ b_squeeze = config.squeeze
+
+ # collect the per-node decisions (action, message, email roles, ...)
+ action_argslist = []
+ for diag_node in rec_diaglist:
+ #print "calling actOnNode(%s)" % diag_node['nodename']
+ action_args = self.__actOnNode(diag_node)
+ action_argslist += [action_args]
+
+ #print "getSiteNodes(%s)" % loginbase
+ nodelist = plc.getSiteNodes(loginbase)
+ # nodes of the site that are not in the action list; fewer than 2 only
+ # prints a warning and leaves b_squeeze as configured
+ if len(nodelist) - len(action_argslist) < 2:
+ print "SITE: %20s : < 2 nodes !!" % loginbase
+ # TODO: check how long this has occurred.
+ # then plc.removeSliceCreation(nodename)
+ # There may be a similar act_1,act_2,wait db for sites?
+ else:
+ #print "SITE: goodNodesUp(%s) > 2 && %d bad" % \
+ # (loginbase, len(action_argslist))
+ b_squeeze = False
+
+ # create 'args' for email
+ #print "Create email args..."
+ email_args = {}
+ # 'hostname_list' is the concatenation of every node's formatted line;
+ # 'hostname' ends up naming the last node in action_argslist
+ email_args['hostname_list'] = ""
+ for action_args in action_argslist:
+ email_args['hostname_list'] += action_args['msg_format']
+ email_args['hostname'] = action_args['nodename']
+
+ # Send email, perform node action
+ # TODO: only send one email per site for a given problem...
+ if len(action_argslist) > 0:
+ # only the first entry's 'action', 'email' and 'message' are used
+ action_args = action_argslist[0]
+ #for action_args in action_argslist:
+ # TODO: perform the most severe action?
+ if b_squeeze:
+ # NOTE(review): the action key comes from the first entry, but the
+ # hostname passed to it is the last entry's nodename -- confirm
+ # this is intended for sites with several diagnosed nodes.
+ act_key = action_args['action']
+ self.actions[act_key](email_args['hostname'])
+ i_nodes_actedon += 1
+ #print "Send email..."
+ # a None message means there is nothing to send
+ if action_args['message'] != None:
+ self.__emailSite(loginbase, action_args['email'],
+ action_args['message'], email_args)
+ # the email is only counted when config.mail is set
+ if config.mail: i_nodes_emailed += 1
+
+ return (i_nodes_actedon, i_nodes_emailed)
+
+ def __actOnNode(self, diag_node):
+ """
+ Choose the action, message, email roles and next stage for one
+ diagnosed node, from its current 'stage' and the age of its record.
+
+ diag_node must provide 'nodename', 'message', 'info', 'time', 'stage'
+ and 'log'. Except in the 'waitforever' case it is updated in place
+ with the chosen values. The record is also pushed onto the front of
+ self.act_all[nodename].
+
+ Returns the args dict: 'nodename', 'msg_format', 'action' and 'message'
+ are always set; 'email' and 'stage' are set in every case but
+ 'waitforever'.
+ """
+ nodename = diag_node['nodename']
+ message = diag_node['message']
+ info = diag_node['info']
+ args = {}
+
+ # TODO: a node should only be in one category, right?
+ # - This is a constraint that should be enforced. It may be possible
+ # for a node to fall into the wrong set.
+ # - Also, it is necessary to remove a node from an action set, if it
+ # comes back up, or enters another state between checks.
+ # TODO: check that the reason a node ends up in a 'bad' state has or
+ # hasn't changed. If it's changed, then probably the process should
+ # start over, or at least be acknowledged. I'm not sure that this is
+ # the right place for this operation.
+
+ args['nodename'] = nodename
+ args['msg_format'] = self.format_diaginfo(diag_node)
+ current_time = time.time()
+
+ #k1 = self.act_1week.keys()
+ #k2 = self.act_2weeks.keys()
+ #k3 = self.act_waitforever.keys()
+ #print "lengths: %d %d %d" % (len(k1), len(k2), len(k3))
+
+ # age of the record; presumably seconds since the problem was first
+ # recorded -- TODO confirm what writes diag_node['time']
+ delta = current_time - diag_node['time']
+
+ # the stage names assigned below are 'stage_waitforever',
+ # 'stage_actintwoweeks' and 'stage_actinoneweek'; these tests use 'in',
+ # which matches them as substrings when 'stage' is a string
+ if 'waitforever' in diag_node['stage']:
+ # TODO: define what to do in the 'forever' state
+ # TODO: there should probably be a periodic email sent after this,
+ # to the site, or to us...
+ args['action'] = 'noop'
+ args['message'] = None
+
+ # SPERDAY is defined outside this block -- presumably seconds per day
+ elif 'actintwoweeks' in diag_node['stage'] or delta >= 14 * SPERDAY:
+ #nodename in self.act_2weeks:
+ args['email'] = TECH | PI | USER
+ args['action'] = 'suspendslices'
+ args['message'] = message[2]
+ args['stage'] = 'stage_waitforever'
+ # TODO: This will lose original 'time'
+ # NOTE(review): update() also replaces diag_node['message'] (the
+ # indexable collection read above) with the single selected entry.
+ diag_node.update(args)
+
+ elif 'actinoneweek' in diag_node['stage'] or delta >= 7 * SPERDAY:
+ # nodename in self.act_1week:
+ args['email'] = TECH | PI
+
+ args['action'] = 'nocreate'
+ # args['action'] = 'rins'
+ args['message'] = message[1]
+ args['stage'] = 'stage_actintwoweeks'
+ diag_node.update(args)
+
+ else:
+ # the node is bad, but there's no previous record of it.
+ args['email'] = TECH
+ args['action'] = 'noop'
+ args['message'] = message[0]
+ args['stage'] = 'stage_actinoneweek'
+ diag_node.update(args)
+
+ # the trailing comma keeps the chosen action on the same output line
+ print "%s" % diag_node['log'],
+ print "%15s" % args['action']
+
+ # newest record first
+ if nodename not in self.act_all: self.act_all[nodename] = []
+ self.act_all[nodename].insert(0,diag_node)
+
+ return args
+
+ def lappend_once(list, element):
+ if element not in list:
+ list.append(element)
+ def sappend_once(string, element, separator=','):
+ if element not in string:
+ return ("%s%c%s" % (string, separator, element),1)
+ else:
+ return (string,0)
+
+ def analyseSites(self):
+ """
+ Diagnose every site in self.sickdb, in sorted loginbase order, act on
+ the diagnosed nodes, and return a dict of summary counters plus the
+ list of all sites visited ('allsites').
+ """
+ i_sites = 0
+ i_sites_diagnosed = 0
+ i_nodes_diagnosed = 0
+ i_nodes_actedon = 0
+ i_sites_emailed = 0
+ l_allsites = []
+
+ # keys() returns a list here, which is sorted in place
+ sorted_sites = self.sickdb.keys()
+ sorted_sites.sort()
+ for loginbase in sorted_sites:
+ rec_nodedict = self.sickdb[loginbase]
+ #print "calling diagnoseSite(%s)" % loginbase
+ rec_diaglist = self.__diagnoseSite(loginbase, rec_nodedict)
+ l_allsites += [loginbase]
+
+
+ # a site counts as diagnosed when at least one node produced a record
+ if len(rec_diaglist) > 0:
+ i_nodes_diagnosed += len(rec_diaglist)
+ i_sites_diagnosed += 1
+
+ #print "calling actOnSite(%s)" % loginbase
+ # na, ne: the two counters returned by __actOnSite (actions
+ # performed, emails counted)
+ (na,ne) = self.__actOnSite(loginbase, rec_diaglist)
+
+ i_sites += 1
+ i_nodes_actedon += na
+ i_sites_emailed += ne
+
+ return {'sites': i_sites,
+ 'sites_diagnosed': i_sites_diagnosed,
+ 'nodes_diagnosed': i_nodes_diagnosed,
+ 'sites_emailed': i_sites_emailed,
+ 'nodes_actedon': i_nodes_actedon,
+ 'allsites':l_allsites}
+
+
+ def __diagnoseSite(self, loginbase, rec_nodedict):
+ """
+ rec_sitelist is a sickdb entry:
+ """
+ diag_list = []
+ sorted_nodes = rec_nodedict.keys()
+ sorted_nodes.sort()
+ for nodename in sorted_nodes:
+ rec_node = rec_nodedict[nodename]
+ diag_node = self.__diagnoseNode(loginbase, rec_node)
+ if diag_node != None:
+ diag_list += [ diag_node ]
+ return diag_list
+
+ def __getDaysDown(self, nodename):
+ daysdown = -1
+ if self.comon.codata[nodename]['sshstatus'] != "null":
+ daysdown = int(self.comon.codata[nodename]['sshstatus']) // (60*60*24)
+ return daysdown
+
+ def __getStrDaysDown(self, nodename):
+ daysdown = self.__getDaysDown(nodename)
+ if daysdown > 0:
+ return "(%d days down)"%daysdown
+ else:
+ return ""
+
+ def __getCDVersion(self, nodename):
+ cdversion = ""
+ if nodename in self.bootcds:
+ cdversion = self.bootcds[nodename]
+ return cdversion
+
+ def __diagnoseNode(self, loginbase, rec_node):
+ """
+ Build a diagnosis record for one node from its sickdb record, or
+ return None when no diagnosis is made.
+
+ loginbase is only used in the printed and logged lines.
+ rec_node must provide 'nodename', 'bucket' and 'stage' (and 'ticket_id'
+ when the stage is "stage_rt_working").
+ A returned record is a copy of rec_node extended with 'nodename',
+ 'message', 'args', 'info', 'bucket' and 'log'.
+ An unrecognised bucket terminates the process with exit status 1.
+ """
+ # TODO: change the format of the hostname in this
+ # record to something more natural.
+ nodename = rec_node['nodename']
+ buckets = rec_node['bucket']
+ diag_record = None
+
+ # xyz as determined by monitor
+ # down as determined by comon
+ if rec_node['stage'] == "stage_rt_working":
+ # err, this can be used as a counter of some kind..
+ # but otherwise, no diagnosis is necessary, return None, implies that
+ # it gets skipped.
+ print "DIAG: %20s : %-40s ticket %s" % \
+ (loginbase, nodename, rec_node['ticket_id'])
+
+ elif "down" in buckets:
+ diag_record = {}
+ diag_record.update(rec_node)
+ diag_record['nodename'] = nodename
+ diag_record['message'] = emailTxt.mailtxt.newdown
+ diag_record['args'] = {'nodename': nodename}
+ s_daysdown = self.__getStrDaysDown(nodename)
+ # info layout: (nodename, days-down string, boot CD version)
+ diag_record['info'] = (nodename, s_daysdown, "")
+ diag_record['bucket'] = ["down"]
+ # the trailing comma makes 'log' a one-element tuple, not a string
+ diag_record['log'] = "DOWN: %20s : %-40s == %20s" % \
+ (loginbase, nodename, diag_record['info']),
+
+ elif "dbg" in buckets:
+ # V2 boot cds as determined by monitor
+ s_daysdown = self.__getStrDaysDown(nodename)
+ s_cdversion = self.__getCDVersion(nodename)
+ diag_record = {}
+ diag_record.update(rec_node)
+ diag_record['nodename'] = nodename
+ diag_record['info'] = (nodename, s_daysdown, s_cdversion)
+
+ if nodename in self.bootcds and "v2" in self.bootcds[nodename]:
+ # as above, the trailing comma makes 'log' a one-element tuple
+ diag_record['log'] = "BTCD: %20s : %-40s == %20s" % \
+ (loginbase, nodename, self.bootcds[nodename]),
+ diag_record['message'] = emailTxt.mailtxt.newbootcd
+ diag_record['args'] = {'nodename': nodename}
+ # TODO: figure a better 'bucket' scheme, for merge()
+ #diag_record['bucket'] = ["monitor"]
+ else:
+ # debug-mode node without a "v2" boot CD entry: no diagnosis
+ print "DEBG: %20s : %-40s" % \
+ (loginbase, nodename)
+ return None
+
+ # NOTE(review): the statements below follow `return None`. If they
+ # belong to the same branch they are unreachable; otherwise they
+ # overwrite the 'message' chosen for v2 boot CDs -- confirm the
+ # intended nesting.
+ msg = ("dbg mode",
+ "Comon reports the node in debug mode, %s" % \
+ "but monitor does not know what to do yet.")
+ # TODO: replace with a real action
+ diag_record['message'] = [msg, msg, msg]
+ diag_record['bucket'] = ["dbg"]
+ diag_record['args'] = {'nodename': nodename}
+ # the following buckets are recognised but produce no diagnosis
+ elif "ssh" in buckets:
+ pass
+ elif "clock_drift" in buckets:
+ pass
+ elif "dns" in buckets:
+ pass
+ elif "filerw" in buckets:
+ pass
+ else:
+ print "Unknown buckets!!!! %s" % buckets
+ sys.exit(1)
+
+ return diag_record
+
+
+