+
def mergePreviousActions(self):
    """
    Reconcile the sick node_records reported by comon (self.sickdb /
    self.cache_all) with the action records already in progress
    (self.act_all).  There are four cases:
        1) problem in comon but not in act_all
           this is ok, b/c it just means it's a new problem
        2) problem in comon and in act_all
           we need to figure out the mis-match.  Did the problem get better
           or worse?  Reset the stage clock to 'initial' if it's better,
           continue if it's gotten worse.  Hard to make this judgement here,
           though -- for now the act_all record always wins.
        3) no problem in comon, problem in act_all
           this may mean that the node is operational again, or that monitor
           knows how to define a problem that comon does not.  For now, if
           comon does not report a problem, monitor obeys.  Ultimately,
           however, we want to catch problems that comon can't see.
        4) no problem in comon, no problem in act_all
           there won't be a record in either db, so there's no code.

    TODO: this is where back-offs will be acknowledged.  If the nodes get
    better, it should be possible to 're-enable' the site, or slice, etc.
    """
    # Python 2: keys() returns a snapshot list, so sorting in place and
    # deleting entries during the walk below is safe.
    sorted_sites = self.sickdb.keys()
    sorted_sites.sort()
    # look at all problems reported by comon
    for loginbase in sorted_sites:
        rec_nodedict = self.sickdb[loginbase]
        sorted_nodes = rec_nodedict.keys()
        sorted_nodes.sort()
        #for rec_node in rec_nodelist:
        for nodename in sorted_nodes:
            rec_node = rec_nodedict[nodename]
            hn = nodename
            x = self.sickdb[loginbase][hn]     # fresh comon record
            if hn in self.act_all:
                # act_all keeps newest record first (see __actOnNode).
                y = self.act_all[hn][0]
                if x['bucket'][0] != y['bucket'][0]:
                    # 2a) mismatch, need a policy for how to resolve
                    print "COMON and MONITOR have a mismatch: %s vs %s" % \
                        (x['bucket'], y['bucket'])
                else:
                    # 2b) ok, b/c they agree that there's still a problem..
                    pass

                # for now, overwrite the comon entry for the one in act_all
                self.sickdb[loginbase][hn] = y
                # delete the entry from cache_all to keep it out of case 3)
                del self.cache_all[hn]
            else:
                # 1) ok, b/c it's a new problem.
                pass

    # 3) nodes that remain in cache_all were not identified by comon as
    # down.  Do we keep them or not?
    # NOTE(review): assumes every cache_all key also exists in act_all --
    # a KeyError here would mean the two dbs drifted apart; confirm.
    for hn in self.cache_all.keys():
        y = self.act_all[hn][0]
        if 'monitor' in y['bucket']:
            # monitor-defined problem: keep tracking it even though comon
            # no longer reports it.
            loginbase = self.plcdb_hn2lb[hn]
            if loginbase not in self.sickdb:
                self.sickdb[loginbase] = {}
            self.sickdb[loginbase][hn] = y
        else:
            # comon-defined problem that comon no longer sees: drop it.
            del self.cache_all[hn]

    print "len of cache_all: %d" % len(self.cache_all.keys())

    return
+
def accumSickSites(self):
    """
    Take all sick nodes, find their sites, and put in
    sickdb[loginbase][nodename] = diag_node

    Drains the self.sickNoTicket queue; the producer signals end-of-stream
    by enqueueing the literal string "None".
    """
    while 1:
        # Block until the next diagnosed node (or the sentinel) arrives.
        diag_node = self.sickNoTicket.get(block = True)
        if diag_node == "None":
            break

        #for bucket in self.comon.comon_buckets.keys():
        #	if (hostname in getattr(self.comon, bucket)):
        #		buckets_per_node.append(bucket)

        #########################################################
        # TODO: this will break with more than one comon bucket!!
        nodename = diag_node['nodename']
        # Map hostname to its site's loginbase via the plc cache.
        loginbase = self.plcdb_hn2lb[nodename] # plc.siteId(node)

        if loginbase not in self.sickdb:
            self.sickdb[loginbase] = {}
            #self.sickdb[loginbase][nodename] = []
        #else:
            #if nodename not in self.sickdb[loginbase]:
            #	self.sickdb[loginbase][nodename] = []

        #self.sickdb[loginbase][nodename].append(diag_node)
        # One record per node; a later report for the same node overwrites.
        self.sickdb[loginbase][nodename] = diag_node
        # TODO: this will break with more than one comon bucket!!
        #########################################################
+
+
def __actOnDebug(self, node):
    """
    If in debug, set the node to rins (reinstall) via PLC, reboot via
    PCU/POD, and log the action in self.actionlogdb.
    """
    # sshstatus is seconds since last contact -- presumably; convert to
    # whole days (TODO confirm units against comon).
    daysdown = self.comon.codata[node]['sshstatus'] // (60*60*24)
    logger.info("POLICY: Node %s in dbg. down for %s" %(node,daysdown))
    plc.nodeBootState(node, "rins")
    # TODO: only reboot if BootCD > 3.0
    # if bootcd[node] > 3.0:
    #	if NODE_KEY in planet.cnf:
    #		plc.nodeBootState(node, "rins")
    #		reboot.reboot(node)
    #	else:
    #		email to update planet.cnf file
    #
    # If it has a PCU
    reboot.reboot(node)
    # else:
    #	email upgrade bootcd message, and treat as down.
    # Log it
    self.actionlogdb[node] = ['rins', daysdown, time.time()]
+
+ def __emailSite(self, loginbase, roles, message, args):
+ """
+ loginbase is the unique site abbreviation, prepended to slice names.
+ roles contains TECH, PI, USER roles, and derive email aliases.
+ record contains {'message': [<subj>,<body>], 'args': {...}}
+ """
+ args.update({'loginbase':loginbase})
+ # build targets
+ contacts = []
+ if TECH & roles:
+ contacts += [TECHEMAIL % loginbase]
+ elif PI & roles:
+ contacts += [PIEMAIL % loginbase]
+ elif USER & roles:
+ slices = plc.slices(loginbase)
+ if len(slices) >= 1:
+ for slice in slices:
+ contacts += [SLICEMAIL % slice]
+ else:
+ print "Received no slices for site: %s" % loginbase
+
+ try:
+ subject = message[0] % args
+ body = message[1] % args
+ mailer.emailViaRT(subject, body, contacts)
+ except Exception, err:
+ print "exception on message:"
+ print message
+
+ return
+
def format_diaginfo(self, diag_node):
    """
    Render one indented summary line for a diagnosed node.

    diag_node['info'] holds (node, daysdown, version); the output line
    shows them in (node, version, daysdown) order, newline-terminated.
    """
    fields = diag_node['info']
    line = " %s %s %s\n" % (fields[0], fields[2], fields[1])
    return line
+
def __actOnSite(self, loginbase, rec_diaglist):
    """
    Apply policy to one site's list of diagnosed nodes.

    Each diag_node is advanced through the escalation ladder via
    __actOnNode(); then at most one action and one email are performed
    for the whole site.  Squeezing (the punitive action) is skipped when
    the site still has at least two nodes outside the diagnosed set.

    Returns (i_nodes_actedon, i_nodes_emailed).
    """
    i_nodes_actedon = 0
    i_nodes_emailed = 0
    b_squeeze = config.squeeze

    # Stage every diagnosed node forward and collect its action record.
    action_argslist = []
    for diag_node in rec_diaglist:
        #print "calling actOnNode(%s)" % diag_node['nodename']
        action_args = self.__actOnNode(diag_node)
        action_argslist += [action_args]

    #print "getSiteNodes(%s)" % loginbase
    nodelist = plc.getSiteNodes(loginbase)
    if len(nodelist) - len(action_argslist) < 2:
        print "SITE: %20s : < 2 nodes !!" % loginbase
        # TODO: check how long this has occurred.
        # then plc.removeSliceCreation(nodename)
        # There may be a similar act_1,act_2,wait db for sites?
    else:
        #print "SITE: goodNodesUp(%s) > 2 && %d bad" % \
        #	(loginbase, len(action_argslist))
        # Site has enough healthy nodes; don't squeeze it.
        b_squeeze = False

    # create 'args' for email
    #print "Create email args..."
    email_args = {}
    email_args['hostname_list'] = ""
    for action_args in action_argslist:
        email_args['hostname_list'] += action_args['msg_format']
        # NOTE(review): overwritten each iteration, so 'hostname' ends up
        # being the LAST diagnosed node, while the action taken below comes
        # from the FIRST record -- confirm this mismatch is intentional.
        email_args['hostname'] = action_args['nodename']

    # Send email, perform node action
    # TODO: only send one email per site for a given problem...
    if len(action_argslist) > 0:
        action_args = action_argslist[0]
        #for action_args in action_argslist:
        # TODO: perform the most severe action?
        if b_squeeze:
            act_key = action_args['action']
            self.actions[act_key](email_args['hostname'])
            i_nodes_actedon += 1
        #print "Send email..."
        if action_args['message'] != None:
            self.__emailSite(loginbase, action_args['email'],
                             action_args['message'], email_args)
            if config.mail: i_nodes_emailed += 1

    return (i_nodes_actedon, i_nodes_emailed)
+
def __actOnNode(self, diag_node):
    """
    Advance one diagnosed node through the escalation ladder and record
    the chosen action.

    Escalation (driven by diag_node['stage'], or by time elapsed since
    diag_node['time'] when no stage matches):
        first sighting              -> email TECH only, action 'noop',
                                       next stage 'stage_actinoneweek'
        actinoneweek  / >= 7 days   -> email TECH+PI, action 'nocreate',
                                       next stage 'stage_actintwoweeks'
        actintwoweeks / >= 14 days  -> email TECH+PI+USER, action
                                       'suspendslices', next stage
                                       'stage_waitforever'
        waitforever                 -> action 'noop', no email

    Returns the args dict (action, email roles, message, msg_format, ...)
    consumed by __actOnSite, and prepends the updated record to
    self.act_all[nodename].
    """
    nodename = diag_node['nodename']
    message = diag_node['message']    # (initial, one-week, two-week) texts
    info = diag_node['info']
    args = {}

    # TODO: a node should only be in one category, right?
    # - This is a constraint that should be enforced.  It may be possible
    #   for a node to fall into the wrong set.
    # - Also, it is necessary to remove a node from an action set, if it
    #   comes back up, or enters another state between checks.
    # TODO: check that the reason a node ends up in a 'bad' state has or
    #   hasn't changed.  If it's changed, then probably the process should
    #   start over, or at least be acknowledged.  I'm not sure that this is
    #   the right place for this operation.

    args['nodename'] = nodename
    args['msg_format'] = self.format_diaginfo(diag_node)
    current_time = time.time()

    #k1 = self.act_1week.keys()
    #k2 = self.act_2weeks.keys()
    #k3 = self.act_waitforever.keys()
    #print "lengths: %d %d %d" % (len(k1), len(k2), len(k3))

    # Seconds since the record's reference time.
    delta = current_time - diag_node['time']

    if 'waitforever' in diag_node['stage']:
        # TODO: define what to do in the 'forever' state
        # TODO: there should probably be a periodic email sent after this,
        # to the site, or to us...
        args['action'] = 'noop'
        args['message'] = None

    elif 'actintwoweeks' in diag_node['stage'] or delta >= 14 * SPERDAY:
        #nodename in self.act_2weeks:
        args['email'] = TECH | PI | USER
        args['action'] = 'suspendslices'
        args['message'] = message[2]
        args['stage'] = 'stage_waitforever'
        # TODO: This will lose original 'time'
        diag_node.update(args)

    elif 'actinoneweek' in diag_node['stage'] or delta >= 7 * SPERDAY:
        # nodename in self.act_1week:
        args['email'] = TECH | PI

        args['action'] = 'nocreate'
        # args['action'] = 'rins'
        args['message'] = message[1]
        args['stage'] = 'stage_actintwoweeks'
        diag_node.update(args)

    else:
        # the node is bad, but there's no previous record of it.
        args['email'] = TECH
        args['action'] = 'noop'
        args['message'] = message[0]
        args['stage'] = 'stage_actinoneweek'
        diag_node.update(args)

    print "%s" % diag_node['log'],
    print "%15s" % args['action']

    # Prepend so the newest record sits at index 0; mergePreviousActions
    # reads act_all[nodename][0].
    if nodename not in self.act_all: self.act_all[nodename] = []
    self.act_all[nodename].insert(0,diag_node)

    return args
+
def lappend_once(list, element):
    """Append element to list in place, but only if it is not already there."""
    if element in list:
        return
    list.append(element)
+ def sappend_once(string, element, separator=','):
+ if element not in string:
+ return ("%s%c%s" % (string, separator, element),1)