class Action(Thread):
    """Thread that acts on nodes which already have a diagnosis record.

    Hostnames passed in ``l_action`` are looked up in the cached
    ``diagnose_out`` database and grouped per site in ``self.sickdb``.
    For each site the thread may email the site contacts, run the actions
    named in the records, and it keeps what it did in ``self.act_all``
    (dumped to the ``act_all`` database when ``config.policysavedb`` is set).
    """

    def __init__(self, l_action):
        """Load the cached databases and build the action dispatch table.

        l_action -- iterable of hostnames this run is allowed to act on.
        """
        Thread.__init__(self)
        self.l_action = l_action

        # the hostname to loginbase mapping
        self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")

        # Actions to take.
        self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda: {})
        # Actions taken.
        self.act_all = database.if_cached_else(1, "act_all", lambda: {})

        # Action name -> callable taking the email_args dict.  Records only
        # carry the action *names*; pickle doesn't like lambdas.
        self.actions = {
            'suspendslices': lambda args: plc.suspendSlices(args['hostname']),
            'nocreate': lambda args: plc.removeSliceCreation(args['hostname']),
            'close_rt': lambda args: close_rt_backoff(args),
            'rins': lambda args: plc.nodeBootState(args['hostname'], "rins"),
            'reboot_node': lambda args: reboot_node(args),
        }
        # Every remaining action is a no-op that hands its argument back.
        # ('reset_nodemanager' used to call reset_nodemanager(args).)
        for noop_key in ('noop',
                         'reset_nodemanager',
                         'ticket_waitforever',
                         'waitforever',
                         'unknown',
                         'waitforoneweekaction',
                         'waitfortwoweeksaction',
                         'sendmailagain-waitforoneweekaction',
                         'sendmailagain-waitfortwoweeksaction',
                         'email-againwaitforever',
                         'email-againticket_waitforever'):
            self.actions[noop_key] = lambda args: args

        # sickdb[loginbase] = {'nodes': {nodename: diag_record}, 'config': ...}
        self.sickdb = {}

    def run(self):
        """Thread entry point: gather the sick sites, act on them, report.

        If analyseSites() raises, the traceback and error are printed,
        act_all is saved when config.policysavedb is set, and sys.exit(1)
        is called.
        """
        self.accumSites()
        summary = "Accumulated %d sick sites" % len(self.sickdb)
        print(summary)
        logger.debug(summary)

        try:
            stats = self.analyseSites()
        except Exception:
            err = sys.exc_info()[1]
            print("----------------")
            import traceback
            # print_exc() writes the traceback itself and returns None, so
            # its result must not be printed.
            traceback.print_exc()
            print(err)
            if config.policysavedb:
                print("Saving Databases... act_all")
                database.dbDump("act_all", self.act_all)
            sys.exit(1)

        for key in ("sites_observed", "sites_diagnosed", "nodes_diagnosed",
                    "sites_emailed", "nodes_actedon"):
            self.print_stats(key, stats)
        print(",".join(stats['allsites']))

        if config.policysavedb:
            print("Saving Databases... act_all")
            # TODO: remove 'diagnose_out',
            #       or at least the entries that were acted on.
            database.dbDump("act_all", self.act_all)

    def accumSites(self):
        """
        Take all nodes from l_action, look them up in the diagnose_db
        database, and insert them into sickdb[] as:

            sickdb[loginbase]['nodes'][nodename] = diag_record
            sickdb[loginbase]['config'] = diagnose_db[loginbase]['config']

        This way only the given l_action nodes will be acted on regardless
        of how many from diagnose_db are available.  Nodes without a
        diagnose_db record are skipped; an l_action of None is treated as
        empty.  A hostname missing from plcdb_hn2lb raises KeyError.
        """
        for nodename in self.l_action or ():
            loginbase = self.plcdb_hn2lb[nodename]
            if loginbase not in self.diagnose_db:
                continue
            site_diag = self.diagnose_db[loginbase]
            if nodename not in site_diag['nodes']:
                continue

            # NOTE: don't copy all node records, since not all will be in
            # l_action.
            site = self.sickdb.setdefault(loginbase, {'nodes': {}})
            site['nodes'][nodename] = site_diag['nodes'][nodename]
            # NOTE: but, we want to get the loginbase config settings,
            # this is the easiest way.
            site['config'] = site_diag['config']

    def __emailSite(self, loginbase, roles, message, args):
        """
        Send one message to the contacts of a site.

        loginbase -- the unique site abbreviation, prepended to slice names.
        roles     -- bitmask of ADMIN, TECH, PI, USER selecting the contacts.
        message   -- [<subject template>, <body template>], both
                     %-formatted with args.
        args      -- dict of template values; 'loginbase' is added to it.

        Returns the ticket id: args['ticket_id'] when the mail only goes to
        the admin address, otherwise whatever mailer.emailViaRT() returns.
        Any exception while formatting or sending is printed and 0 is
        returned (this includes a missing args['ticket_id']).
        """
        ticket_id = 0
        args.update({'loginbase': loginbase})

        # Some config combinations redirect everything to the admin address.
        if not config.mail and not config.debug and config.bcc:
            roles = ADMIN
        if config.mail and config.debug:
            roles = ADMIN

        # build targets
        contacts = []
        if ADMIN & roles:
            contacts.append(config.email)
        if TECH & roles:
            contacts.append(TECHEMAIL % loginbase)
        if PI & roles:
            contacts.append(PIEMAIL % loginbase)
        if USER & roles:
            slices = plc.slices(loginbase)
            if len(slices) >= 1:
                contacts.extend([SLICEMAIL % slicename for slicename in slices])
                print("SLIC: %20s : %d slices" % (loginbase, len(slices)))
            else:
                print("SLIC: %20s : 0 slices" % loginbase)

        try:
            subject = message[0] % args
            body = message[1] % args
            if ADMIN & roles:
                # send only to admin
                if 'ticket_id' in args:
                    subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
                else:
                    subj = "Re: [PL noticket] %s" % subject
                mailer.email(subj, body, contacts)
                ticket_id = args['ticket_id']
            else:
                ticket_id = mailer.emailViaRT(subject, body, contacts,
                                              args['ticket_id'])
        except Exception:
            print("exception on message:")
            import traceback
            traceback.print_exc()
            print(message)

        return ticket_id

    def _format_diaginfo(self, diag_node):
        """Return the one-line summary of a record's 'info' triple."""
        info = diag_node['info']
        if diag_node['stage'] == 'monitor-end-record':
            return " %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
        # (node, ver, daysdn)
        return " %s %s - %s\n" % (info[0], info[2], info[1])

    def get_email_args(self, act_recordlist, loginbase=None):
        """Build the template-argument dict for a list of action records.

        'hostname_list' concatenates every record's 'msg_format';
        'hostname', 'pcu_id' and 'ticket_id' end up holding the values of
        the last record that supplies them ('pcu_id' is "-1" for a record
        without PCU ids).  A record whose ticket_id is 0 or '0' makes this
        method prompt on stdin for the real id; unparsable input terminates
        the process via os._exit(1).
        """
        email_args = {'hostname_list': ""}

        for act_record in act_recordlist:
            email_args['hostname_list'] += act_record['msg_format']
            email_args['hostname'] = act_record['nodename']

            pcu_ids = act_record.get('plcnode', {}).get('pcu_ids', [])
            if len(pcu_ids) > 0:
                print("setting 'pcu_id' for email_args %s" % email_args['hostname'])
                email_args['pcu_id'] = pcu_ids[0]
            else:
                email_args['pcu_id'] = "-1"

            if 'ticket_id' not in act_record:
                continue

            if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
                print("Enter the ticket_id for %s @ %s"
                      % (loginbase, act_record['nodename']))
                sys.stdout.flush()
                line = sys.stdin.readline()
                try:
                    ticket_id = int(line)
                except ValueError:
                    print("could not get ticket_id from stdin...")
                    os._exit(1)
            else:
                ticket_id = act_record['ticket_id']

            email_args['ticket_id'] = ticket_id

        return email_args

    def get_unique_issues(self, act_recordlist):
        """Group records by their first action.

        Returns {action[0]: [records...]}, record order preserved.
        """
        # NOTE: only send one email per site, per problem...
        unique_issues = {}
        for act_record in act_recordlist:
            unique_issues.setdefault(act_record['action'][0], []).append(act_record)
        return unique_issues

    def __actOnSite(self, loginbase, site_record):
        """Act on every node of one site, then email and squeeze per issue.

        Returns (i_nodes_actedon, i_nodes_emailed).  A ticket id of 0 coming
        back from __emailSite is treated as fatal (os._exit(1)).  When
        config.policysavedb is set, the site is removed from diagnose_db and
        both databases are dumped.
        """
        i_nodes_actedon = 0
        i_nodes_emailed = 0

        act_recordlist = []
        for diag_record in site_record['nodes'].values():
            act_record = self.__actOnNode(diag_record)
            if act_record is not None:
                act_recordlist.append(act_record)

        unique_issues = self.get_unique_issues(act_recordlist)

        for issue, issue_record_list in unique_issues.items():
            print("\tworking on issue: %s" % issue)
            email_args = self.get_email_args(issue_record_list, loginbase)

            # for each record: if there's a pcu record and email config is
            # set, send the PCU-down notice for that node.
            for act_record in issue_record_list:
                if 'email_pcu' not in act_record:
                    continue
                if act_record['message'] is not None and \
                        act_record['email_pcu'] and \
                        site_record['config']['email']:
                    email_args['hostname'] = act_record['nodename']
                    ticket_id = self.__emailSite(loginbase,
                                                 act_record['email'],
                                                 emailTxt.mailtxt.pcudown[0],
                                                 email_args)
                    if ticket_id == 0:
                        # error.
                        print("got a ticket_id == 0!!!! %s" % act_record['nodename'])
                        os._exit(1)
                    email_args['ticket_id'] = ticket_id

            # One message per issue, taken from the first record.
            act_record = issue_record_list[0]
            has_message = act_record['message'] is not None
            # send message before squeezing
            print("\t\tconfig.email: %s and %s" % (has_message,
                                                   site_record['config']['email']))
            if has_message and site_record['config']['email']:
                ticket_id = self.__emailSite(loginbase, act_record['email'],
                                             act_record['message'], email_args)
                if ticket_id == 0:
                    # error.
                    print("ticket_id == 0 for %s %s" % (loginbase, act_record['nodename']))
                    os._exit(1)

                # Add ticket_id to ALL nodenames.
                # NOTE: this loop deliberately reuses `act_record`, so the
                # squeeze step below sees the last record of the issue, as
                # it always has.
                for act_record in issue_record_list:
                    nodename = act_record['nodename']
                    # update node record with RT ticket_id
                    if nodename in self.act_all:
                        self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
                        # if the ticket was previously resolved, reset it to new.
                        if 'rt' in act_record and \
                                'Status' in act_record['rt'] and \
                                act_record['rt']['Status'] == 'resolved':
                            mailer.setTicketStatus(ticket_id, "new")
                            status = mailer.getTicketStatus(ticket_id)
                            self.act_all[nodename][0]['rt'] = status
                    if config.mail:
                        i_nodes_emailed += 1

            print("\t\tconfig.squeeze: %s and %s" % (config.squeeze,
                                                     site_record['config']['squeeze']))
            if config.squeeze and site_record['config']['squeeze']:
                for act_key in act_record['action']:
                    self.actions[act_key](email_args)
                i_nodes_actedon += 1

        if config.policysavedb:
            print("Saving Databases... act_all, diagnose_out")
            database.dbDump("act_all", self.act_all)
            # remove site record from diagnose_out, it's in act_all as done.
            del self.diagnose_db[loginbase]
            database.dbDump("diagnose_out", self.diagnose_db)

        print("sleeping for 1 sec")
        time.sleep(1)

        return (i_nodes_actedon, i_nodes_emailed)

    def __actOnNode(self, diag_record):
        """Turn a diagnosis record into an action record for one node.

        For a node whose log contains "DOWN" and which has a PCU, the node
        is set to "rins" and a reboot is attempted first.  A successful
        reboot is stored in act_all as a 'reboot_node' record and None is
        returned (no further action).  A failed reboot flags the record
        with 'reboot_node_failed' and 'email_pcu'.

        Otherwise the record is returned, and it is also pushed to the
        front of act_all[nodename] unless its stage is exactly
        'monitor-end-record' or 'nmreset'.
        """
        nodename = diag_record['nodename']

        act_record = dict(diag_record)
        act_record['nodename'] = nodename
        act_record['msg_format'] = self._format_diaginfo(diag_record)
        print("act_record['stage'] == %s " % act_record['stage'])

        # avoid end records, and nmreset records
        # reboot_node_failed, is set below, so don't reboot repeatedly.
        if 'monitor-end-record' not in act_record['stage'] and \
                'nmreset' not in act_record['stage'] and \
                'reboot_node_failed' not in act_record:

            if "DOWN" in act_record['log'] and \
                    'pcu_ids' in act_record['plcnode'] and \
                    len(act_record['plcnode']['pcu_ids']) > 0:

                print(("%s" % act_record['log']) + " " + ("%15s" % (['reboot_node'],)))
                # Set node to re-install
                plc.nodeBootState(act_record['nodename'], "rins")
                try:
                    ret = reboot_node({'hostname': act_record['nodename']})
                except Exception:
                    print("exception on reboot_node:")
                    import traceback
                    traceback.print_exc()
                    ret = False

                if ret:
                    # Reboot Succeeded
                    print("reboot succeeded for %s" % act_record['nodename'])
                    reboot_record = dict(act_record)
                    reboot_record['action'] = ['reboot_node']
                    reboot_record['stage'] = "reboot_node"
                    reboot_record['reboot_node_failed'] = False
                    reboot_record['email_pcu'] = False

                    print("inserting 'reboot_node' record into act_all")
                    self.act_all.setdefault(nodename, []).insert(0, reboot_record)

                    # return None to avoid further action
                    print("Taking no further action")
                    return None

                print("reboot failed for %s" % act_record['nodename'])
                # set email_pcu to also send pcu notice for this record.
                act_record['reboot_node_failed'] = True
                act_record['email_pcu'] = True

        print(("%s" % act_record['log']) + " " + ("%15s" % act_record['action']))

        # Compare by value: `is not` against a string literal tests object
        # identity and is practically always true.
        if act_record['stage'] not in ('monitor-end-record', 'nmreset'):
            self.act_all.setdefault(nodename, []).insert(0, act_record)
        else:
            print("Not recording %s in act_all" % nodename)

        return act_record

    def analyseSites(self):
        """Act on every site in sickdb (sorted by loginbase).

        Returns a dict with the counters 'sites_observed',
        'sites_diagnosed', 'nodes_diagnosed', 'sites_emailed',
        'nodes_actedon' and the list 'allsites' of processed loginbases.
        'sites_emailed' accumulates the second value returned by
        __actOnSite.
        """
        i_sites_observed = 0
        i_sites_diagnosed = 0
        i_nodes_diagnosed = 0
        i_nodes_actedon = 0
        i_sites_emailed = 0
        l_allsites = []

        for loginbase in sorted(self.sickdb):
            site_record = self.sickdb[loginbase]
            print("sites: %s" % loginbase)

            # Count the nodes, not the keys of the site record itself.
            i_nodes_diagnosed += len(site_record['nodes'])
            i_sites_diagnosed += 1

            (na, ne) = self.__actOnSite(loginbase, site_record)

            i_sites_observed += 1
            i_nodes_actedon += na
            i_sites_emailed += ne

            l_allsites.append(loginbase)

        return {'sites_observed': i_sites_observed,
                'sites_diagnosed': i_sites_diagnosed,
                'nodes_diagnosed': i_nodes_diagnosed,
                'sites_emailed': i_sites_emailed,
                'nodes_actedon': i_nodes_actedon,
                'allsites': l_allsites}

    def print_stats(self, key, stats):
        """Print the integer counter stats[key], labelled with key."""
        print("%20s : %d" % (key, stats[key]))
-
-
-
- #"""
- #Prints, logs, and emails status of up nodes, down nodes, and buckets.
- #"""
- #def status(self):
- # sub = "Monitor Summary"
- # msg = "\nThe following nodes were acted upon: \n\n"
- # for (node, (type, date)) in self.emailed.items():
- # # Print only things acted on today.
- # if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
- # msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
- # msg +="\n\nThe following sites have been 'squeezed':\n\n"
- # for (loginbase, (date, type)) in self.squeezed.items():
- # # Print only things acted on today.
- # if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
- # msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
- # mailer.email(sub, msg, [SUMTO])
- # logger.info(msg)
- # return
-
- #"""
- #Store/Load state of emails. When, where, what.
- #"""
- #def emailedStore(self, action):
- # try:
- # if action == "LOAD":
- # f = open(DAT, "r+")
- # logger.info("POLICY: Found and reading " + DAT)
- # self.emailed.update(pickle.load(f))
- # if action == "WRITE":
- # f = open(DAT, "w")
- # #logger.debug("Writing " + DAT)
- # pickle.dump(self.emailed, f)
- # f.close()
- # except Exception, err:
- # logger.info("POLICY: Problem with DAT, %s" %err)
-
-
-#class Policy(Thread):