- elif 'ticket_waitforever' in diag_record['stage']:
- act_record['email'] = TECH
- if 'first-found' not in act_record:
- act_record['first-found'] = True
- act_record['log'] += " firstfound"
- act_record['action'] = ['ticket_waitforever']
- act_record['message'] = message[0]
- act_record['time'] = current_time
- else:
- if delta >= 7*SPERDAY:
- act_record['action'] = ['ticket_waitforever']
- if 'rt' in act_record and 'Status' in act_record['rt'] and \
- act_record['rt']['Status'] == 'new':
- act_record['message'] = message[0]
- else:
- act_record['message'] = None
-
- act_record['time'] = current_time # reset clock
- else:
- act_record['action'] = ['ticket_waitforever']
- act_record['message'] = None
- return None
-
- elif 'waitforever' in diag_record['stage']:
- # more than 3 days since last action
- # TODO: send only on weekdays.
- # NOTE: expects that 'time' has been reset before entering waitforever stage
- if delta >= 3*SPERDAY:
- act_record['action'] = ['email-againwaitforever']
- act_record['message'] = message[2]
- act_record['time'] = current_time # reset clock
- else:
- act_record['action'] = ['waitforever']
- act_record['message'] = None
- return None # don't send if there's no action
-
- else:
- # There is no action to be taken, possibly b/c the stage has
- # already been performed, but diagnose picked it up again.
- # two cases,
- # 1. stage is unknown, or
- # 2. delta is not big enough to bump it to the next stage.
- # TODO: figure out which. for now assume 2.
- print "UNKNOWN stage for %s; nothing done" % nodename
- act_record['action'] = ['unknown']
- act_record['message'] = message[0]
-
- act_record['email'] = TECH
- act_record['action'] = ['noop']
- act_record['message'] = message[0]
- act_record['stage'] = 'stage_actinoneweek'
- act_record['time'] = current_time # reset clock
- #print "Exiting..."
- #return None
- #sys.exit(1)
-
- print "%s" % act_record['log'],
- print "%15s" % act_record['action']
- return act_record
-
def getMaxSlices(self, loginbase):
    """Return the 'max_slices' entry of the site record for loginbase.

    The site record is the ['values']['plcsite'] entry of the first node
    listed in self.diagnose_in[loginbase] that is also present in
    self.findbad['nodes'].

    Raises:
        Exception: none of the site's nodes is present in findbad.
        KeyError: loginbase is unknown, or the site record has no
            'max_slices' entry.
    """
    # if sickdb has a loginbase, then it will have at least one node.
    for nodename in self.diagnose_in[loginbase]:
        if nodename in self.findbad['nodes']:
            # The first node found in findbad supplies the site record.
            site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
            return site_stats['max_slices']

    # Call form of raise: valid on every Python version, unlike the
    # legacy `raise Exception, "..."` statement.
    raise Exception("loginbase with no nodes in findbad")
-
def getNumNodes(self, loginbase):
    """Return the 'num_nodes' entry of the site record for loginbase.

    The site record is the ['values']['plcsite'] entry of the first node
    listed in self.diagnose_in[loginbase] that is also present in
    self.findbad['nodes'].  When the record has no 'num_nodes' entry the
    result is 0.

    Raises:
        Exception: none of the site's nodes is present in findbad.
        KeyError: loginbase is not a key of self.diagnose_in.
    """
    # if sickdb has a loginbase, then it will have at least one node.
    for nodename in self.diagnose_in[loginbase]:
        if nodename in self.findbad['nodes']:
            # The first node found in findbad supplies the site record.
            site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
            return site_stats.get('num_nodes', 0)

    # Call form of raise: valid on every Python version, unlike the
    # legacy `raise Exception, "..."` statement.
    raise Exception("loginbase with no nodes in findbad")
-
def getUpAtSite(self, loginbase, d_diag_site):
    """Return the number of nodes at the site that are assumed to be up.

    Starts from self.getNumNodes(loginbase) and subtracts one for every
    record in d_diag_site[loginbase]['nodes'] whose 'stage' is anything
    other than 'monitor-end-record'.  Nodes without a record are assumed
    to be up (too conservative).

    No lower bound is applied, so the result can be negative when more
    records are counted as down than getNumNodes() reports.

    TODO: THIS DOESN'T WORK!!! it misses all the 'debug' state nodes
          that aren't recorded yet.
    TODO: make the 'up' value more representative
    """
    numnodes = self.getNumNodes(loginbase)

    # A record in any stage but 'monitor-end-record' counts as a down node.
    down = 0
    for rec in d_diag_site[loginbase]['nodes'].values():
        if rec['stage'] != 'monitor-end-record':
            down += 1

    return numnodes - down
-
-
class SiteAction:
    """Base class for an action that is run with a dict of arguments.

    Subclasses override _run(); callers invoke run(), which first checks
    that every name in self.parameter_names is present in the argument
    dict.
    """

    def __init__(self, parameter_names=None):
        """Store the names of the arguments that run() requires.

        parameter_names defaults to ['hostname', 'ticket_id'].  A fresh
        list is built for each instance so that instances never share
        (and cannot accidentally mutate) one default list object.
        """
        if parameter_names is None:
            parameter_names = ['hostname', 'ticket_id']
        self.parameter_names = parameter_names

    def checkParam(self, args):
        """Raise Exception if any required parameter is missing from args."""
        for param in self.parameter_names:
            if param not in args:
                raise Exception("Parameter %s not provided in args"%param)

    def run(self, args):
        """Validate args with checkParam(), then return self._run(args)."""
        self.checkParam(args)
        return self._run(args)

    def _run(self, args):
        """Hook for subclasses; the base implementation does nothing."""
        pass
-
class SuspendAction(SiteAction):
    """SiteAction that calls plc.suspendSlices for the given hostname."""

    def _run(self, args):
        """Return the result of plc.suspendSlices(args['hostname'])."""
        hostname = args['hostname']
        return plc.suspendSlices(hostname)
-
class RemoveSliceCreation(SiteAction):
    """SiteAction that calls plc.removeSliceCreation for the given hostname."""

    def _run(self, args):
        """Return the result of plc.removeSliceCreation(args['hostname'])."""
        hostname = args['hostname']
        return plc.removeSliceCreation(hostname)
-
class BackoffActions(SiteAction):
    """SiteAction that re-enables slices and slice creation for a hostname."""

    def _run(self, args):
        """Call plc.enableSlices, then plc.enableSliceCreation; return True."""
        # Order matters only in that it mirrors the established sequence:
        # slices first, slice creation second.
        for enable in (plc.enableSlices, plc.enableSliceCreation):
            enable(args['hostname'])
        return True
-
-# TODO: create class for each action below,
-# allow for lists of actions to be performed...
-
def close_rt_backoff(args):
    """Close the RT ticket named in args, if any, and back off for the host.

    When args carries a 'ticket_id' that is neither "" nor None, the ticket
    is closed through mailer.closeTicketViaRT and slices and slice creation
    are re-enabled for args['hostname'].  Otherwise nothing happens.
    Always returns None.
    """
    ticket_id = args.get('ticket_id')
    # Note: a ticket_id of 0 is neither "" nor None, so it passes this test.
    if ticket_id not in ("", None):
        mailer.closeTicketViaRT(ticket_id,
                                "Ticket CLOSED automatically by SiteAssist.")
        hostname = args['hostname']
        plc.enableSlices(hostname)
        plc.enableSliceCreation(hostname)
-
def reboot_node(args):
    """Return reboot.reboot_policy(args['hostname'], True, config.debug)."""
    # NOTE(review): the meaning of the literal True is defined by
    # reboot.reboot_policy and is not visible from here.
    return reboot.reboot_policy(args['hostname'], True, config.debug)
-
def reset_nodemanager(args):
    """Restart the 'nm' service on args['hostname'] over ssh as root.

    The target host is taken from args['hostname']; the previous code
    referenced an undefined name and failed with NameError on every call.

    The command is passed to subprocess as an argument list, so the
    hostname is never interpreted by a local shell.  The exit status of
    ssh is ignored and the function always returns None.

    Raises:
        KeyError: args has no 'hostname' entry.
    """
    hostname = args['hostname']

    # Function-scope import keeps this block self-contained.
    import subprocess
    subprocess.call(["ssh", "root@%s" % hostname,
                     "/sbin/service", "nm", "restart"])
    return
-
-class Action(Thread):
def __init__(self, l_action):
    """Load the databases and build the action dispatch table.

    l_action is stored as-is; accumSites() later iterates over it as a
    sequence of node names.
    """
    self.l_action = l_action

    # the hostname to loginbase mapping
    self.plcdb_hn2lb = database.dbLoad("plcdb_hn2lb")

    # Actions to take.
    # NOTE(review): presumably if_cached_else returns the stored database
    # or the result of the fallback callable -- confirm in `database`.
    self.diagnose_db = database.if_cached_else(1, "diagnose_out", lambda : {})
    # Actions taken.
    self.act_all = database.if_cached_else(1, "act_all", lambda : {})

    # A dict of actions to specific functions. PICKLE doesn't like lambdas.
    passthrough = lambda args: args
    self.actions = {
        'suspendslices': lambda args: plc.suspendSlices(args['hostname']),
        'nocreate': lambda args: plc.removeSliceCreation(args['hostname']),
        'close_rt': lambda args: close_rt_backoff(args),
        'rins': lambda args: plc.nodeBootState(args['hostname'], "rins"),
        'reboot_node': lambda args: reboot_node(args),
    }

    # Every remaining action simply hands its arguments back.
    # 'reset_nodemanager' is deliberately among them: the real
    # reset_nodemanager(args) call is disabled.
    for action_name in (
            'noop',
            'reset_nodemanager',
            'ticket_waitforever',
            'waitforever',
            'unknown',
            'waitforoneweekaction',
            'waitfortwoweeksaction',
            'sendmailagain-waitforoneweekaction',
            'sendmailagain-waitfortwoweeksaction',
            'email-againwaitforever',
            'email-againticket_waitforever'):
        self.actions[action_name] = passthrough

    # Filled by accumSites(): sickdb[loginbase]['nodes'][nodename] = diag_record
    self.sickdb = {}
    Thread.__init__(self)
-
def run(self):
    """Thread body: gather the sick sites, act on them and report.

    Calls accumSites() and then analyseSites().  If analyseSites() raises,
    the traceback and the error are printed, 'act_all' is saved when
    config.policysavedb is set, and the process exits with status 1.
    Otherwise the statistics are printed and 'act_all' is saved when
    config.policysavedb is set.
    """
    self.accumSites()
    summary = "Accumulated %d sick sites" % len(self.sickdb)
    print(summary)
    logger.debug(summary)

    try:
        stats = self.analyseSites()
    except Exception:
        # sys.exc_info() works on every Python version, unlike the
        # `except Exception, err` spelling.
        err = sys.exc_info()[1]
        print("----------------")
        import traceback
        # print_exc() writes the traceback itself and returns None;
        # printing its return value only emitted a stray "None".
        traceback.print_exc()
        print(err)
        if config.policysavedb:
            print("Saving Databases... act_all")
            database.dbDump("act_all", self.act_all)
        sys.exit(1)

    for stat_name in ("sites_observed",
                      "sites_diagnosed",
                      "nodes_diagnosed",
                      "sites_emailed",
                      "nodes_actedon"):
        print_stats(stat_name, stats)
    print(",".join(stats['allsites']))

    if config.policysavedb:
        print("Saving Databases... act_all")
        #database.dbDump("policy.eventlog", self.eventlog)
        # TODO: remove 'diagnose_out',
        #       or at least the entries that were acted on.
        database.dbDump("act_all", self.act_all)
-
def accumSites(self):
    """Copy the diagnose records of the requested nodes into self.sickdb.

    Each node name in self.l_action is mapped to its loginbase through
    self.plcdb_hn2lb.  If self.diagnose_db has a record for that node, it
    is stored as

        sickdb[loginbase]['nodes'][nodename] = diag_record

    and the site's 'config' entry is copied alongside.  This way only the
    given l_action nodes are acted on, regardless of how many records
    diagnose_db holds.  Nodes without a diagnose record are skipped
    silently.  An l_action of None is treated as "no nodes".

    Raises:
        KeyError: a node name has no entry in self.plcdb_hn2lb.
    """
    if self.l_action is None:
        return

    for nodename in self.l_action:
        loginbase = self.plcdb_hn2lb[nodename]

        if loginbase not in self.diagnose_db:
            continue
        site_diag = self.diagnose_db[loginbase]
        if nodename not in site_diag['nodes']:
            continue

        if loginbase not in self.sickdb:
            self.sickdb[loginbase] = {'nodes' : {}}

        # NOTE: don't copy all node records, since not all will be in l_action
        self.sickdb[loginbase]['nodes'][nodename] = site_diag['nodes'][nodename]
        # NOTE: but, we want to get the loginbase config settings,
        # this is the easiest way.
        self.sickdb[loginbase]['config'] = site_diag['config']
    return
-
def __emailSite(self, loginbase, roles, message, args):
    """Send one message about a site and return the ticket id.

    loginbase is the unique site abbreviation, prepended to slice names.
    roles     is a bit mask of ADMIN, TECH, PI and USER; it selects the
              email aliases the message goes to.
    message   is a sequence [<subj>, <body>] of %-format templates.
    args      is the dict the templates are formatted with; it is updated
              in place with a 'loginbase' entry.

    Depending on the config.mail / config.debug / config.bcc settings the
    recipients are replaced by ADMIN only.  Mail for ADMIN goes out through
    mailer.email; everything else goes through mailer.emailViaRT.

    Returns args['ticket_id'] (admin path, 0 if absent), the value returned
    by mailer.emailViaRT (RT path), or 0 if formatting or sending failed.
    Failures are reported on the console and are not raised.
    """
    ticket_id = 0
    args.update({'loginbase':loginbase})

    if not config.mail and not config.debug and config.bcc:
        roles = ADMIN
    if config.mail and config.debug:
        roles = ADMIN

    # build targets
    contacts = []
    if ADMIN & roles:
        contacts += [config.email]
    if TECH & roles:
        contacts += [TECHEMAIL % loginbase]
    if PI & roles:
        contacts += [PIEMAIL % loginbase]
    if USER & roles:
        slices = plc.slices(loginbase)
        for slicename in slices:
            contacts += [SLICEMAIL % slicename]
        # Also yields "... : 0 slices" when the site has no slices.
        print("SLIC: %20s : %d slices" % (loginbase, len(slices)))

    try:
        subject = message[0] % args
        body = message[1] % args
        if ADMIN & roles:
            # send only to admin
            if 'ticket_id' in args:
                subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
            else:
                subj = "Re: [PL noticket] %s" % subject
            mailer.email(subj, body, contacts)
            # 'ticket_id' may legitimately be absent here (see above);
            # keep the initial 0 instead of failing after the mail is sent.
            ticket_id = args.get('ticket_id', ticket_id)
        else:
            ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
    except Exception:
        print("exception on message:")
        import traceback
        # print_exc() writes the traceback itself and returns None;
        # printing its return value only emitted a stray "None".
        traceback.print_exc()
        print(message)

    return ticket_id
-
-
- def _format_diaginfo(self, diag_node):
- info = diag_node['info']
- if diag_node['stage'] == 'monitor-end-record':
- hlist = " %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
- else:
- hlist = " %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
- return hlist
-
-
- def get_email_args(self, act_recordlist, loginbase=None):
-
- email_args = {}
- email_args['hostname_list'] = ""
-
- for act_record in act_recordlist:
- email_args['hostname_list'] += act_record['msg_format']
- email_args['hostname'] = act_record['nodename']
- if 'plcnode' in act_record and \
- 'pcu_ids' in act_record['plcnode'] and \
- len(act_record['plcnode']['pcu_ids']) > 0:
- print "setting 'pcu_id' for email_args %s"%email_args['hostname']
- email_args['pcu_id'] = act_record['plcnode']['pcu_ids'][0]
- else:
- email_args['pcu_id'] = "-1"
-
- if 'ticket_id' in act_record:
- if act_record['ticket_id'] == 0 or act_record['ticket_id'] == '0':
- print "Enter the ticket_id for %s @ %s" % (loginbase, act_record['nodename'])
- sys.stdout.flush()
- line = sys.stdin.readline()
- try:
- ticket_id = int(line)
- except:
- print "could not get ticket_id from stdin..."
- os._exit(1)
- else:
- ticket_id = act_record['ticket_id']