- current_time = time.time()
- delta = current_time - diag_record['time']
-
- message = diag_record['message']
- act_record = {}
- act_record.update(diag_record)
-
- #### DIAGNOSE STAGES
- #print "%s has stage %s" % (nodename, diag_record['stage'])
- if 'findbad' in diag_record['stage']:
- # The node is bad, and there's no previous record of it.
- act_record['email'] = TECH # addative emails
- act_record['action'] = 'noop'
- act_record['message'] = message[0]
- act_record['stage'] = 'stage_actinoneweek'
-
- elif 'improvement' in diag_record['stage']:
- # - backoff previous squeeze actions (slice suspend, nocreate)
- # TODO: add a backoff_squeeze section... Needs to runthrough
- act_record['action'] = 'close_rt'
- act_record['message'] = message[0]
- act_record['stage'] = 'monitor-end-record'
-
- elif 'actinoneweek' in diag_record['stage']:
- act_record['email'] = TECH | PI # addative emails
- if delta >= 7 * SPERDAY:
- act_record['stage'] = 'stage_actintwoweeks'
- act_record['message'] = message[1]
- act_record['action'] = 'nocreate'
- elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
- act_record['message'] = message[1]
- act_record['action'] = 'sendmailagain-waitforoneweekaction'
- act_record['second-mail-at-oneweek'] = True
- else:
- act_record['message'] = None
- act_record['action'] = 'waitforoneweekaction'
-
- elif 'actintwoweeks' in diag_record['stage']:
- act_record['email'] = TECH | PI | USER # addative emails
- if delta >= 14 * SPERDAY:
- act_record['stage'] = 'stage_waitforever'
- act_record['message'] = message[2]
- act_record['action'] = 'suspendslices'
- act_record['time'] = current_time # reset clock for waitforever
- elif delta >= 10* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
- act_record['message'] = message[2]
- act_record['action'] = 'sendmailagain-waitfortwoweeksaction'
- act_record['second-mail-at-twoweeks'] = True
- else:
- act_record['message'] = None
- act_record['action'] = 'waitfortwoweeksaction'
-
- elif 'ticket_waitforever' in diag_record['stage']:
- act_record['email'] = TECH
- if 'first-found' not in act_record:
- act_record['first-found'] = True
- act_record['action'] = 'ticket_waitforever'
- act_record['message'] = None
- act_record['time'] = current_time
- else:
- if delta >= 7*SPERDAY:
- act_record['action'] = 'email-againticket_waitforever'
- act_record['message'] = message[0]
- act_record['time'] = current_time # reset clock
- else:
- act_record['action'] = 'ticket_waitforever'
- act_record['message'] = None
-
- elif 'waitforever' in diag_record['stage']:
- # more than 3 days since last action
- # TODO: send only on weekdays.
- # NOTE: expects that 'time' has been reset before entering waitforever stage
- if delta >= 3*SPERDAY:
- act_record['action'] = 'email-againwaitforever'
- act_record['message'] = message[0]
- act_record['time'] = current_time # reset clock
- else:
- act_record['action'] = 'waitforever'
- act_record['message'] = None
-
- else:
- # There is no action to be taken, possibly b/c the stage has
- # already been performed, but diagnose picked it up again.
- # two cases,
- # 1. stage is unknown, or
- # 2. delta is not big enough to bump it to the next stage.
- # TODO: figure out which. for now assume 2.
- print "UNKNOWN!!? %s" % nodename
- act_record['action'] = 'unknown'
- act_record['message'] = message[0]
- print "Exiting..."
- sys.exit(1)
-
- print "%s" % act_record['log'],
- print "%15s" % act_record['action']
- return act_record
-
-
class SiteAction:
    """
    Base class for an action that is run against a dict of arguments.

    Subclasses override _run(); callers invoke run(), which first checks
    that every name in parameter_names is present in the args it is given.
    """

    def __init__(self, parameter_names=None):
        """
        parameter_names -- names that must be present in the args passed
                           to run(); defaults to ['hostname', 'ticket_id'].
        """
        # Build the default list per instance.  A list literal in the
        # signature is created once and shared by every instance that relies
        # on the default, so a change made through one would leak to all.
        if parameter_names is None:
            parameter_names = ['hostname', 'ticket_id']
        self.parameter_names = parameter_names

    def checkParam(self, args):
        """Raise Exception naming the first required parameter missing from args."""
        for param in self.parameter_names:
            if param not in args:
                raise Exception("Parameter %s not provided in args" % param)

    def run(self, args):
        """Validate args with checkParam(), then return the result of _run(args)."""
        self.checkParam(args)
        return self._run(args)

    def _run(self, args):
        """Hook for subclasses; the base implementation does nothing and returns None."""
        pass
-
class SuspendAction(SiteAction):
    """Action whose _run() hands args['hostname'] to plc.suspendSlices."""

    def _run(self, args):
        """Return whatever plc.suspendSlices returns for args['hostname']."""
        hostname = args['hostname']
        return plc.suspendSlices(hostname)
-
class RemoveSliceCreation(SiteAction):
    """Action whose _run() hands args['hostname'] to plc.removeSliceCreation."""

    def _run(self, args):
        """Return whatever plc.removeSliceCreation returns for args['hostname']."""
        hostname = args['hostname']
        return plc.removeSliceCreation(hostname)
-
class BackoffActions(SiteAction):
    """Action that calls plc.enableSlices and plc.enableSliceCreation for a host."""

    def _run(self, args):
        """Run both plc calls for args['hostname'], in that order, and return True."""
        hostname = args['hostname']
        for enable in (plc.enableSlices, plc.enableSliceCreation):
            enable(hostname)
        return True
-
-# TODO: create class for each action below,
-# allow for lists of actions to be performed...
-
def close_rt_backoff(args):
    """
    Close the RT ticket args['ticket_id'] with a fixed note, then call
    plc.enableSlices and plc.enableSliceCreation for args['hostname'].

    Returns None.
    """
    closing_note = "Ticket CLOSED automatically by SiteAssist."
    mailer.closeTicketViaRT(args['ticket_id'], closing_note)

    # The hostname is looked up only after the ticket has been closed, so a
    # missing 'hostname' key still fails after the close, never before it.
    hostname = args['hostname']
    plc.enableSlices(hostname)
    plc.enableSliceCreation(hostname)
-
-class Action(Thread):
def __init__(self, l_action):
    """
    Keep the list of nodes to act on, load the databases the thread works
    from, and build the table that maps action names to callables.

    l_action -- the node names to act on; stored as given.
    """
    self.l_action = l_action

    # The hostname to loginbase mapping.
    self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")

    # diagnose_db holds the actions to take, act_all the actions taken.
    # NOTE(review): presumably if_cached_else falls back to the lambda's
    # empty dict when nothing is cached -- confirm against soltesz.
    self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda: {})
    self.act_all = soltesz.if_cached_else(1, "act_all", lambda: {})

    # Action name -> callable taking the args dict.
    # NOTE (from the original author): pickle doesn't like lambdas.
    self.actions = {
        'suspendslices': lambda args: plc.suspendSlices(args['hostname']),
        'nocreate': lambda args: plc.removeSliceCreation(args['hostname']),
        'close_rt': lambda args: close_rt_backoff(args),
        'rins': lambda args: plc.nodeBootState(args['hostname'], "rins"),
    }

    # The remaining actions do nothing to the site: they hand args back.
    passthrough_actions = (
        'noop',
        'ticket_waitforever',
        'waitforever',
        'unknown',
        'waitforoneweekaction',
        'waitfortwoweeksaction',
        'sendmailagain-waitforoneweekaction',
        'sendmailagain-waitfortwoweeksaction',
        'email-againwaitforever',
        'email-againticket_waitforever',
    )
    for action_name in passthrough_actions:
        self.actions[action_name] = lambda args: args

    self.sickdb = {}
    Thread.__init__(self)
-
def run(self):
    """
    Thread body: accumulate the sick sites, analyse them, print the
    resulting statistics, and save act_all when config.policysavedb is set.

    If analyseSites() raises, the traceback and the error are printed,
    act_all is saved (again only when config.policysavedb is set) and
    sys.exit(1) is called.
    """
    self.accumSites()
    summary = "Accumulated %d sick sites" % len(self.sickdb)
    print(summary)
    logger.debug(summary)

    try:
        stats = self.analyseSites()
    except Exception:
        # Fetch the exception through sys.exc_info() so this handler
        # parses on old and new interpreters alike.
        err = sys.exc_info()[1]
        print("----------------")
        import traceback
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in a print: that added a stray "None" line.
        traceback.print_exc()
        print(err)
        if config.policysavedb:
            print("Saving Databases... act_all")
            soltesz.dbDump("act_all", self.act_all)
        sys.exit(1)

    for stat_name in ("sites", "sites_diagnosed", "nodes_diagnosed",
                      "sites_emailed", "nodes_actedon"):
        self.print_stats(stat_name, stats)
    print(",".join(stats['allsites']))

    if config.policysavedb:
        print("Saving Databases... act_all")
        # TODO: remove 'diagnose_out',
        #       or at least the entries that were acted on.
        soltesz.dbDump("act_all", self.act_all)
-
def accumSites(self):
    """
    Take all nodes from l_action, look them up in the diagnose_db database,
    and insert them into sickdb[] as:

        sickdb[loginbase][nodename] = diag_record

    This way only the given l_action nodes will be acted on regardless
    of how many from diagnose_db are available.

    A node whose site or hostname has no entry in diagnose_db is skipped.
    A node name missing from plcdb_hn2lb raises KeyError.
    Returns None.
    """
    # l_action is stored unvalidated by __init__; treat None as "no nodes"
    # instead of failing with a TypeError on the iteration below.
    if self.l_action is None:
        return

    for nodename in self.l_action:
        loginbase = self.plcdb_hn2lb[nodename]

        if loginbase not in self.diagnose_db:
            continue
        site_records = self.diagnose_db[loginbase]
        if nodename not in site_records:
            continue

        # Create the per-site dict on first use, then file the record.
        self.sickdb.setdefault(loginbase, {})[nodename] = site_records[nodename]
-
def __emailSite(self, loginbase, roles, message, args):
    """
    Mail the contacts of one site and return the ticket id.

    loginbase -- the unique site abbreviation, prepended to slice names.
    roles     -- bitmask of ADMIN, TECH, PI and USER selecting which
                 email aliases are contacted.
    message   -- pair [<subj>, <body>] of %-format templates that are
                 filled in from args.
    args      -- dict of template values; 'loginbase' is added to it
                 in place.

    Returns args['ticket_id'] (0 when absent) after an admin-only mail,
    otherwise the value returned by mailer.emailViaRT.  Any exception
    raised while formatting or sending is printed, not propagated, and 0
    is returned.
    """
    ticket_id = 0
    args.update({'loginbase': loginbase})

    # These configuration combinations redirect the mail to the admin only.
    if not config.mail and not config.debug and config.bcc:
        roles = ADMIN
    if config.mail and config.debug:
        roles = ADMIN

    # build targets
    contacts = []
    if ADMIN & roles:
        contacts.append(config.email)
    if TECH & roles:
        contacts.append(TECHEMAIL % loginbase)
    if PI & roles:
        contacts.append(PIEMAIL % loginbase)
    if USER & roles:
        slices = plc.slices(loginbase)
        if len(slices) >= 1:
            for slice_name in slices:
                contacts.append(SLICEMAIL % slice_name)
            print("SLIC: %20s : %d slices" % (loginbase, len(slices)))
        else:
            print("SLIC: %20s : 0 slices" % loginbase)

    try:
        subject = message[0] % args
        body = message[1] % args
        if ADMIN & roles:
            # send only to admin
            if 'ticket_id' in args:
                subj = "Re: [PL #%s] %s" % (args['ticket_id'], subject)
            else:
                subj = "Re: [PL noticket] %s" % subject
            mailer.email(subj, body, contacts)
            # A missing ticket_id was already handled for the subject
            # above; a plain lookup here raised KeyError after the mail
            # had gone out and reported a failure that never happened.
            ticket_id = args.get('ticket_id', ticket_id)
        else:
            # NOTE(review): without 'ticket_id' in args this lookup raises
            # KeyError, the handler below reports it and no mail is sent.
            # Confirm that callers always supply it on this path.
            ticket_id = mailer.emailViaRT(subject, body, contacts, args['ticket_id'])
    except Exception:
        print("exception on message:")
        import traceback
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in a print: that added a stray "None" line.
        traceback.print_exc()
        print(message)

    return ticket_id
-
-
- def _format_diaginfo(self, diag_node):
- info = diag_node['info']
- if diag_node['stage'] == 'monitor-end-record':
- hlist = " %s went from '%s' to '%s'\n" % (info[0], info[1], info[2])
- else:
- hlist = " %s %s - %s\n" % (info[0], info[2], info[1]) #(node,ver,daysdn)
- return hlist
-
- def __actOnSite(self, loginbase, site_record):
- i_nodes_actedon = 0
- i_nodes_emailed = 0
- b_squeeze = config.squeeze
-
- act_recordlist = []
-
- for nodename in site_record.keys():
- diag_record = site_record[nodename]
- act_record = self.__actOnNode(diag_record)
- act_recordlist += [act_record]
-
- count_up = self.currentUpAtSite(loginbase)
- if count_up < MINUP:
- print "SITE: %20s : %d nodes up" % (loginbase, count_up)
- else:
- print "SITE: %20s : %d nodes up" % (loginbase, count_up)
- # There may be a second penalty regardless of which stage it's in.
- # TODO: check how long this has occurred.
-
- email_args = {}
- email_args['hostname_list'] = ""
- for act_record in act_recordlist:
- email_args['hostname_list'] += act_record['msg_format']
- email_args['hostname'] = act_record['nodename']
- if 'ticket_id' in act_record:
- email_args['ticket_id'] = act_record['ticket_id']
-
- # Send email, perform node action
- # TODO: only send one email per site for a given problem...
- if len(act_recordlist) > 0:
- act_record = act_recordlist[0]
-
- # send message before squeezing, b/c
- if act_record['message'] != None:
- ticket_id = self.__emailSite(loginbase, act_record['email'],
- act_record['message'], email_args)
-
- # Add ticket_id to ALL nodenames
- for act_record in act_recordlist:
- nodename = act_record['nodename']
- # update node record with RT ticket_id
- self.act_all[nodename][0]['ticket_id'] = "%s" % ticket_id
- if config.mail: i_nodes_emailed += 1
-
- # TODO: perform the most severe action?
- if b_squeeze:
- act_key = act_record['action']
- self.actions[act_key](email_args)
- i_nodes_actedon += 1
-
- if config.policysavedb:
- print "Saving Databases... act_all, diagnose_out"
- soltesz.dbDump("act_all", self.act_all)
- # remove site record from diagnose_out, it's in act_all as done.
- del self.diagnose_db[loginbase]
- soltesz.dbDump("diagnose_out", self.diagnose_db)
-
- print "Hit enter to continue..."