- # If no luck with tech, email PI
- if (delta >= 1):
- target.append(PIEMAIL % loginbase)
-
- # If more than PI thresh, but less than slicethresh
- if (delta >= PITHRESH) and (delta < SLICETHRESH):
- #remove slice creation if enough nodes arent up
- if not self.enoughUp(loginbase):
- slices = plc.slices(loginbase)
- if len(slices) >= 1:
- for slice in slices:
- target.append(SLICEMAIL % slice)
- logger.info("POLICY: Removing slice creation from %s" % loginbase)
- tmp = emailTxt.mailtxt.removedSliceCreation
- sbj = tmp[0]
- msg = tmp[1] % {'loginbase': loginbase}
- plc.removeSliceCreation(node)
- mailer.email(sbj, msg, target)
- self.squeezed[loginbase] = (time.time(), "creation")
- self.emailed[node] = ("creation", time.time())
- return
-
- # If more than PI thresh and slicethresh
- if (delta >= PITHRESH) and (delta > SLICETHRESH):
- target.append(PIEMAIL % loginbase)
- # Email slices at site.
- slices = plc.slices(loginbase)
- if len(slices) >= 1:
- for slice in slices:
- target.append(SLICEMAIL % slice)
- # If not enough up, freeze slices and email everyone.
- if not self.enoughUp(loginbase):
- logger.info("POLICY: Suspending %s slices." % loginbase)
- tmp = emailTxt.mailtxt.suspendSlices
- sbj = tmp[0]
- msg = tmp[1] % {'loginbase': loginbase}
- plc.suspendSlices(node)
- self.squeezed[loginbase] = (time.time(), "freeze")
- mailer.email(sbj, msg, target)
- self.emailed[node] = ("freeze", time.time())
- return
-
- # Find the bucket the node is in and send appropriate email
- # to approriate list of people.
- for bkt in self.cmn.comonbkts.keys():
- if (node in getattr(self.cmn, bkt)):
- # Send predefined message for that bucket.
- logger.info("POLICY: Emailing (%s) %s - %s"\
- %(bkt, node, target))
- tmp = getattr(emailTxt.mailtxt, bkt)
- sbj = tmp[0] % {'hostname': node}
- msg = tmp[1] % {'hostname': node}
- mailer.email(sbj, msg, target)
- self.emailed[node] = (bkt , time.time())
- return
-
-
- '''
- Prints, logs, and emails status of up nodes, down nodes, and buckets.
- '''
- def status(self):
- sub = "Monitor Summary"
- msg = "\nThe following nodes were acted upon: \n\n"
- for (node, (type, date)) in self.emailed.items():
- # Print only things acted on today.
- if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
- msg +="%s\t(%s)\t%s\n" %(node, type, time.ctime(date))
- msg +="\n\nThe following sites have been 'squeezed':\n\n"
- for (loginbase, (date, type)) in self.squeezed.items():
- # Print only things acted on today.
- if (time.gmtime(time.time())[2] == time.gmtime(date)[2]):
- msg +="%s\t(%s)\t%s\n" %(loginbase, type, time.ctime(date))
- mailer.email(sub, msg, [SUMTO])
- logger.info(msg)
- return
-
- '''
- Store/Load state of emails. When, where, what.
- '''
- def emailedStore(self, action):
+ elif 'improvement' in diag_record['stage']:
+ # - backoff previous squeeze actions (slice suspend, nocreate)
+ # TODO: add a backoff_squeeze section... Needs to runthrough
+ act_record['action'] = ['close_rt']
+ act_record['message'] = message[0]
+ act_record['stage'] = 'monitor-end-record'
+
+ elif 'actinoneweek' in diag_record['stage']:
+ if delta >= 7 * SPERDAY:
+ act_record['email'] = TECH | PI
+ act_record['stage'] = 'stage_actintwoweeks'
+ act_record['message'] = message[1]
+ act_record['action'] = ['nocreate' ]
+ act_record['time'] = current_time # reset clock for waitforever
+ elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
+ act_record['email'] = TECH
+ act_record['message'] = message[0]
+ act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
+ act_record['second-mail-at-oneweek'] = True
+ else:
+ act_record['message'] = None
+ act_record['action'] = ['waitforoneweekaction' ]
+ print "ignoring this record for: %s" % act_record['nodename']
+ return None # don't send if there's no action
+
+ elif 'actintwoweeks' in diag_record['stage']:
+ if delta >= 7 * SPERDAY:
+ act_record['email'] = TECH | PI | USER
+ act_record['stage'] = 'stage_waitforever'
+ act_record['message'] = message[2]
+ act_record['action'] = ['suspendslices']
+ act_record['time'] = current_time # reset clock for waitforever
+ elif delta >= 3* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
+ act_record['email'] = TECH | PI
+ act_record['message'] = message[1]
+ act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
+ act_record['second-mail-at-twoweeks'] = True
+ else:
+ act_record['message'] = None
+ act_record['action'] = ['waitfortwoweeksaction']
+ return None # don't send if there's no action
+
+ elif 'ticket_waitforever' in diag_record['stage']:
+ act_record['email'] = TECH
+ if 'first-found' not in act_record:
+ act_record['first-found'] = True
+ act_record['log'] += " firstfound"
+ act_record['action'] = ['ticket_waitforever']
+ act_record['message'] = None
+ act_record['time'] = current_time
+ else:
+ if delta >= 7*SPERDAY:
+ act_record['action'] = ['ticket_waitforever']
+ act_record['message'] = None
+ act_record['time'] = current_time # reset clock
+ else:
+ act_record['action'] = ['ticket_waitforever']
+ act_record['message'] = None
+ return None
+
+ elif 'waitforever' in diag_record['stage']:
+ # more than 3 days since last action
+ # TODO: send only on weekdays.
+ # NOTE: expects that 'time' has been reset before entering waitforever stage
+ if delta >= 3*SPERDAY:
+ act_record['action'] = ['email-againwaitforever']
+ act_record['message'] = message[2]
+ act_record['time'] = current_time # reset clock
+ else:
+ act_record['action'] = ['waitforever']
+ act_record['message'] = None
+ return None # don't send if there's no action
+
+ else:
+ # There is no action to be taken, possibly b/c the stage has
+ # already been performed, but diagnose picked it up again.
+ # two cases,
+ # 1. stage is unknown, or
+ # 2. delta is not big enough to bump it to the next stage.
+ # TODO: figure out which. for now assume 2.
+ print "UNKNOWN stage for %s; nothing done" % nodename
+ act_record['action'] = ['unknown']
+ act_record['message'] = message[0]
+ #print "Exiting..."
+ return None
+ #sys.exit(1)
+
+ print "%s" % act_record['log'],
+ print "%15s" % act_record['action']
+ return act_record
+
+ def getMaxSlices(self, loginbase):
+     """
+     Return the 'max_slices' entry of the site record for loginbase.
+
+     The site record is the 'plcsite' value of the first node listed in
+     self.diagnose_in[loginbase] that is also present in
+     self.findbad['nodes'].
+
+     Raises Exception when no node of the site is found in findbad.
+     """
+     # if sickdb has a loginbase, then it will have at least one node.
+     site_stats = None
+
+     for nodename in self.diagnose_in[loginbase]:
+         if nodename in self.findbad['nodes']:
+             site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
+             break
+
+     if site_stats is None:
+         # Call form of raise: valid on every Python 2 release and on Python 3.
+         raise Exception("loginbase with no nodes in findbad")
+
+     return site_stats['max_slices']
+
+ def getNumNodes(self, loginbase):
+     """
+     Return the 'num_nodes' entry of the site record for loginbase.
+
+     The site record is the 'plcsite' value of the first node listed in
+     self.diagnose_in[loginbase] that is also present in
+     self.findbad['nodes'].
+
+     Raises Exception when no node of the site is found in findbad.
+     """
+     # if sickdb has a loginbase, then it will have at least one node.
+     site_stats = None
+
+     for nodename in self.diagnose_in[loginbase]:
+         if nodename in self.findbad['nodes']:
+             site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
+             break
+
+     if site_stats is None:
+         # Call form of raise: valid on every Python 2 release and on Python 3.
+         raise Exception("loginbase with no nodes in findbad")
+
+     return site_stats['num_nodes']
+
+ """
+ Returns the number of up nodes at a site: the site's total node count
+ minus every node in d_diag_site whose stage is not 'monitor-end-record'.
+ """
+ def getUpAtSite(self, loginbase, d_diag_site):
+     """
+     Estimate how many nodes at the site are up.
+
+     Starts from the site's total node count (self.getNumNodes) and
+     subtracts one for every record in d_diag_site[loginbase]['nodes']
+     whose 'stage' is anything other than 'monitor-end-record'.
+     """
+     # TODO: THIS DOESN'T WORK!!! it misses all the 'debug' state nodes
+     #       that aren't recorded yet.
+     # NOTE: assume nodes we have no record of are ok. (too conservative)
+     # TODO: make the 'up' value more representative
+     total = self.getNumNodes(loginbase)
+
+     node_records = d_diag_site[loginbase]['nodes']
+     not_up = sum(1 for record in node_records.values()
+                  if record['stage'] != 'monitor-end-record')
+
+     return total - not_up
+
+
+ class SiteAction:
+     """
+     Base class for an action carried out for a site.
+
+     Callers invoke run(args); it verifies that every name in
+     self.parameter_names is present in args and then delegates to _run(),
+     which subclasses override.
+     """
+     def __init__(self, parameter_names=None):
+         # Build the default list per instance: a list literal in the
+         # signature would be one shared object mutated by every instance.
+         if parameter_names is None:
+             parameter_names = ['hostname', 'ticket_id']
+         self.parameter_names = parameter_names
+
+     def checkParam(self, args):
+         """Raise Exception naming the first required parameter missing from args."""
+         for param in self.parameter_names:
+             if param not in args:
+                 raise Exception("Parameter %s not provided in args"%param)
+
+     def run(self, args):
+         """Validate args, then return whatever _run(args) returns."""
+         self.checkParam(args)
+         return self._run(args)
+
+     def _run(self, args):
+         """Hook for subclasses; the base implementation does nothing."""
+         pass
+
+ class SuspendAction(SiteAction):
+     """Site action that calls plc.suspendSlices() with args['hostname']."""
+     def _run(self, args):
+         hostname = args['hostname']
+         return plc.suspendSlices(hostname)
+
+ class RemoveSliceCreation(SiteAction):
+     """Site action that calls plc.removeSliceCreation() with args['hostname']."""
+     def _run(self, args):
+         hostname = args['hostname']
+         return plc.removeSliceCreation(hostname)
+
+ class BackoffActions(SiteAction):
+     """
+     Site action that calls plc.enableSlices() and then
+     plc.enableSliceCreation(), each with args['hostname'].
+     Always returns True.
+     """
+     def _run(self, args):
+         # Slices are enabled first, slice creation second.
+         plc.enableSlices(args['hostname'])
+         plc.enableSliceCreation(args['hostname'])
+         return True
+
+# TODO: create class for each action below,
+# allow for lists of actions to be performed...
+
+ def close_rt_backoff(args):
+     """
+     Close the ticket, if any, then re-enable the site.
+
+     When args carries a 'ticket_id' that is neither "" nor None, that
+     ticket is closed through mailer.closeTicketViaRT().  Afterwards
+     plc.enableSlices() and plc.enableSliceCreation() are called with
+     args['hostname'], whether or not a ticket was closed.
+     """
+     if 'ticket_id' in args and args['ticket_id'] not in ("", None):
+         mailer.closeTicketViaRT(args['ticket_id'],
+                                 "Ticket CLOSED automatically by SiteAssist.")
+
+     plc.enableSlices(args['hostname'])
+     plc.enableSliceCreation(args['hostname'])
+
+ def reboot_node(args):
+     """
+     Call reboot.reboot_new() for args['hostname'], passing True and
+     config.debug as the remaining arguments, and return its result.
+     """
+     return reboot.reboot_new(args['hostname'], True, config.debug)
+
+ def reset_nodemanager(args):
+     """
+     Run '/sbin/service nm restart' on args['hostname'] over ssh as root.
+
+     The command's exit status is not inspected; the function returns None.
+     """
+     # The target host comes from args, as in the other action helpers;
+     # no 'nodename' exists in this function's scope.
+     os.system("ssh root@%s /sbin/service nm restart" % args['hostname'])
+     return
+
+class Action(Thread):
+ def __init__(self, l_action):
+     """
+     Prepare the action thread.
+
+     l_action is stored as self.l_action; accumSites() iterates it as a
+     collection of node hostnames.  The databases are loaded through
+     soltesz and the action-name -> callable table is built here.
+     """
+     self.l_action = l_action
+
+     # the hostname to loginbase mapping
+     self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
+
+     # Actions to take.
+     self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
+     # Actions taken.
+     self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
+
+     # A dict of actions to specific functions. PICKLE doesn't like lambdas.
+     self.actions = {
+         'suspendslices': lambda args: plc.suspendSlices(args['hostname']),
+         'nocreate': lambda args: plc.removeSliceCreation(args['hostname']),
+         'close_rt': lambda args: close_rt_backoff(args),
+         'rins': lambda args: plc.nodeBootState(args['hostname'], "rins"),
+         'reboot_node': lambda args: reboot_node(args),
+     }
+
+     # Actions with nothing to execute: args are handed back unchanged.
+     # 'reset_nodemanager' is one of them; its real call is disabled.
+     passthrough_names = (
+         'noop',
+         'reset_nodemanager',
+         'ticket_waitforever',
+         'waitforever',
+         'unknown',
+         'waitforoneweekaction',
+         'waitfortwoweeksaction',
+         'sendmailagain-waitforoneweekaction',
+         'sendmailagain-waitfortwoweeksaction',
+         'email-againwaitforever',
+         'email-againticket_waitforever',
+     )
+     for action_name in passthrough_names:
+         self.actions[action_name] = lambda args: args
+
+     self.sickdb = {}
+     Thread.__init__(self)
+
+ def run(self):
+     """
+     Thread entry point.
+
+     Accumulates the sick sites, runs analyseSites() and prints its
+     statistics.  When config.policysavedb is set, act_all is dumped
+     through soltesz.dbDump() on both the success and the failure path.
+     If analyseSites() raises, the traceback and error are printed and
+     sys.exit(1) is called.
+     """
+     self.accumSites()
+     print "Accumulated %d sick sites" % len(self.sickdb.keys())
+     logger.debug("Accumulated %d sick sites" % len(self.sickdb.keys()))
+
+     try:
+         stats = self.analyseSites()
+     except Exception, err:
+         print "----------------"
+         import traceback
+         # print_exc() writes the traceback itself and returns None;
+         # wrapping it in a print statement emitted a stray "None".
+         traceback.print_exc()
+         print err
+         if config.policysavedb:
+             print "Saving Databases... act_all"
+             soltesz.dbDump("act_all", self.act_all)
+         sys.exit(1)
+
+     for stat_name in ("sites_observed", "sites_diagnosed", "nodes_diagnosed",
+                       "sites_emailed", "nodes_actedon"):
+         print_stats(stat_name, stats)
+     print ",".join(stats['allsites'])
+
+     if config.policysavedb:
+         print "Saving Databases... act_all"
+         #soltesz.dbDump("policy.eventlog", self.eventlog)
+         # TODO: remove 'diagnose_out',
+         #       or at least the entries that were acted on.
+         soltesz.dbDump("act_all", self.act_all)
+
+ def accumSites(self):
+     """
+     Take all nodes from l_action, look them up in the diagnose_db
+     database, and insert them into sickdb[] as:
+
+         sickdb[loginbase]['nodes'][nodename] = diag_record
+
+     The site's 'config' entry from diagnose_db is copied alongside.
+     This way only the given l_action nodes will be acted on regardless
+     of how many from diagnose_db are available.  Nodes without a
+     diagnose_db record are skipped; an l_action of None is treated as
+     an empty list.
+     """
+     if self.l_action is None:
+         # Nothing was requested, so there is nothing to accumulate.
+         return
+
+     for nodename in self.l_action:
+         loginbase = self.plcdb_hn2lb[nodename]
+
+         if loginbase not in self.diagnose_db or \
+                 nodename not in self.diagnose_db[loginbase]['nodes']:
+             #print "%s not in diagnose_db!!" % loginbase
+             continue
+
+         diag_record = self.diagnose_db[loginbase]['nodes'][nodename]
+
+         if loginbase not in self.sickdb:
+             self.sickdb[loginbase] = {'nodes' : {}}
+
+         # NOTE: don't copy all node records, since not all will be in l_action
+         self.sickdb[loginbase]['nodes'][nodename] = diag_record
+         # NOTE: but, we want to get the loginbase config settings,
+         #       this is the easiest way.
+         self.sickdb[loginbase]['config'] = self.diagnose_db[loginbase]['config']
+     return
+
+ def __emailSite(self, loginbase, roles, message, args):
+ """
+ loginbase is the unique site abbreviation, prepended to slice names.
+ roles contains TECH, PI, USER roles, and derive email aliases.
+ record contains {'message': [<subj>,<body>], 'args': {...}}
+ """
+ ticket_id = 0
+ args.update({'loginbase':loginbase})
+
+ if not config.mail and not config.debug and config.bcc:
+ roles = ADMIN
+ if config.mail and config.debug:
+ roles = ADMIN
+
+ # build targets
+ contacts = []
+ if ADMIN & roles:
+ contacts += [config.email]
+ if TECH & roles:
+ contacts += [TECHEMAIL % loginbase]
+ if PI & roles:
+ contacts += [PIEMAIL % loginbase]
+ if USER & roles:
+ slices = plc.slices(loginbase)
+ if len(slices) >= 1:
+ for slice in slices:
+ contacts += [SLICEMAIL % slice]
+ print "SLIC: %20s : %d slices" % (loginbase, len(slices))
+ else:
+ print "SLIC: %20s : 0 slices" % loginbase
+