- diag_record['log'] = "IMPR: %20s : %-40s == %20s %20s %s %s" % \
- (loginbase, nodename, diag_record['stage'],
- state, category, diag_record['ticket_id'])
- return diag_record
- else:
- return None
- else:
- # unknown
- pass
- elif "ALPHA" in category:
- pass
- elif "clock_drift" in category:
- pass
- elif "dns" in category:
- pass
- elif "filerw" in category:
- pass
- else:
- print "Unknown category!!!! %s" % category
- sys.exit(1)
-
- return diag_record
-
- def __diagnoseNode(self, loginbase, node_record):
- # TODO: change the format of the hostname in this
- # record to something more natural.
- nodename = node_record['nodename']
- category = node_record['category']
- prev_category = node_record['prev_category']
- state = node_record['state']
-
- val = cmpCategoryVal(category, prev_category)
- if val == -1:
- # current category is worse than previous, carry on
- pass
- elif val == 1:
- # current category is better than previous
- # TODO: too generous for now, but will be handled correctly
- # TODO: if stage is currently ticket_waitforever,
- if 'ticket_id' not in node_record:
- print "ignoring: ", node_record['nodename']
- return None
- else:
- if node_record['ticket_id'] == "" or \
- node_record['ticket_id'] == None:
- print "closing: ", node_record['nodename']
- node_record['action'] = ['close_rt']
- node_record['message'] = None
- node_record['stage'] = 'monitor-end-record'
- return node_record
- #return None
- else:
- node_record['stage'] = 'improvement'
- else:
- #values are equal, carry on.
- pass
-
- #### COMPARE category and prev_category
- # if not_equal
- # then assign a stage based on relative priorities
- # else equal
- # then check category for stats.
- diag_record = self.diagRecordByCategory(node_record)
- if diag_record == None:
- return None
-
- #### found_RT_ticket
- # TODO: need to record time found, and maybe add a stage for acting on it...
- if 'found_rt_ticket' in diag_record and \
- diag_record['found_rt_ticket'] is not None:
- if diag_record['stage'] is not 'improvement':
- diag_record['stage'] = 'ticket_waitforever'
-
- current_time = time.time()
- # take off four days, for the delay that database caused.
- # TODO: generalize delays at PLC, and prevent enforcement when there
- # have been no emails.
- # NOTE: 7*SPERDAY exists to offset the 'bad week'
- delta = current_time - diag_record['time'] - 7*SPERDAY
-
- message = diag_record['message']
- act_record = {}
- act_record.update(diag_record)
-
- #### DIAGNOSE STAGES
- #print "%s has stage %s" % (nodename, diag_record['stage'])
- if 'findbad' in diag_record['stage']:
- # The node is bad, and there's no previous record of it.
- act_record['email'] = TECH # addative emails
- act_record['action'] = ['noop']
- act_record['message'] = message[0]
- act_record['stage'] = 'stage_actinoneweek'
-
- elif 'improvement' in diag_record['stage']:
- # - backoff previous squeeze actions (slice suspend, nocreate)
- # TODO: add a backoff_squeeze section... Needs to runthrough
- act_record['action'] = ['close_rt']
- act_record['message'] = message[0]
- act_record['stage'] = 'monitor-end-record'
-
- elif 'actinoneweek' in diag_record['stage']:
- if delta >= 7 * SPERDAY:
- act_record['email'] = TECH | PI
- act_record['stage'] = 'stage_actintwoweeks'
- act_record['message'] = message[1]
- act_record['action'] = ['nocreate' ]
- elif delta >= 3* SPERDAY and not 'second-mail-at-oneweek' in act_record:
- act_record['email'] = TECH
- act_record['message'] = message[0]
- act_record['action'] = ['sendmailagain-waitforoneweekaction' ]
- act_record['second-mail-at-oneweek'] = True
- else:
- act_record['message'] = None
- act_record['action'] = ['waitforoneweekaction' ]
- return None # don't send if there's no action
-
- elif 'actintwoweeks' in diag_record['stage']:
- if delta >= 14 * SPERDAY:
- act_record['email'] = TECH | PI | USER
- act_record['stage'] = 'stage_waitforever'
- act_record['message'] = message[2]
- act_record['action'] = ['suspendslices']
- act_record['time'] = current_time # reset clock for waitforever
- elif delta >= 10* SPERDAY and not 'second-mail-at-twoweeks' in act_record:
- act_record['email'] = TECH | PI
- act_record['message'] = message[1]
- act_record['action'] = ['sendmailagain-waitfortwoweeksaction' ]
- act_record['second-mail-at-twoweeks'] = True
- else:
- act_record['message'] = None
- act_record['action'] = ['waitfortwoweeksaction']
- return None # don't send if there's no action
-
- elif 'ticket_waitforever' in diag_record['stage']:
- act_record['email'] = TECH
- if 'first-found' not in act_record:
- act_record['first-found'] = True
- act_record['log'] += " firstfound"
- act_record['action'] = ['ticket_waitforever']
- act_record['message'] = None
- act_record['time'] = current_time
- else:
- if delta >= 7*SPERDAY:
- act_record['action'] = ['ticket_waitforever']
- act_record['message'] = None
- act_record['time'] = current_time # reset clock
- else:
- act_record['action'] = ['ticket_waitforever']
- act_record['message'] = None
- return None
-
- elif 'waitforever' in diag_record['stage']:
- # more than 3 days since last action
- # TODO: send only on weekdays.
- # NOTE: expects that 'time' has been reset before entering waitforever stage
- if delta >= 3*SPERDAY:
- act_record['action'] = ['email-againwaitforever']
- act_record['message'] = message[2]
- act_record['time'] = current_time # reset clock
- else:
- act_record['action'] = ['waitforever']
- act_record['message'] = None
- return None # don't send if there's no action
-
- else:
- # There is no action to be taken, possibly b/c the stage has
- # already been performed, but diagnose picked it up again.
- # two cases,
- # 1. stage is unknown, or
- # 2. delta is not big enough to bump it to the next stage.
- # TODO: figure out which. for now assume 2.
- print "UNKNOWN!!? %s" % nodename
- act_record['action'] = ['unknown']
- act_record['message'] = message[0]
- print "Exiting..."
- sys.exit(1)
-
- print "%s" % act_record['log'],
- print "%15s" % act_record['action']
- return act_record
-
- def getMaxSlices(self, loginbase):
- # if sickdb has a loginbase, then it will have at least one node.
- site_stats = None
-
- for nodename in self.diagnose_in[loginbase].keys():
- if nodename in self.findbad['nodes']:
- site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
- break
-
- if site_stats == None:
- raise Exception, "loginbase with no nodes in findbad"
- else:
- return site_stats['max_slices']
-
- def getNumNodes(self, loginbase):
- # if sickdb has a loginbase, then it will have at least one node.
- site_stats = None
-
- for nodename in self.diagnose_in[loginbase].keys():
- if nodename in self.findbad['nodes']:
- site_stats = self.findbad['nodes'][nodename]['values']['plcsite']
- break
-
- if site_stats == None:
- raise Exception, "loginbase with no nodes in findbad"
- else:
- return site_stats['num_nodes']
-
- """
- Returns number of up nodes as the total number *NOT* in act_all with a
- stage other than 'steady-state' .
- """
- def getUpAtSite(self, loginbase, d_diag_site):
- # TODO: THIS DOESN"T WORK!!! it misses all the 'debug' state nodes
- # that aren't recorded yet.
-
- numnodes = self.getNumNodes(loginbase)
- # NOTE: assume nodes we have no record of are ok. (too conservative)
- # TODO: make the 'up' value more representative
- up = numnodes
- for nodename in d_diag_site[loginbase]['nodes'].keys():
-
- rec = d_diag_site[loginbase]['nodes'][nodename]
- if rec['stage'] != 'monitor-end-record':
- up -= 1
- else:
- pass # the node is assumed to be up.
-
- #if up != numnodes:
- # print "ERROR: %s total nodes up and down != %d" % (loginbase, numnodes)
-
- return up
-
-
class SiteAction:
    """Base class for an action that is run with a dict of arguments.

    run() first checks that every name in self.parameter_names is present
    in the argument dict and then delegates to _run(), which subclasses
    override to do the actual work.
    """

    def __init__(self, parameter_names=None):
        """Remember the argument names that run() must find in its args.

        parameter_names -- list of required keys; when omitted, a fresh
                           ['hostname', 'ticket_id'] list is used so that
                           instances never share (and cannot corrupt) one
                           default list object.
        """
        if parameter_names is None:
            parameter_names = ['hostname', 'ticket_id']
        self.parameter_names = parameter_names

    def checkParam(self, args):
        """Raise Exception for the first required parameter missing from args."""
        for param in self.parameter_names:
            if param not in args:
                raise Exception("Parameter %s not provided in args" % param)

    def run(self, args):
        """Validate args, then return the result of _run(args)."""
        self.checkParam(args)
        return self._run(args)

    def _run(self, args):
        """Hook for subclasses; the base implementation does nothing."""
        pass
-
class SuspendAction(SiteAction):
    """SiteAction whose work step hands args['hostname'] to plc.suspendSlices."""

    def _run(self, args):
        # The caller receives whatever plc.suspendSlices returns.
        hostname = args['hostname']
        return plc.suspendSlices(hostname)
-
class RemoveSliceCreation(SiteAction):
    """SiteAction whose work step hands args['hostname'] to plc.removeSliceCreation."""

    def _run(self, args):
        # The caller receives whatever plc.removeSliceCreation returns.
        hostname = args['hostname']
        return plc.removeSliceCreation(hostname)
-
class BackoffActions(SiteAction):
    """SiteAction that calls plc.enableSlices and plc.enableSliceCreation."""

    def _run(self, args):
        # Both calls receive args['hostname']; their results are ignored and
        # the action always reports True.
        hostname = args['hostname']
        plc.enableSlices(hostname)
        plc.enableSliceCreation(hostname)
        return True
-
-# TODO: create class for each action below,
-# allow for lists of actions to be performed...
-
-# Handler for the 'close_rt' action: closes the RT ticket named in args and
-# re-enables slices and slice creation for args['hostname'].
-#
-# args -- dict; 'ticket_id' is optional, 'hostname' is read by the plc calls.
-# Returns None.
-#
-# NOTE(review): the indentation of this hunk was lost, so it cannot be told
-# from here whether the two plc.enable* calls run only when a ticket id is
-# present or unconditionally -- confirm against the original file.
-def close_rt_backoff(args):
- # Only a ticket id that is present and neither "" nor None is closed.
- if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
- mailer.closeTicketViaRT(args['ticket_id'],
- "Ticket CLOSED automatically by SiteAssist.")
- plc.enableSlices(args['hostname'])
- plc.enableSliceCreation(args['hostname'])
- return
-
-class Action(Thread):
- def __init__(self, l_action):
- self.l_action = l_action
-
- # the hostname to loginbase mapping
- self.plcdb_hn2lb = soltesz.dbLoad("plcdb_hn2lb")
-
- # Actions to take.
- self.diagnose_db = soltesz.if_cached_else(1, "diagnose_out", lambda : {})
- # Actions taken.
- self.act_all = soltesz.if_cached_else(1, "act_all", lambda : {})
-
- # A dict of actions to specific functions. PICKLE doesnt' like lambdas.
- self.actions = {}
- self.actions['suspendslices'] = lambda args: plc.suspendSlices(args['hostname'])
- self.actions['nocreate'] = lambda args: plc.removeSliceCreation(args['hostname'])
- self.actions['close_rt'] = lambda args: close_rt_backoff(args)
- self.actions['rins'] = lambda args: plc.nodeBootState(args['hostname'], "rins")
- self.actions['noop'] = lambda args: args
- self.actions['ticket_waitforever'] = lambda args: args
- self.actions['waitforever'] = lambda args: args
- self.actions['unknown'] = lambda args: args
- self.actions['waitforoneweekaction'] = lambda args: args
- self.actions['waitfortwoweeksaction'] = lambda args: args
- self.actions['sendmailagain-waitforoneweekaction'] = lambda args: args
- self.actions['sendmailagain-waitfortwoweeksaction'] = lambda args: args
- self.actions['email-againwaitforever'] = lambda args: args
- self.actions['email-againticket_waitforever'] = lambda args: args