X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=policy.py;h=7ce85db590acbb6f9db7eb2d7df113d7fa4cd169;hb=386a31a829887a28f5a026ea6863ccbb0e6e1d08;hp=3d226f4ba77ef3e0951c5b202c6722c930b7009f;hpb=66c4742c05622d6c53368e2890670eaefa5345f3;p=monitor.git diff --git a/policy.py b/policy.py index 3d226f4..7ce85db 100755 --- a/policy.py +++ b/policy.py @@ -18,227 +18,42 @@ import traceback import sys from optparse import OptionParser -import bootman # debug nodes - -from monitor import util -from monitor import const -from monitor import reboot from monitor import config -from monitor import database from monitor import parser as parsermodule from monitor.common import * from monitor.model import * from monitor.wrapper import plc from monitor.wrapper import plccache -from monitor.wrapper.emailTxt import mailtxt from monitor.database.info.model import * +from monitor.database.info.interface import * from nodequery import verify,query_to_dict,node_select api = plc.getAuthAPI() - -class SiteInterface(HistorySiteRecord): - @classmethod - def get_or_make(cls, if_new_set={}, **kwargs): - if 'hostname' in kwargs: - kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']] - del kwargs['hostname'] - res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs) - return SiteInterface(res) - - def __init__(self, sitehist): - self.db = sitehist - - def getRecentActions(self, **kwargs): - # TODO: make query only return records within a certin time range, - # i.e. greater than 0.5 days ago. or 5 days, etc. - - #print "kwargs: ", kwargs - - recent_actions = [] - if 'loginbase' in kwargs: - recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc()) - elif 'hostname' in kwargs: - recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc()) - return recent_actions - - def increasePenalty(self): - #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',) - self.db.penalty_level += 1 - # NOTE: this is to prevent overflow or index errors in applyPenalty. - # there's probably a better approach to this. - if self.db.penalty_level >= 2: - self.db.penalty_level = 2 - self.db.penalty_applied = True - - def applyPenalty(self): - penalty_map = [] - penalty_map.append( { 'name': 'noop', 'enable' : lambda site: None, - 'disable' : lambda site: None } ) - penalty_map.append( { 'name': 'nocreate', 'enable' : lambda site: plc.removeSiteSliceCreation(site), - 'disable' : lambda site: plc.enableSiteSliceCreation(site) } ) - penalty_map.append( { 'name': 'suspendslices', 'enable' : lambda site: plc.suspendSiteSlices(site), - 'disable' : lambda site: plc.enableSiteSlices(site) } ) - - for i in range(len(penalty_map)-1,self.db.penalty_level,-1): - print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase) - penalty_map[i]['disable'](self.db.loginbase) - - for i in range(0,self.db.penalty_level+1): - print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase) - penalty_map[i]['enable'](self.db.loginbase) - - return - - def pausePenalty(self): - act = ActionRecord(loginbase=self.db.loginbase, - action='penalty', - action_type='pause_penalty',) - - def clearPenalty(self): - #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',) - self.db.penalty_level = 0 - self.db.penalty_applied = False - - def getTicketStatus(self): - if self.db.message_id != 0: - rtstatus = mailer.getTicketStatus(self.db.message_id) - self.db.message_status = rtstatus['Status'] - self.db.message_queue = rtstatus['Queue'] - self.db.message_created = datetime.fromtimestamp(rtstatus['Created']) - - def setTicketStatus(self, status): - print 'SETTING status %s' % status - if self.db.message_id != 0: - rtstatus = mailer.setTicketStatus(self.db.message_id, status) - - def getContacts(self): - contacts = [] - if self.db.penalty_level >= 0: - contacts += plc.getTechEmails(self.db.loginbase) - - if self.db.penalty_level >= 1: - contacts += plc.getPIEmails(self.db.loginbase) - - if self.db.penalty_level >= 2: - contacts += plc.getSliceUserEmails(self.db.loginbase) - - return contacts - - def sendMessage(self, type, **kwargs): - - # NOTE: evidently changing an RT message's subject opens the ticket. - # the logic in this policy depends up a ticket only being 'open' - # if a user has replied to it. - # So, to preserve these semantics, we check the status before - # sending, then after sending, reset the status to the - # previous status. - # There is a very tiny race here, where a user sends a reply - # within the time it takes to check, send, and reset. - # This sucks. It's almost certainly fragile. - - # - # TODO: catch any errors here, and add an ActionRecord that contains - # those errors. - - args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level} - args.update(kwargs) - - hostname = None - if 'hostname' in args: - hostname = args['hostname'] - - if hasattr(mailtxt, type): - - message = getattr(mailtxt, type) - viart = True - if 'viart' in kwargs: - viart = kwargs['viart'] - - if viart: - self.getTicketStatus() # get current message status - - m = Message(message[0] % args, message[1] % args, viart, self.db.message_id) - - contacts = self.getContacts() - contacts = [config.cc_email] # TODO: remove after testing... - - print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname) - - ret = m.send(contacts) - if viart: - self.db.message_id = ret - # reset to previous status, since a new subject 'opens' RT tickets. - self.setTicketStatus(self.db.message_status) - - # NOTE: only make a record of it if it's in RT. - act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', - action_type=type, message_id=self.db.message_id) - - else: - print "+-- WARNING! ------------------------------" - print "| No such message name in emailTxt.mailtxt: %s" % type - print "+------------------------------------------" - - return - - def closeTicket(self): - # TODO: close the rt ticket before overwriting the message_id - mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor") - act = ActionRecord(loginbase=self.db.loginbase, action='notice', - action_type='end_notice', message_id=self.db.message_id) - self.db.message_id = 0 - self.db.message_status = "new" - - def runBootManager(self, hostname): - print "attempting BM reboot of %s" % hostname - ret = "" - try: - ret = bootman.restore(self, hostname) - err = "" - except: - err = traceback.format_exc() - print err - - act = ActionRecord(loginbase=self.db.loginbase, - hostname=hostname, - action='reboot', - action_type='bootmanager_restore', - error_string=err) - return ret - - def attemptReboot(self, hostname): - print "attempting PCU reboot of %s" % hostname - ret = reboot.reboot_str(hostname) - if ret == 0 or ret == "0": - ret = "" - act = ActionRecord(loginbase=self.db.loginbase, - hostname=hostname, - action='reboot', - action_type='first_try_reboot', - error_string=ret) - def logic(): plc.nodeBootState(host, 'rins') node_end_record(host) - - - def main(hostnames, sitenames): - l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) # commands: i = 1 node_count = 1 site_count = 1 #print "hosts: %s" % hostnames - for host in hostnames: + for i,host in enumerate(hostnames): try: lb = plccache.plcdb_hn2lb[host] except: print "unknown host in plcdb_hn2lb %s" % host + email_exception(host) + continue + + nodeblack = BlacklistRecord.get_by(hostname=host) + + if nodeblack and not nodeblack.expired(): + print "skipping %s due to blacklist. will expire %s" % (host, nodeblack.willExpire() ) continue sitehist = SiteInterface.get_or_make(loginbase=lb) @@ -247,38 +62,51 @@ def main(hostnames, sitenames): nodehist = HistoryNodeRecord.findby_or_create(hostname=host) - print "%s %s" % ( nodehist.hostname, nodehist.status) + print "%s %s %s" % (i, nodehist.hostname, nodehist.status) if nodehist.status == 'good' and \ changed_lessthan(nodehist.last_changed, 1.0) and \ + found_within(recent_actions, 'down_notice', 7.0) and \ not found_within(recent_actions, 'online_notice', 0.5): + # NOTE: chronicly flapping nodes will not get 'online' notices + # since, they are never up long enough to be 'good'. + # NOTE: searching for down_notice proves that the node has + # gone through a 'down' state first, rather than just + # flapping through: good, offline, online, ... + # # NOTE: there is a narrow window in which this command must be - # evaluated, otherwise the notice will not go out. this is not ideal. - sitehist.sendMessage('online_notice', hostname=host) + # evaluated, otherwise the notice will not go out. + # this is not ideal. + sitehist.sendMessage('online_notice', hostname=host, viart=False) print "send message for host %s online" % host - pass - if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + # if a node is offline and doesn't have a PCU, remind the user that they should have one. + if not nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.0) and \ - not found_between(recent_actions, 'first_try_reboot', 3.5, 1): + not found_within(recent_actions, 'pcumissing_notice', 7.0): + + sitehist.sendMessage('pcumissing_notice', hostname=host) + print "send message for host %s pcumissing_notice" % host + + # if it is offline and HAS a PCU, then try to use it. + if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ + changed_greaterthan(nodehist.last_changed,1.0) and \ + not found_between(recent_actions, 'try_reboot', 3.5, 1): sitehist.attemptReboot(host) - print "send message for host %s first_try_reboot" % host - pass + print "send message for host %s try_reboot" % host - # NOTE: non-intuitive is that found_between(first_try_reboot, 3.5, 1) + # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1) # will be false for a day after the above condition is satisfied - if ( nodehist.status == 'offline' or nodehist.status == 'down' ) and \ + if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.5) and \ - found_between(recent_actions, 'first_try_reboot', 3.5, 1) and \ + found_between(recent_actions, 'try_reboot', 3.5, 1) and \ not found_within(recent_actions, 'pcufailed_notice', 3.5): - # found_within(recent_actions, 'first_try_reboot', 3.5) and \ # send pcu failure message #act = ActionRecord(**kwargs) sitehist.sendMessage('pcufailed_notice', hostname=host) print "send message for host %s PCU Failure" % host - pass if nodehist.status == 'monitordebug' and \ changed_greaterthan(nodehist.last_changed, 1) and \ @@ -296,20 +124,26 @@ def main(hostnames, sitenames): # send down node notice sitehist.sendMessage('down_notice', hostname=host) - print "send message for host %s offline" % host - pass + print "send message for host %s down" % host node_count = node_count + 1 + print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') + sys.stdout.flush() + session.flush() - for site in sitenames: + for i,site in enumerate(sitenames): sitehist = SiteInterface.get_or_make(loginbase=site) + siteblack = BlacklistRecord.get_by(loginbase=site) + + if siteblack and not siteblack.expired(): + print "skipping %s due to blacklist. will expire %s" % (site, siteblack.willExpire() ) + continue + # TODO: make query only return records within a certin time range, # i.e. greater than 0.5 days ago. or 5 days, etc. recent_actions = sitehist.getRecentActions(loginbase=site) - #sitehist.sendMessage('test_notice', host) - - print "%s %s" % ( sitehist.db.loginbase , sitehist.db.status) + print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status) if sitehist.db.status == 'down': if not found_within(recent_actions, 'pause_penalty', 30) and \ not found_within(recent_actions, 'increase_penalty', 7) and \ @@ -339,15 +173,19 @@ def main(hostnames, sitenames): # find all ticket ids for site ( could be on the site record? ) # determine if there are penalties within the last 30 days? # if so, add a 'pause_penalty' action. - if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and sitehist.db.penalty_level > 0: + if sitehist.db.message_id != 0 and sitehist.db.message_status == 'open' and \ + sitehist.db.penalty_level > 0 and not found_within(recent_actions, 'pause_penalty', 30): # pause escalation print "Pausing penalties for %s" % site sitehist.pausePenalty() site_count = site_count + 1 - session.flush() + print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') + sys.stdout.flush() + session.flush() + session.flush() return @@ -361,8 +199,7 @@ if __name__ == "__main__": force=False, nosetup=False, verbose=False, - quiet=False, - ) + quiet=False,) parser.add_option("", "--stopselect", dest="stopselect", metavar="", help="The select string that must evaluate to true for the node to be considered 'done'") @@ -387,22 +224,6 @@ if __name__ == "__main__": parser = parsermodule.getParser(['defaults'], parser) config = parsermodule.parse_args(parser) -# # COLLECT nodegroups, nodes and node lists -# if config.nodegroup: -# ng = api.GetNodeGroups({'name' : config.nodegroup}) -# nodelist = api.GetNodes(ng[0]['node_ids']) -# hostnames = [ n['hostname'] for n in nodelist ] - -# if config.node or config.nodelist: -# if config.node: hostnames = [ config.node ] -# else: hostnames = util.file.getListFromFile(config.nodelist) -# -# fbquery = FindbadNodeRecord.get_all_latest() -# fb_nodelist = [ n.hostname for n in fbquery ] - -# if config.nodeselect: -# hostnames = node_select(config.nodeselect, fb_nodelist) - fbquery = HistoryNodeRecord.query.all() hostnames = [ n.hostname for n in fbquery ] @@ -410,8 +231,9 @@ if __name__ == "__main__": sitenames = [ s.loginbase for s in fbquery ] if config.site: - site = api.GetSites(config.site) - l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) + # TODO: replace with calls to local db. the api fails so often that + # these calls should be regarded as unreliable. + l_nodes = plccache.GetNodesBySite(config.site) filter_hostnames = [ n['hostname'] for n in l_nodes ] hostnames = filter(lambda x: x in filter_hostnames, hostnames) @@ -423,10 +245,12 @@ if __name__ == "__main__": try: main(hostnames, sitenames) + session.flush() except KeyboardInterrupt: print "Killed by interrupt" + session.flush() sys.exit(0) except: #email_exception() print traceback.print_exc(); - print "Continuing..." + print "fail all..."