X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=policy.py;fp=policy.py;h=4befbd94b3452fe6bed2c39bc76e29c4ab61abeb;hb=1aa6b24594dc485d7232649c50e9773956a1492f;hp=fcbbb94a2d46d2237848d783d08d2d1a2ba53eaa;hpb=924b7e5c530ecf25d4c5b002fa89ff73ef11f53c;p=monitor.git diff --git a/policy.py b/policy.py index fcbbb94..4befbd9 100755 --- a/policy.py +++ b/policy.py @@ -18,229 +18,31 @@ import traceback import sys from optparse import OptionParser -import bootman # debug nodes - -from monitor import util -from monitor import const -from monitor import reboot from monitor import config -from monitor import database from monitor import parser as parsermodule from monitor.common import * from monitor.model import * from monitor.wrapper import plc from monitor.wrapper import plccache -from monitor.wrapper.emailTxt import mailtxt from monitor.database.info.model import * +from monitor.database.info.interface import * from nodequery import verify,query_to_dict,node_select api = plc.getAuthAPI() - -class SiteInterface(HistorySiteRecord): - @classmethod - def get_or_make(cls, if_new_set={}, **kwargs): - if 'hostname' in kwargs: - kwargs['loginbase'] = plccache.plcdb_hn2lb[kwargs['hostname']] - del kwargs['hostname'] - res = HistorySiteRecord.findby_or_create(if_new_set, **kwargs) - return SiteInterface(res) - - def __init__(self, sitehist): - self.db = sitehist - - def getRecentActions(self, **kwargs): - # TODO: make query only return records within a certin time range, - # i.e. greater than 0.5 days ago. or 5 days, etc. - - #print "kwargs: ", kwargs - - recent_actions = [] - if 'loginbase' in kwargs: - recent_actions = ActionRecord.query.filter_by(loginbase=kwargs['loginbase']).order_by(ActionRecord.date_created.desc()) - elif 'hostname' in kwargs: - recent_actions = ActionRecord.query.filter_by(hostname=kwargs['hostname']).order_by(ActionRecord.date_created.desc()) - return recent_actions - - def increasePenalty(self): - #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='increase_penalty',) - self.db.penalty_level += 1 - # NOTE: this is to prevent overflow or index errors in applyPenalty. - # there's probably a better approach to this. - if self.db.penalty_level >= 2: - self.db.penalty_level = 2 - self.db.penalty_applied = True - - def applyPenalty(self): - penalty_map = [] - penalty_map.append( { 'name': 'noop', 'enable' : lambda site: None, - 'disable' : lambda site: None } ) - penalty_map.append( { 'name': 'nocreate', 'enable' : lambda site: plc.removeSiteSliceCreation(site), - 'disable' : lambda site: plc.enableSiteSliceCreation(site) } ) - penalty_map.append( { 'name': 'suspendslices', 'enable' : lambda site: plc.suspendSiteSlices(site), - 'disable' : lambda site: plc.enableSiteSlices(site) } ) - - for i in range(len(penalty_map)-1,self.db.penalty_level,-1): - print "\tdisabling %s on %s" % (penalty_map[i]['name'], self.db.loginbase) - penalty_map[i]['disable'](self.db.loginbase) - - for i in range(0,self.db.penalty_level+1): - print "\tapplying %s on %s" % (penalty_map[i]['name'], self.db.loginbase) - penalty_map[i]['enable'](self.db.loginbase) - - return - - def pausePenalty(self): - act = ActionRecord(loginbase=self.db.loginbase, - action='penalty', - action_type='pause_penalty',) - - def clearPenalty(self): - #act = ActionRecord(loginbase=self.db.loginbase, action='penalty', action_type='clear_penalty',) - self.db.penalty_level = 0 - self.db.penalty_applied = False - - def getTicketStatus(self): - if self.db.message_id != 0: - rtstatus = mailer.getTicketStatus(self.db.message_id) - self.db.message_status = rtstatus['Status'] - self.db.message_queue = rtstatus['Queue'] - self.db.message_created = datetime.fromtimestamp(rtstatus['Created']) - - def setTicketStatus(self, status): - print 'SETTING status %s' % status - if self.db.message_id != 0: - rtstatus = mailer.setTicketStatus(self.db.message_id, status) - - def getContacts(self): - contacts = [] - if self.db.penalty_level >= 0: - contacts += plc.getTechEmails(self.db.loginbase) - - if self.db.penalty_level >= 1: - contacts += plc.getPIEmails(self.db.loginbase) - - if self.db.penalty_level >= 2: - contacts += plc.getSliceUserEmails(self.db.loginbase) - - return contacts - - def sendMessage(self, type, **kwargs): - - # NOTE: evidently changing an RT message's subject opens the ticket. - # the logic in this policy depends up a ticket only being 'open' - # if a user has replied to it. - # So, to preserve these semantics, we check the status before - # sending, then after sending, reset the status to the - # previous status. - # There is a very tiny race here, where a user sends a reply - # within the time it takes to check, send, and reset. - # This sucks. It's almost certainly fragile. - - # - # TODO: catch any errors here, and add an ActionRecord that contains - # those errors. - - args = {'loginbase' : self.db.loginbase, 'penalty_level' : self.db.penalty_level} - args.update(kwargs) - - hostname = None - if 'hostname' in args: - hostname = args['hostname'] - - if hasattr(mailtxt, type): - - message = getattr(mailtxt, type) - viart = True - if 'viart' in kwargs: - viart = kwargs['viart'] - - if viart: - self.getTicketStatus() # get current message status - - m = Message(message[0] % args, message[1] % args, viart, self.db.message_id) - - contacts = self.getContacts() - contacts = [config.cc_email] # TODO: remove after testing... - - print "sending message: %s to site %s for host %s" % (type, self.db.loginbase, hostname) - - ret = m.send(contacts) - if viart: - self.db.message_id = ret - # reset to previous status, since a new subject 'opens' RT tickets. - self.setTicketStatus(self.db.message_status) - - # NOTE: only make a record of it if it's in RT. - act = ActionRecord(loginbase=self.db.loginbase, hostname=hostname, action='notice', - action_type=type, message_id=self.db.message_id) - - else: - print "+-- WARNING! ------------------------------" - print "| No such message name in emailTxt.mailtxt: %s" % type - print "+------------------------------------------" - - return - - def closeTicket(self): - # TODO: close the rt ticket before overwriting the message_id - mailer.closeTicketViaRT(self.db.message_id, "Ticket Closed by Monitor") - act = ActionRecord(loginbase=self.db.loginbase, action='notice', - action_type='end_notice', message_id=self.db.message_id) - self.db.message_id = 0 - self.db.message_status = "new" - - def runBootManager(self, hostname): - print "attempting BM reboot of %s" % hostname - ret = "" - try: - ret = bootman.restore(self, hostname) - err = "" - except: - err = traceback.format_exc() - print err - - act = ActionRecord(loginbase=self.db.loginbase, - hostname=hostname, - action='reboot', - action_type='bootmanager_restore', - error_string=err) - return ret - - def attemptReboot(self, hostname): - print "attempting PCU reboot of %s" % hostname - err = "" - try: - ret = reboot.reboot_str(hostname) - except Exception, e: - err = traceback.format_exc() - ret = str(e) - - if ret == 0 or ret == "0": - ret = "" - - act = ActionRecord(loginbase=self.db.loginbase, - hostname=hostname, - action='reboot', - action_type='first_try_reboot', - error_string=err) - def logic(): plc.nodeBootState(host, 'rins') node_end_record(host) - - - def main(hostnames, sitenames): # commands: i = 1 node_count = 1 site_count = 1 #print "hosts: %s" % hostnames - for host in hostnames: + for i,host in enumerate(hostnames): try: lb = plccache.plcdb_hn2lb[host] except: @@ -259,13 +61,13 @@ def main(hostnames, sitenames): nodehist = HistoryNodeRecord.findby_or_create(hostname=host) - print "%s %s" % ( nodehist.hostname, nodehist.status) + print "%s %s %s" % (i, nodehist.hostname, nodehist.status) if nodehist.status == 'good' and \ changed_lessthan(nodehist.last_changed, 1.0) and \ not found_within(recent_actions, 'online_notice', 0.5): # NOTE: there is a narrow window in which this command must be # evaluated, otherwise the notice will not go out. this is not ideal. - sitehist.sendMessage('online_notice', hostname=host) + sitehist.sendMessage('online_notice', hostname=host, viart=False) print "send message for host %s online" % host pass @@ -314,15 +116,19 @@ def main(hostnames, sitenames): node_count = node_count + 1 session.flush() - for site in sitenames: + for i,site in enumerate(sitenames): sitehist = SiteInterface.get_or_make(loginbase=site) + siteblack = BlacklistRecord.get_by(loginbase=site) + + if siteblack and not siteblack.expired(): + print "skipping %s due to blacklist. will expire %s" % (site, siteblack.willExpire() ) + continue + # TODO: make query only return records within a certin time range, # i.e. greater than 0.5 days ago. or 5 days, etc. recent_actions = sitehist.getRecentActions(loginbase=site) - #sitehist.sendMessage('test_notice', host) - - print "%s %s" % ( sitehist.db.loginbase , sitehist.db.status) + print "%s %s %s" % (i, sitehist.db.loginbase, sitehist.db.status) if sitehist.db.status == 'down': if not found_within(recent_actions, 'pause_penalty', 30) and \ not found_within(recent_actions, 'increase_penalty', 7) and \ @@ -375,8 +181,7 @@ if __name__ == "__main__": force=False, nosetup=False, verbose=False, - quiet=False, - ) + quiet=False,) parser.add_option("", "--stopselect", dest="stopselect", metavar="", help="The select string that must evaluate to true for the node to be considered 'done'") @@ -401,12 +206,6 @@ if __name__ == "__main__": parser = parsermodule.getParser(['defaults'], parser) config = parsermodule.parse_args(parser) -# # COLLECT nodegroups, nodes and node lists -# if config.nodegroup: -# ng = api.GetNodeGroups({'name' : config.nodegroup}) -# nodelist = api.GetNodes(ng[0]['node_ids']) -# hostnames = [ n['hostname'] for n in nodelist ] - fbquery = HistoryNodeRecord.query.all() hostnames = [ n.hostname for n in fbquery ] @@ -416,8 +215,7 @@ if __name__ == "__main__": if config.site: # TODO: replace with calls to local db. the api fails so often that # these calls should be regarded as unreliable. - site = api.GetSites(config.site) - l_nodes = api.GetNodes(site[0]['node_ids'], ['hostname']) + l_nodes = plccache.GetNodesBySite(config.site) filter_hostnames = [ n['hostname'] for n in l_nodes ] hostnames = filter(lambda x: x in filter_hostnames, hostnames)