From 944d143a6528c4157b71f51ed480aec806cbaa06 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Fri, 15 Aug 2008 19:13:35 +0000 Subject: [PATCH] www/printbadnodes.py www/runlevels.py use the new site-package module to pull in configuration information and locate database files. config.py uses ConfigParser now. nodequery.py doesn't unconditionally load findbad db. sets up fbpcus for reboot module printbadcsv.py pull from monitor module for database and configuration findbad.py change default db name, and perform node_selects correctly with fb fetch.py debug statements to track parser.py error. mailmonitor.py simplify content. nodebad.py remove unnecessary db loads, and import from 'const' module. parser.py set nodeselect default to None rather than 'False'. expected changes to parse_args() based on new config.py pcubad.py import constants from correct module clean_policy.py don't import from www modules don't import from policy module unified_model.py remove unconditional db load add two definitions from nodecommon and model.py grouprins.py use normal config.py fix pcu identification logic. really need to have a better fix for this. monitor-server.spec add requirements, the monitor python module, nodecommon.py moved node_end_record to unified_model.py bootman.py load from 'const.py' additional boot sequence ids. policy.py removed various functions for clean policy. 
findbadpcu.py update use of parser module nodeinfo.py remove unconditional loads add node.last_changed output database.py conflate config and monitorconfig reboot.py remove unconditional load monitor_policy.py add funcs from policy.py --- bootman.py | 31 ++++--- clean_policy.py | 13 ++- config.py | 203 ++++++++++--------------------------------- database.py | 5 +- fetch.py | 7 +- findbad.py | 5 +- findbadpcu.py | 9 +- grouprins.py | 28 +++--- mailmonitor.py | 81 +++++++++-------- model.py | 12 --- monitor-server.spec | 15 +++- monitor/__init__.py | 1 + monitor_policy.py | 26 +++++- nodebad.py | 3 +- nodecommon.py | 31 +------ nodeinfo.py | 22 +++-- nodequery.py | 11 ++- parser.py | 5 +- pcubad.py | 2 +- policy.py | 21 +---- printbadcsv.py | 4 +- reboot.py | 14 +-- rt.py | 1 - sitebad.py | 2 +- threadpool.py | 1 - todo | 33 ++++--- unified_model.py | 51 +++++++++-- www/printbadnodes.py | 46 +++------- www/runlevels.py | 4 +- 29 files changed, 305 insertions(+), 382 deletions(-) diff --git a/bootman.py b/bootman.py index d34e6ef..a278afe 100755 --- a/bootman.py +++ b/bootman.py @@ -7,7 +7,7 @@ api = plc.getAuthAPI() import sys import os -import policy +import const from getsshkeys import SSHKnownHosts @@ -321,7 +321,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) print "\tDisabling %s due to out-of-date BOOTCD" % hostname api.UpdateNode(hostname, {'boot_state' : 'disable'}) @@ -453,7 +453,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.set_nodestate('disable') return False @@ 
-615,11 +615,14 @@ def reboot(hostname, config=None, forced_action=None): # update_node_config_email for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done", - "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done", + "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done", ]: sequences.update({n : "update_node_config_email"}) - for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]: + for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", + "bminit-cfg-update-exception-nodehostname-update-debug-done", + ]: sequences.update({n : "nodenetwork_email"}) # update_bootcd_email @@ -643,7 +646,11 @@ def reboot(hostname, config=None, forced_action=None): sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"}) # bad_dns_email - sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"}) + for n in [ + "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done", + ]: + sequences.update( { n : "bad_dns_email"}) flag_set = True @@ -708,7 +715,7 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args, True, db='nodeid_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.dump_plconf_file() conn.set_nodestate('disable') @@ -720,7 +727,7 @@ def reboot(hostname, config=None, forced_action=None): m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % 
args, True, db='nodenet_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.dump_plconf_file() conn.set_nodestate('disable') @@ -735,7 +742,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) print "\tDisabling %s due to out-of-date BOOTCD" % hostname conn.set_nodestate('disable') @@ -753,7 +760,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.baddisk[1] % args, True, db='hardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.set_nodestate('disable') elif sequences[s] == "update_hardware_email": @@ -765,7 +772,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.set_nodestate('disable') elif sequences[s] == "bad_dns_email": @@ -788,7 +795,7 @@ def reboot(hostname, config=None, forced_action=None): mailtxt.baddns[1] % args, True, db='baddns_persistmessages') loginbase = plc.siteId(hostname) - m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase]) + m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase]) conn.set_nodestate('disable') if flag_set: diff --git a/clean_policy.py b/clean_policy.py index f1249cf..d2bde41 100644 --- a/clean_policy.py +++ b/clean_policy.py @@ -2,12 +2,11 @@ import config import database import time import mailer -from 
www.printbadnodes import cmpCategoryVal +from unified_model import cmpCategoryVal import sys import emailTxt import string -from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node from rt import is_host_in_rt_tickets import plc @@ -21,6 +20,16 @@ from const import * from unified_model import * +def get_ticket_id(record): + if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None: + return record['ticket_id'] + elif 'found_rt_ticket' in record and \ + record['found_rt_ticket'] is not "" and \ + record['found_rt_ticket'] is not None: + return record['found_rt_ticket'] + else: + return None + class MonitorMergeDiagnoseSendEscellate: def __init__(self, hostname, act): self.hostname = hostname diff --git a/config.py b/config.py index 859d0aa..b37e04a 100644 --- a/config.py +++ b/config.py @@ -1,173 +1,58 @@ #!/usr/bin/python -import pickle + +# load defaults from /etc/monitor.conf +# home/.monitor.conf +# $PWD/.monitor.conf import os -import getopt -import sys -import __main__ -from optparse import OptionParser -from parser import parse_bool +import ConfigParser + +class Options(object): + def __init__(self): + cp = ConfigParser.ConfigParser() + cp.optionxform = str + # load defaults from global, home dir, then $PWD + cp.read(['/etc/monitor.conf', os.path.expanduser('~/.monitor.conf'), + '.monitor.conf', 'monitor.conf']) + self.cp = cp + self.section = "default" + def __getattr__(self, name): + if name in self.cp.sections(): + self.section = name + return self + else: + return self.cp.get(self.section, name) -debug=0 -mail=0 -bcc=0 -email="soltesz@cs.utk.edu" -run=False -checkopt=False -squeeze=0 -policysavedb=0 -config_command = False +import config +imported = False def updatemodule(module, cf): module.__dict__.update(cf.__dict__) -class config: - debug=0 - mail=0 - bcc=0 - email="soltesz@cs.utk.edu" - run=False - checkopt=False - squeeze=0 - policysavedb=0 - __file = ".config" - - def __init__(self, 
parser=None): - if os.path.exists(self.__file): # file exists, read that. - f = open(self.__file, 'r') - o = pickle.load(f) - self.__dict__.update(o) - f.close() - - if parser == None: - self.parser = OptionParser() +def update_section(options, section, bool=False): + # Place all default commandline values at the top level of this module + for key in options.cp.options(section): + if bool: + config.__dict__.update({key : options.cp.getboolean(section, key)}) else: - self.parser = parser - - self.parser.set_defaults(debug = self.debug, - mail = self.mail, - bcc = self.bcc, - email = self.email, - run = self.run, - checkopt = False, - squeeze = self.squeeze, - policysavedb = self.policysavedb) - - self.parser.add_option("", "--debug", dest="debug", - help="Enable debugging", - type="int", - metavar="[0|1]", - action="callback", - callback=parse_bool) - self.parser.add_option("", "--mail", dest="mail", - help="Enable sending email", - type="int", - metavar="[0|1]", - action="callback", - callback=parse_bool) - self.parser.add_option("", "--bcc", dest="bcc", - help="Include BCC to user", - type="int", - metavar="[0|1]", - action="callback", - callback=parse_bool) - self.parser.add_option("", "--squeeze", dest="squeeze", - help="Squeeze sites or not", - type="int", - metavar="[0|1]", - action="callback", - callback=parse_bool) - self.parser.add_option("", "--policysavedb", dest="policysavedb", - help="Save the policy event database after a run", - type="int", - metavar="[0|1]", - action="callback", - callback=parse_bool) - self.parser.add_option("", "--checkopt", dest="checkopt", - action="store_true", - help="print current options") - self.parser.add_option("", "--run", dest="run", - action="store_true", - help="Perform monitor or print configs") - self.parser.add_option("", "--email", dest="email", - help="Specify an email address to use for mail when "+\ - "debug is enabled or for bcc when it is not") - - # config_command is needed to keep subsequent loads of 
config() from - # trying to parse the arguments that have already been parsed by - # the new main(). - if parser == None and config_command: - print "calling parse_args" - self.parse_args() - - def parse_args(self): - #print "self: %s" % self - #import traceback - #print traceback.print_stack() - #print "Ccalling parse_args" - (options, args) = self.parser.parse_args() - #for o in options.__dict__: - # print "optin: %s == %s" % (o, options.__dict__[o]) - self.__dict__.update(options.__dict__) - self.__dict__['args'] = args - self.save(options) - if options.checkopt: - self.usage() - # print "\nAdd --run to actually perform the command" - sys.exit(1) - - def getListFromFile(self, file): - f = open(file, 'r') - list = [] - for line in f: - line = line.strip() - list += [line] - return list - - def print_values(self): - exclude = ['parser'] - for key in self.__dict__.keys(): - if key not in exclude: - print "%20s == %s" % (key, self.__dict__[key]) - - def save(self, options=None): - f = open(self.__file, 'w') - if options == None: - o = {'debug': self.debug, - 'mail': self.mail, - 'bcc': self.bcc, - 'email':self.email, - 'squeeze':self.squeeze, - 'policysavedb':self.policysavedb} - else: - o = options.__dict__ - - pickle.dump(o, f) - f.close() - - def usage(self): - self.print_values() - self.parser.print_help() - + config.__dict__.update({key : options.cp.get(section, key)}) -def main(): - """ Start threads, do some housekeeping, then daemonize. """ - # Defaults - global config_command - config_command = True - config = __main__.config() +def update(parseoptions): + update_commandline() + # now update the top-level module with all other args passed in here. 
+ for key in parseoptions.__dict__.keys(): + config.__dict__.update({key: parseoptions.__dict__[key]}) - try: - print "acalling parse_args" - config.parse_args() - - except Exception, err: - print "Error: %s " % err - config.usage() - sys.exit(1) +if not config.imported: + imported = True - config.usage() + #from config import options as config + options = Options() + update_section(options, 'commandline', True) + update_section(options, 'monitorconfig') +#for i in dir(config): +# if "__" not in i: +# print i, "==", config.__dict__[i] +#print "======================================" -if __name__ == '__main__': - main() diff --git a/database.py b/database.py index 3c657fe..3b5bd65 100644 --- a/database.py +++ b/database.py @@ -12,7 +12,7 @@ except: import inspect import shutil import config -import monitorconfig +import config as monitorconfig DEBUG= 0 PICKLE_PATH=monitorconfig.MONITOR_DATA_ROOT @@ -111,7 +111,10 @@ class SPickle: raise Exception, "No such file %s" % name + #import traceback + #print traceback.print_stack() #print "loading %s" % self.__file(name, type) + #sys.stderr.write("-----------------------------\n") f = open(self.__file(name, type), 'r') if type == None: o = pickle.load(f) diff --git a/fetch.py b/fetch.py index 7d93967..91b5715 100755 --- a/fetch.py +++ b/fetch.py @@ -47,8 +47,11 @@ if __name__ == "__main__": if not os.path.exists(outdir): os.system('mkdir -p %s' % outdir) - if config.site is not None or config.nodeselect is not None or config.nodegroup is not None: + if config.site is not None or \ + config.nodeselect is not None or \ + config.nodegroup is not None: print "TODO: implement support for nodeselect and site queries." 
+ print "%s %s %s" % (config.site, config.nodeselect, config.nodegroup) sys.exit(1) if config.nodelist == None and config.node == None: @@ -71,4 +74,6 @@ if __name__ == "__main__": f = open(config.cmdfile,'r') cmd = f.read() + print filelist + vx_start(filelist, outdir, cmd, int(config.timeout)) diff --git a/findbad.py b/findbad.py index ce41d30..7e45408 100755 --- a/findbad.py +++ b/findbad.py @@ -359,7 +359,8 @@ def main(): # perform this query after the above options, so that the filter above # does not break. if config.nodeselect: - l_nodes = node_select(config.nodeselect) + fb = database.dbLoad("findbad") + l_nodes = node_select(config.nodeselect, fb['nodes'].keys(), fb) print "fetching %s hosts" % len(l_nodes) @@ -373,7 +374,7 @@ if __name__ == '__main__': parser = parsermodule.getParser(['nodesets']) - parser.set_defaults( increment=False, dbname="findbadnodes", cachenodes=False) + parser.set_defaults( increment=False, dbname="findbad", cachenodes=False) parser.add_option("", "--cachenodes", action="store_true", help="Cache node lookup from PLC") parser.add_option("", "--dbname", dest="dbname", metavar="FILE", diff --git a/findbadpcu.py b/findbadpcu.py index de4474b..55422a3 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -404,9 +404,8 @@ if __name__ == '__main__': formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') fh.setFormatter(formatter) logger.addHandler(fh) - from config import config - from optparse import OptionParser - parser = OptionParser() + import parser as parsermodule + parser = parsermodule.getParser() parser.set_defaults(nodelist=None, increment=False, pcuid=None, @@ -426,8 +425,8 @@ if __name__ == '__main__': help="Refresh the cached values") parser.add_option("-i", "--increment", action="store_true", dest="increment", help="Increment round number to force refresh or retry") - config = config(parser) - config.parse_args() + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) 
try: # NOTE: evidently, there is a bizarre interaction between iLO and ssh # when LANG is set... Do not know why. Unsetting LANG, fixes the problem. diff --git a/grouprins.py b/grouprins.py index 92a745c..1896f41 100755 --- a/grouprins.py +++ b/grouprins.py @@ -15,12 +15,12 @@ import plc api = plc.getAuthAPI() -import policy import traceback -from config import config as cfg +import config import util.file from optparse import OptionParser +import const from nodecommon import * from nodequery import verify,query_to_dict,node_select import database @@ -32,8 +32,8 @@ import parser as parsermodule from model import * import bootman # debug nodes -import monitor # down nodes with pcu import reboot # down nodes without pcu +import mailmonitor # down nodes with pcu from emailTxt import mailtxt #reboot.verbose = 0 import sys @@ -54,18 +54,21 @@ class Reboot(object): mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages') loginbase = plc.siteId(host) - m.send([policy.TECHEMAIL % loginbase]) + m.send([const.TECHEMAIL % loginbase]) def pcu(self, host): # TODO: It should be possible to diagnose the various conditions of # the PCU here, and send different messages as appropriate. - if self.fbnode['pcu'] == "PCU": + print "'%s'" % self.fbnode['pcu'] + if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']: self.action = "reboot.reboot('%s')" % host pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags') + pflags.resetRecentFlag('pcutried') if not pflags.getRecentFlag('pcutried'): pflags.setRecentFlag('pcutried') try: + print "CALLING REBOOT!!!" 
ret = reboot.reboot(host) pflags.save() @@ -94,8 +97,10 @@ class Reboot(object): return True else: + print "GetRecentFlag()" return False else: + print "NO PCUOK" self.action = "None" return False @@ -108,10 +113,10 @@ class Reboot(object): pflags.setRecentFlag('endrecord') pflags.save() - # Then in either case, run monitor.reboot() - self.action = "monitor.reboot('%s')" % host + # Then in either case, run mailmonitor.reboot() + self.action = "mailmonitor.reboot('%s')" % host try: - return monitor.reboot(host) + return mailmonitor.reboot(host) except Exception, e: print traceback.print_exc(); print e return False @@ -207,8 +212,10 @@ if config.node or config.nodelist: if config.node: hostnames = [ config.node ] else: hostnames = config.getListFromFile(config.nodelist) +fb = database.dbLoad("findbad") + if config.nodeselect: - hostnames = node_select(config.nodeselect) + hostnames = node_select(config.nodeselect, fb['nodes'].keys(), fb) if config.findbad: # rerun findbad with the nodes in the given nodes. 
@@ -216,7 +223,6 @@ if config.findbad: util.file.setFileFromList(file, hostnames) os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file) -fb = database.dbLoad("findbad") # commands: i = 1 count = 1 @@ -233,7 +239,7 @@ for host in hostnames: print "%-2d" % i, nodegroup_display(node, fb) i += 1 - if i < int(config.skip): continue + if i-1 <= int(config.skip): continue if config.stopselect: dict_query = query_to_dict(config.stopselect) diff --git a/mailmonitor.py b/mailmonitor.py index 48fa514..87b301f 100644 --- a/mailmonitor.py +++ b/mailmonitor.py @@ -8,7 +8,6 @@ import database -from monitor_policy import * import rt import sys @@ -55,46 +54,46 @@ def reboot(hostname): return True -def reboot2(hostname): - l_nodes = api.GetNodes(hostname) - if len(l_nodes) == 0: - raise Exception("No such host: %s" % hostname) - - l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) - l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : []) - - l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) - if len(l_nodes) == 0: - raise Exception("Host removed via blacklist: %s" % hostname) - - ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None) - if ad_dbTickets == None: - raise Exception("Could not find cached dbTickets") - - - args = {} - args['hostname'] = "%s" % hostname - args['hostname_list'] = "%s" % hostname - args['loginbase'] = plc.siteId(hostname) - - m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, - mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') - - #print "merge" - merge = Merge( [node['hostname'] for node in l_nodes]) - record_list = merge.run() - #print "rt" - rt = RT(record_list, ad_dbTickets, l_ticket_blacklist) - record_list = rt.run() - #print "diagnose" - diag = Diagnose(record_list) - diagnose_out = diag.run() - #print diagnose_out - #print "action" - action = Action(diagnose_out) 
- action.run() - - return True +#def reboot2(hostname): +# l_nodes = api.GetNodes(hostname) +# if len(l_nodes) == 0: +# raise Exception("No such host: %s" % hostname) +# +# l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : []) +# l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : []) +# +# l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes) +# if len(l_nodes) == 0: +# raise Exception("Host removed via blacklist: %s" % hostname) +# +# ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None) +# if ad_dbTickets == None: +# raise Exception("Could not find cached dbTickets") +# +# +# args = {} +# args['hostname'] = "%s" % hostname +# args['hostname_list'] = "%s" % hostname +# args['loginbase'] = plc.siteId(hostname) +# +# m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname, +# mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages') +# +# #print "merge" +# merge = Merge( [node['hostname'] for node in l_nodes]) +# record_list = merge.run() +# #print "rt" +# rt = RT(record_list, ad_dbTickets, l_ticket_blacklist) +# record_list = rt.run() +# #print "diagnose" +# diag = Diagnose(record_list) +# diagnose_out = diag.run() +# #print diagnose_out +# #print "action" +# action = Action(diagnose_out) +# action.run() +# +# return True def main(): diff --git a/model.py b/model.py index 558d04d..c8f6331 100644 --- a/model.py +++ b/model.py @@ -108,16 +108,4 @@ class Diagnose(MonRecord): MonRecord.__init__(self, data) return -class Action(MonRecord): - def __init__(self, host, data): - self.host = host - MonRecord.__init__(self, data) - return - - def deltaDays(self, delta): - t = datetime.fromtimestamp(self.__dict__['time']) - d = t + timedelta(delta) - self.__dict__['time'] = time.mktime(d.timetuple()) - - diff --git a/monitor-server.spec b/monitor-server.spec index 9f0f1fa..e6b5fac 100644 --- a/monitor-server.spec +++ b/monitor-server.spec @@ -9,6 +9,8 @@ 
%define taglevel 6 %define release %{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}} +%global python_sitearch %( python -c "from distutils.sysconfig import get_python_lib; print get_python_lib(1)" ) + Summary: Monitor backend scripts for server Name: %{name} @@ -27,11 +29,13 @@ URL: %(echo %{url} | cut -d ' ' -f 2) Requires: curl Requires: coreutils Requires: openssh-clients -Requires: PLCWWW >= 4.2 -Requires: BootCD >= 4.2 +Requires: perl-libwww-perl Requires: MySQL-python Requires: rt3 == 3.4.1 +Requires: PLCWWW >= 4.2 +Requires: BootCD >= 4.2 + %description Scripts for polling PLC, the node, and PCU status. Also a collection of command-line utilities for querying the status database. @@ -66,6 +70,11 @@ install -D -m 755 %{name}.cron $RPM_BUILD_ROOT/%{_sysconfdir}/cron.d/%{name}.cro echo " * TODO: Setting up Monitor account in local MyPLC" # TODO: +mkdir -p $RPM_BUILD_ROOT/%{python_sitearch}/%{name} +install -D -m 755 monitor $RPM_BUILD_ROOT/%{python_sitearch}/%{name} +install -D -m 755 threadpool.py $RPM_BUILD_ROOT/%{python_sitearch}/threadpool.py + +install -D -m 755 monitor-default.conf $RPM_BUILD_ROOT/etc/monitor.conf cp $RPM_BUILD_ROOT/usr/share/%{name}/monitorconfig-default.py $RPM_BUILD_ROOT/usr/share/%{name}/monitorconfig.py %clean @@ -78,6 +87,8 @@ rm -rf $RPM_BUILD_ROOT /var/lib/%{name} /var/www/cgi-bin/monitor %{_sysconfdir}/cron.d/%{name}.cron +%{python_sitearch}/threadpool.py +%{python_sitearch}/%{name} %post echo "Post processing" diff --git a/monitor/__init__.py b/monitor/__init__.py index e69de29..5410e78 100644 --- a/monitor/__init__.py +++ b/monitor/__init__.py @@ -0,0 +1 @@ +import database diff --git a/monitor_policy.py b/monitor_policy.py index 3dd244c..f7c3edb 100644 --- a/monitor_policy.py +++ b/monitor_policy.py @@ -2,15 +2,24 @@ import config import database import time import mailer -from www.printbadnodes import cmpCategoryVal +from unified_model import cmpCategoryVal import sys import emailTxt import string -from policy import 
get_ticket_id, print_stats, close_rt_backoff, reboot_node from rt import is_host_in_rt_tickets import plc +def get_ticket_id(record): + if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None: + return record['ticket_id'] + elif 'found_rt_ticket' in record and \ + record['found_rt_ticket'] is not "" and \ + record['found_rt_ticket'] is not None: + return record['found_rt_ticket'] + else: + return None + # Time to enforce policy POLSLEEP = 7200 @@ -821,6 +830,17 @@ class Diagnose: return up +def close_rt_backoff(args): + if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None): + mailer.closeTicketViaRT(args['ticket_id'], + "Ticket CLOSED automatically by SiteAssist.") + plc.enableSlices(args['hostname']) + plc.enableSliceCreation(args['hostname']) + return + +def reboot_node(args): + host = args['hostname'] + return reboot.reboot_policy(host, True, config.debug) class Action: def __init__(self, diagnose_out): @@ -874,7 +894,7 @@ class Action: #print_stats("sites_observed", stats) #print_stats("sites_diagnosed", stats) #print_stats("nodes_diagnosed", stats) - print_stats("sites_emailed", stats) + self.print_stats("sites_emailed", stats) #print_stats("nodes_actedon", stats) print string.join(stats['allsites'], ",") diff --git a/nodebad.py b/nodebad.py index 3261f88..8aacf71 100755 --- a/nodebad.py +++ b/nodebad.py @@ -16,7 +16,7 @@ from nodecommon import * import plc api = plc.getAuthAPI() from unified_model import * -from monitor_policy import MINUP +from const import MINUP round = 1 externalState = {'round': round, 'nodes': {}} @@ -64,7 +64,6 @@ def checkAndRecordState(l_nodes, l_plcnodes): database.dbDump(config.dbname, externalState) fb = database.dbLoad('findbad') -hn2lb = database.dbLoad("plcdb_hn2lb") def getnodesup(nodelist): up = 0 diff --git a/nodecommon.py b/nodecommon.py index b9027d8..cef1247 100644 --- a/nodecommon.py +++ b/nodecommon.py @@ -1,6 +1,8 @@ import struct import reboot +import 
time +from monitor import database from unified_model import PersistFlags esc = struct.pack('i', 27) RED = esc + "[1;31m" @@ -137,35 +139,6 @@ def nodegroup_display(node, fb, conf=None): return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel)33s %(lastupdate)12s, %(lc)s, %(uptime)s" % node -from model import * -import database - -def node_end_record(node): - act_all = database.dbLoad("act_all") - if node not in act_all: - del act_all - return False - - if len(act_all[node]) == 0: - del act_all - return False - - a = Action(node, act_all[node][0]) - a.delField('rt') - a.delField('found_rt_ticket') - a.delField('second-mail-at-oneweek') - a.delField('second-mail-at-twoweeks') - a.delField('first-found') - rec = a.get() - rec['action'] = ["close_rt"] - rec['category'] = "UNKNOWN" - rec['stage'] = "monitor-end-record" - rec['time'] = time.time() - 7*60*60*24 - act_all[node].insert(0,rec) - database.dbDump("act_all", act_all) - del act_all - return True - def datetime_fromstr(str): if '-' in str: try: diff --git a/nodeinfo.py b/nodeinfo.py index 84d8ae0..30838f1 100755 --- a/nodeinfo.py +++ b/nodeinfo.py @@ -3,12 +3,14 @@ import plc api = plc.getAuthAPI() -import database +from monitor import * +#import database import reboot import time from model import * from nodecommon import * +from unified_model import node_end_record, PersistFlags import util.file @@ -43,12 +45,14 @@ def plc_print_nodeinfo(plcnode): diff_time(plcnode['last_contact']), plcnode['key']) def fb_print_nodeinfo(fbnode): + pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags') + fbnode['last_change'] = diff_time(pf.last_changed) print " Checked: ", if 'checked' in fbnode: print "%11.11s " % diff_time(fbnode['checked']) else: print "Unknown" - print "\t state | ssh | pcu | bootcd | category | kernel" + print "\t state | ssh | pcu | bootcd | category | last change | kernel" if fbnode['bootcd']: fbnode['bootcd'] = fbnode['bootcd'].split()[-1] else: @@ -59,7 +63,7 
@@ def fb_print_nodeinfo(fbnode): fbnode['state'] = "none" if len(fbnode['kernel'].split()) > 2: fbnode['kernel'] = fbnode['kernel'].split()[2] - print "\t %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode + print "\t %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(last_change)11s | %(kernel)s" % fbnode def act_print_nodeinfo(actnode, header): if header[0]: @@ -132,25 +136,25 @@ if config.findbad: util.file.setFileFromList(file, config.args) os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file) -fb = database.dbLoad("findbad") -try: - act_all = database.dbLoad("act_all") -except: - act_all = {} - for node in config.args: config.node = node + fb = database.dbLoad("findbad") plc_nodeinfo = api.GetNodes({'hostname': config.node}, None)[0] fb_nodeinfo = fb['nodes'][config.node]['values'] plc_print_nodeinfo(plc_nodeinfo) + fb_nodeinfo['hostname'] = node fb_print_nodeinfo(fb_nodeinfo) if fb_nodeinfo['pcu'] == "PCU": pcu = reboot.get_pcu_values(fb_nodeinfo['plcnode']['pcu_ids'][0]) if pcu: pcu_print_info(pcu, config.node) + try: + act_all = database.dbLoad("act_all") + except: + act_all = {} if config.node in act_all and len(act_all[config.node]) > 0: header = [True] diff --git a/nodequery.py b/nodequery.py index b030344..c3f7ab8 100755 --- a/nodequery.py +++ b/nodequery.py @@ -11,13 +11,14 @@ from unified_model import Record import glob import os from reboot import pcu_name +import reboot import util.file import time import re #fb = {} -fb = database.dbLoad("findbad") +fb = {} fbpcu = {} class NoKeyException(Exception): pass @@ -202,13 +203,14 @@ def pcu_select(str_query, nodelist=None): return (nodenames, pcunames) def node_select(str_query, nodelist=None, fbdb=None): + global fb + hostnames = [] if str_query is None: return hostnames #print str_query dict_query = query_to_dict(str_query) #print dict_query - global fb if fbdb is not None: fb = fbdb @@ 
-280,6 +282,7 @@ def main(): fb = database.dbLoad("findbad") fbpcu = database.dbLoad("findbadpcus") + reboot.fb = fbpcu if config.nodelist: nodelist = util.file.getListFromFile(config.nodelist) @@ -288,10 +291,10 @@ def main(): pculist = None if config.select is not None and config.pcuselect is not None: - nodelist = node_select(config.select, nodelist) + nodelist = node_select(config.select, nodelist, fb) nodelist, pculist = pcu_select(config.pcuselect, nodelist) elif config.select is not None: - nodelist = node_select(config.select, nodelist) + nodelist = node_select(config.select, nodelist, fb) elif config.pcuselect is not None: nodelist, pculist = pcu_select(config.pcuselect, nodelist) diff --git a/parser.py b/parser.py index 86e08a5..bd15197 100644 --- a/parser.py +++ b/parser.py @@ -75,7 +75,7 @@ def parseSetNodeSets(parser=None): if parser == None: parser = OptionParser() - parser.set_defaults(node=None, site=None, nodelist=None, nodeselect=False, nodegroup=None) + parser.set_defaults(node=None, site=None, nodelist=None, nodeselect=None, nodegroup=None) parser.add_option("", "--node", dest="node", metavar="hostname", help="Provide a single node to operate on") parser.add_option("", "--site", dest="site", metavar="site name", @@ -104,10 +104,11 @@ def getParser(parsesets=[], parser=None): def parse_args(parser): class obj: pass - o = obj() (options, args) = parser.parse_args() + o = obj() o.__dict__.update(options.__dict__) o.__dict__['args'] = args + #config.update(o) config.updatemodule(config, o) return config diff --git a/pcubad.py b/pcubad.py index c2886f5..c782b9a 100755 --- a/pcubad.py +++ b/pcubad.py @@ -18,7 +18,7 @@ from nodecommon import * import plc api = plc.getAuthAPI() from unified_model import * -from monitor_policy import MINUP +from const import MINUP round = 1 externalState = {'round': round, 'nodes': {}} diff --git a/policy.py b/policy.py index 027da35..26187dd 100644 --- a/policy.py +++ b/policy.py @@ -21,7 +21,7 @@ import os import 
reboot import database import string -from www.printbadnodes import cmpCategoryVal +from unified_model import cmpCategoryVal import config DAT="./monitor.dat" @@ -85,15 +85,6 @@ def getdebug(): def print_stats(key, stats): if key in stats: print "%20s : %d" % (key, stats[key]) -def get_ticket_id(record): - if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None: - return record['ticket_id'] - elif 'found_rt_ticket' in record and \ - record['found_rt_ticket'] is not "" and \ - record['found_rt_ticket'] is not None: - return record['found_rt_ticket'] - else: - return None class Merge(Thread): def __init__(self, l_merge, toRT): @@ -927,17 +918,7 @@ class BackoffActions(SiteAction): # TODO: create class for each action below, # allow for lists of actions to be performed... -def close_rt_backoff(args): - if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None): - mailer.closeTicketViaRT(args['ticket_id'], - "Ticket CLOSED automatically by SiteAssist.") - plc.enableSlices(args['hostname']) - plc.enableSliceCreation(args['hostname']) - return -def reboot_node(args): - host = args['hostname'] - return reboot.reboot_policy(host, True, config.debug) def reset_nodemanager(args): os.system("ssh root@%s /sbin/service nm restart" % nodename) diff --git a/printbadcsv.py b/printbadcsv.py index 5d6989d..f064c11 100755 --- a/printbadcsv.py +++ b/printbadcsv.py @@ -1,6 +1,6 @@ #!/usr/bin/python -import database -import config +from monitor import database +from monitor import config import parser as parsermodule from www.printbadnodes import * diff --git a/reboot.py b/reboot.py index 76cacd3..337b0b4 100755 --- a/reboot.py +++ b/reboot.py @@ -11,8 +11,6 @@ import urllib2 import urllib import threading, popen2 import array, struct -#from socket import * -import socket import plc import base64 from subprocess import PIPE, Popen @@ -1167,12 +1165,16 @@ def pcu_name(pcu): else: return None -import database -fb 
=database.dbLoad("findbadpcus") +#import database +from monitor import database +fb = None def get_pcu_values(pcu_id): - # TODO: obviously, this shouldn't be loaded each time... - + global fb + if fb == None: + # this shouldn't be loaded each time... + fb = database.dbLoad("findbadpcus") + try: values = fb['nodes']["id_%s" % pcu_id]['values'] except: diff --git a/rt.py b/rt.py index 4c57ea8..0ea0a55 100644 --- a/rt.py +++ b/rt.py @@ -7,7 +7,6 @@ import logging import Queue import time import re -import comon import database from threading import * import monitorconfig diff --git a/sitebad.py b/sitebad.py index dc0e8a7..f55a4d3 100755 --- a/sitebad.py +++ b/sitebad.py @@ -15,7 +15,7 @@ from nodequery import verify,query_to_dict,node_select import plc api = plc.getAuthAPI() from unified_model import * -from monitor_policy import MINUP +from const import MINUP round = 1 externalState = {'round': round, 'sites': {}} diff --git a/threadpool.py b/threadpool.py index a0a8a2f..aa4f1f1 100644 --- a/threadpool.py +++ b/threadpool.py @@ -218,7 +218,6 @@ class ThreadPool: request.callback(request, result) del self.workRequests[request.requestID] except Queue.Empty: - print "queue empty" break def wait(self): diff --git a/todo b/todo index 52a534b..94b2ab4 100644 --- a/todo +++ b/todo @@ -1,20 +1,8 @@ TODO: - * make 'automate.sh' be the real automate script for cron. - * install the right version of RT tools as a dependency? - * www/*.py need appropriate access to database.py, config.py, monitorconfig.py, etc. - * build cmdamt with g++ prior to packaging * install openssh-server, passwd, perl-libwww-perl (for rt), rt-3.4.1, MySQL-python * had to mount -t devpts devpts /dev/pts to get ssh to work inside the chroot. also, disable the pam modules in /etc/pam.d/sshd - * threadpool package. - * reboot.py loads findbadpcus unconditionally. 
- * unified_model loads findbad unconditionally - * nodequery loads findbad unconditionally - * remove deps on www.printbadnodes - * change findbad.py default db name - * nodebad loads plc_hn2lb unconditionally - * nodeinfo loads act_all unconditionally * A setup script of some kind would be nice that walked through : - writing monitorconfig.py @@ -55,6 +43,13 @@ TODO: * fix BayTechCtrlCUnibe expect script. + * separate modules into different, logical categories, and create a python + module as part of the install: + command line, + configuration, + policy, + data model, + object interfaces. Lower priority: * Add a more structured, 'automate' library of scripts and means of making @@ -66,6 +61,20 @@ Lower priority: availble with PLC. Done: + * nodebad loads plc_hn2lb unconditionally + * nodeinfo loads act_all unconditionally + * change findbad.py default db name + * remove deps on www.printbadnodes + * reboot.py loads findbadpcus unconditionally. + * nodequery loads findbad unconditionally + * unified_model loads findbad unconditionally + + * threadpool package. + * build cmdamt with g++ prior to packaging + + * www/*.py need appropriate access to database.py, config.py, monitorconfig.py, etc. + - need to convert monitor.conf into monitorconf.sh and monitorconf.php + * pull out global configuration information from various files, like rt_db, mailer.py, auth.py, and any others. Create a single configuration file from which all others pull. diff --git a/unified_model.py b/unified_model.py index 2fbb6e2..acc89d8 100755 --- a/unified_model.py +++ b/unified_model.py @@ -1,14 +1,14 @@ #!/usr/bin/python -import database +from monitor import database import plc api = plc.getAuthAPI() import mailer import time -from nodecommon import * +from model import * from const import * import util.file import config @@ -70,7 +70,7 @@ class PenaltyMap: # condition/penalty is applied, move to the next phase. 
-fb = database.dbLoad("findbad") +#fb = database.dbLoad("findbad") class RT(object): def __init__(self, ticket_id = None): @@ -565,10 +565,10 @@ class NodeRecord: self.hostname = hostname self.ticket = None self.target = target - if hostname in fb['nodes']: - self.data = fb['nodes'][hostname]['values'] - else: - raise Exception("Hostname not in scan database") + #if hostname in fb['nodes']: + # self.data = fb['nodes'][hostname]['values'] + #else: + # raise Exception("Hostname not in scan database") def stageIswaitforever(self): if 'waitforever' in self.data['stage']: @@ -638,6 +638,43 @@ class NodeRecord: def _get_contacts_for_condition(self): pass +class Action(MonRecord): + def __init__(self, host, data): + self.host = host + MonRecord.__init__(self, data) + return + + def deltaDays(self, delta): + t = datetime.fromtimestamp(self.__dict__['time']) + d = t + timedelta(delta) + self.__dict__['time'] = time.mktime(d.timetuple()) + +def node_end_record(node): + act_all = database.dbLoad("act_all") + if node not in act_all: + del act_all + return False + + if len(act_all[node]) == 0: + del act_all + return False + + a = Action(node, act_all[node][0]) + a.delField('rt') + a.delField('found_rt_ticket') + a.delField('second-mail-at-oneweek') + a.delField('second-mail-at-twoweeks') + a.delField('first-found') + rec = a.get() + rec['action'] = ["close_rt"] + rec['category'] = "UNKNOWN" + rec['stage'] = "monitor-end-record" + rec['time'] = time.time() - 7*60*60*24 + act_all[node].insert(0,rec) + database.dbDump("act_all", act_all) + del act_all + return True + if __name__ == "__main__": #r = RT() #r.email("test", "body of test message", ['database@cs.princeton.edu']) diff --git a/www/printbadnodes.py b/www/printbadnodes.py index af3f73d..3bfc7bd 100755 --- a/www/printbadnodes.py +++ b/www/printbadnodes.py @@ -1,11 +1,7 @@ #!/usr/bin/python -import soltesz -from config import config -from optparse import OptionParser +from monitor import database +from monitor import config 
import string -#from HyperText.HTML import A, BR, IMG, TABLE, TR, TH, TD, EM, quote_body -#from HyperText.Documents import Document - import sys categories = {} @@ -243,8 +239,8 @@ def fields_to_html(fields, vals): def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter): global fb - db = soltesz.dbLoad(config.dbname) - fb = soltesz.dbLoad("findbadpcus") + db = database.dbLoad(config.dbname) + fb = database.dbLoad("findbadpcus") ## Field widths used for printing maxFieldLengths = { 'nodename' : -45, @@ -498,30 +494,16 @@ if __name__ == '__main__': else: mynodeonly = None - parser = OptionParser() - parser.set_defaults(cmpdays=False, - comon="sshstatus", - fields="nodename,ping,ssh,pcu,category,state,comonstats,kernel,bootcd", - dbname="findbad", # -070724-1", - cmpping=False, - cmpdns=False, - cmploginbase=False, - cmpssh=False, - cmpcategory=False, - cmpstate=False) - parser.add_option("", "--fields", dest="fields", help="") - parser.add_option("", "--dbname", dest="dbname", help="") - parser.add_option("", "--days", dest="cmpdays", action="store_true", help="") - parser.add_option("", "--ping", dest="cmpping", action="store_true", help="") - parser.add_option("", "--dns", dest="cmpdns", action="store_true", help="") - parser.add_option("", "--ssh", dest="cmpssh", action="store_true", help="") - parser.add_option("", "--loginbase",dest="cmploginbase",action="store_true", help="") - parser.add_option("", "--category", dest="cmpcategory", action="store_true", help="") - parser.add_option("", "--kernel", dest="cmpkernel", action="store_true", help="") - parser.add_option("", "--state", dest="cmpstate", action="store_true", help="") - parser.add_option("", "--comon", dest="comon", help="") - config = config(parser) - config.parse_args() + config.cmpdays=False + config.comon="sshstatus" + config.fields="nodename,ping,ssh,pcu,category,state,comonstats,kernel,bootcd" + config.dbname="findbad" + config.cmpping=False + config.cmpdns=False + 
config.cmploginbase=False + config.cmpssh=False + config.cmpcategory=False + print "Content-Type: text/html\r\n" if mynodeonly == None: print "\n" diff --git a/www/runlevels.py b/www/runlevels.py index bb07f92..3e16dc2 100755 --- a/www/runlevels.py +++ b/www/runlevels.py @@ -2,7 +2,7 @@ import cgi import cgitb; -import soltesz +from monitor import database import time cgitb.enable() @@ -32,7 +32,7 @@ vals['princeton_comon_procs'] = get_value('princeton_comon_procs') rows = "" -fb = soltesz.dbLoad("findbad") +fb = database.dbLoad("findbad") packed_values = [] for mynode in fb['nodes'].keys(): fbnode = fb['nodes'][mynode]['values'] -- 2.43.0