import sys
import os
-import policy
+import const
from getsshkeys import SSHKnownHosts
mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
print "\tDisabling %s due to out-of-date BOOTCD" % hostname
api.UpdateNode(hostname, {'boot_state' : 'disable'})
mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
conn.set_nodestate('disable')
return False
# update_node_config_email
for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
- "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
]:
sequences.update({n : "update_node_config_email"})
- for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+ for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
+ "bminit-cfg-update-exception-nodehostname-update-debug-done",
+ ]:
sequences.update({n : "nodenetwork_email"})
# update_bootcd_email
sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# bad_dns_email
- sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"})
+ for n in [
+ "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ ]:
+ sequences.update( { n : "bad_dns_email"})
flag_set = True
m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
True, db='nodeid_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
conn.dump_plconf_file()
conn.set_nodestate('disable')
m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
True, db='nodenet_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
conn.dump_plconf_file()
conn.set_nodestate('disable')
mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
print "\tDisabling %s due to out-of-date BOOTCD" % hostname
conn.set_nodestate('disable')
mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
conn.set_nodestate('disable')
elif sequences[s] == "update_hardware_email":
mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
conn.set_nodestate('disable')
elif sequences[s] == "bad_dns_email":
mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
conn.set_nodestate('disable')
if flag_set:
import database
import time
import mailer
-from www.printbadnodes import cmpCategoryVal
+from unified_model import cmpCategoryVal
import sys
import emailTxt
import string
-from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
from rt import is_host_in_rt_tickets
import plc
from unified_model import *
+def get_ticket_id(record):
+    """Return the RT ticket id stored in `record`, or None if there is none.
+
+    The 'ticket_id' key is preferred; 'found_rt_ticket' is the fallback.
+    A key counts only when it is present and its value is neither None nor
+    the empty string.
+    """
+    for key in ('ticket_id', 'found_rt_ticket'):
+        # Compare by value: `is not ""` tests object identity, which is not
+        # guaranteed to hold for equal strings (e.g. unicode or unpickled "").
+        if key in record and record[key] is not None and record[key] != "":
+            return record[key]
+    return None
+
class MonitorMergeDiagnoseSendEscellate:
def __init__(self, hostname, act):
self.hostname = hostname
#!/usr/bin/python
-import pickle
+
+# Defaults are loaded, in this order, from:
+#   /etc/monitor.conf
+#   ~/.monitor.conf
+#   $PWD/.monitor.conf and $PWD/monitor.conf
import os
-import getopt
-import sys
-import __main__
-from optparse import OptionParser
-from parser import parse_bool
+import ConfigParser
+
+class Options(object):
+    """Attribute-style access to the monitor configuration files.
+
+    Accessing an attribute whose name is a section in the parsed files
+    selects that section and returns this same object, so
+    ``options.somesection.key`` reads ``key`` from ``somesection``.  Any
+    other attribute name is looked up as an option in the currently
+    selected section.  The selection persists between accesses and starts
+    out as "default".
+    """
+    def __init__(self):
+        cp = ConfigParser.ConfigParser()
+        # use option names exactly as written instead of the parser's
+        # default name transformation
+        cp.optionxform = str
+        # load defaults from global, home dir, then $PWD
+        cp.read(['/etc/monitor.conf', os.path.expanduser('~/.monitor.conf'),
+                 '.monitor.conf', 'monitor.conf'])
+        self.cp = cp
+        self.section = "default"
+
+    def __getattr__(self, name):
+        # __getattr__ only runs when normal lookup fails.  If our own state
+        # is missing (instance built without __init__, e.g. by copy or
+        # pickle) or a special method is being probed, touching self.cp
+        # below would re-enter this method forever, so fail normally.
+        if name in ('cp', 'section') or name.startswith('__'):
+            raise AttributeError(name)
+        if name in self.cp.sections():
+            self.section = name
+            return self
+        else:
+            return self.cp.get(self.section, name)
-debug=0
-mail=0
-bcc=0
-email="soltesz@cs.utk.edu"
-run=False
-checkopt=False
-squeeze=0
-policysavedb=0
-config_command = False
+import config
+imported = False
def updatemodule(module, cf):
module.__dict__.update(cf.__dict__)
-class config:
- debug=0
- mail=0
- bcc=0
- email="soltesz@cs.utk.edu"
- run=False
- checkopt=False
- squeeze=0
- policysavedb=0
- __file = ".config"
-
- def __init__(self, parser=None):
- if os.path.exists(self.__file): # file exists, read that.
- f = open(self.__file, 'r')
- o = pickle.load(f)
- self.__dict__.update(o)
- f.close()
-
- if parser == None:
- self.parser = OptionParser()
+def update_section(options, section, bool=False):
+ # Place all default commandline values at the top level of this module
+ for key in options.cp.options(section):
+ if bool:
+ config.__dict__.update({key : options.cp.getboolean(section, key)})
else:
- self.parser = parser
-
- self.parser.set_defaults(debug = self.debug,
- mail = self.mail,
- bcc = self.bcc,
- email = self.email,
- run = self.run,
- checkopt = False,
- squeeze = self.squeeze,
- policysavedb = self.policysavedb)
-
- self.parser.add_option("", "--debug", dest="debug",
- help="Enable debugging",
- type="int",
- metavar="[0|1]",
- action="callback",
- callback=parse_bool)
- self.parser.add_option("", "--mail", dest="mail",
- help="Enable sending email",
- type="int",
- metavar="[0|1]",
- action="callback",
- callback=parse_bool)
- self.parser.add_option("", "--bcc", dest="bcc",
- help="Include BCC to user",
- type="int",
- metavar="[0|1]",
- action="callback",
- callback=parse_bool)
- self.parser.add_option("", "--squeeze", dest="squeeze",
- help="Squeeze sites or not",
- type="int",
- metavar="[0|1]",
- action="callback",
- callback=parse_bool)
- self.parser.add_option("", "--policysavedb", dest="policysavedb",
- help="Save the policy event database after a run",
- type="int",
- metavar="[0|1]",
- action="callback",
- callback=parse_bool)
- self.parser.add_option("", "--checkopt", dest="checkopt",
- action="store_true",
- help="print current options")
- self.parser.add_option("", "--run", dest="run",
- action="store_true",
- help="Perform monitor or print configs")
- self.parser.add_option("", "--email", dest="email",
- help="Specify an email address to use for mail when "+\
- "debug is enabled or for bcc when it is not")
-
- # config_command is needed to keep subsequent loads of config() from
- # trying to parse the arguments that have already been parsed by
- # the new main().
- if parser == None and config_command:
- print "calling parse_args"
- self.parse_args()
-
- def parse_args(self):
- #print "self: %s" % self
- #import traceback
- #print traceback.print_stack()
- #print "Ccalling parse_args"
- (options, args) = self.parser.parse_args()
- #for o in options.__dict__:
- # print "optin: %s == %s" % (o, options.__dict__[o])
- self.__dict__.update(options.__dict__)
- self.__dict__['args'] = args
- self.save(options)
- if options.checkopt:
- self.usage()
- # print "\nAdd --run to actually perform the command"
- sys.exit(1)
-
- def getListFromFile(self, file):
- f = open(file, 'r')
- list = []
- for line in f:
- line = line.strip()
- list += [line]
- return list
-
- def print_values(self):
- exclude = ['parser']
- for key in self.__dict__.keys():
- if key not in exclude:
- print "%20s == %s" % (key, self.__dict__[key])
-
- def save(self, options=None):
- f = open(self.__file, 'w')
- if options == None:
- o = {'debug': self.debug,
- 'mail': self.mail,
- 'bcc': self.bcc,
- 'email':self.email,
- 'squeeze':self.squeeze,
- 'policysavedb':self.policysavedb}
- else:
- o = options.__dict__
-
- pickle.dump(o, f)
- f.close()
-
- def usage(self):
- self.print_values()
- self.parser.print_help()
-
+ config.__dict__.update({key : options.cp.get(section, key)})
-def main():
- """ Start threads, do some housekeeping, then daemonize. """
- # Defaults
- global config_command
- config_command = True
- config = __main__.config()
+def update(parseoptions):
+    """Call update_commandline(), then copy every attribute of
+    `parseoptions` onto the top level of the config module, overwriting
+    any module attribute of the same name."""
+    update_commandline()
+    # now update the top-level module with all other args passed in here.
+    config.__dict__.update(parseoptions.__dict__)
- try:
- print "acalling parse_args"
- config.parse_args()
-
- except Exception, err:
- print "Error: %s " % err
- config.usage()
- sys.exit(1)
+if not config.imported:
+ imported = True
- config.usage()
+ #from config import options as config
+ options = Options()
+ update_section(options, 'commandline', True)
+ update_section(options, 'monitorconfig')
+#for i in dir(config):
+# if "__" not in i:
+# print i, "==", config.__dict__[i]
+#print "======================================"
-if __name__ == '__main__':
- main()
import inspect
import shutil
import config
-import monitorconfig
+import config as monitorconfig
DEBUG= 0
PICKLE_PATH=monitorconfig.MONITOR_DATA_ROOT
raise Exception, "No such file %s" % name
+ #import traceback
+ #print traceback.print_stack()
#print "loading %s" % self.__file(name, type)
+ #sys.stderr.write("-----------------------------\n")
f = open(self.__file(name, type), 'r')
if type == None:
o = pickle.load(f)
if not os.path.exists(outdir):
os.system('mkdir -p %s' % outdir)
- if config.site is not None or config.nodeselect is not None or config.nodegroup is not None:
+ if config.site is not None or \
+ config.nodeselect is not None or \
+ config.nodegroup is not None:
print "TODO: implement support for nodeselect and site queries."
+ print "%s %s %s" % (config.site, config.nodeselect, config.nodegroup)
sys.exit(1)
if config.nodelist == None and config.node == None:
f = open(config.cmdfile,'r')
cmd = f.read()
+ print filelist
+
vx_start(filelist, outdir, cmd, int(config.timeout))
# perform this query after the above options, so that the filter above
# does not break.
if config.nodeselect:
- l_nodes = node_select(config.nodeselect)
+ fb = database.dbLoad("findbad")
+ l_nodes = node_select(config.nodeselect, fb['nodes'].keys(), fb)
print "fetching %s hosts" % len(l_nodes)
parser = parsermodule.getParser(['nodesets'])
- parser.set_defaults( increment=False, dbname="findbadnodes", cachenodes=False)
+ parser.set_defaults( increment=False, dbname="findbad", cachenodes=False)
parser.add_option("", "--cachenodes", action="store_true",
help="Cache node lookup from PLC")
parser.add_option("", "--dbname", dest="dbname", metavar="FILE",
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
fh.setFormatter(formatter)
logger.addHandler(fh)
- from config import config
- from optparse import OptionParser
- parser = OptionParser()
+ import parser as parsermodule
+ parser = parsermodule.getParser()
parser.set_defaults(nodelist=None,
increment=False,
pcuid=None,
help="Refresh the cached values")
parser.add_option("-i", "--increment", action="store_true", dest="increment",
help="Increment round number to force refresh or retry")
- config = config(parser)
- config.parse_args()
+ parser = parsermodule.getParser(['defaults'], parser)
+ config = parsermodule.parse_args(parser)
try:
# NOTE: evidently, there is a bizarre interaction between iLO and ssh
# when LANG is set... Do not know why. Unsetting LANG, fixes the problem.
import plc
api = plc.getAuthAPI()
-import policy
import traceback
-from config import config as cfg
+import config
import util.file
from optparse import OptionParser
+import const
from nodecommon import *
from nodequery import verify,query_to_dict,node_select
import database
from model import *
import bootman # debug nodes
-import monitor # down nodes with pcu
import reboot # down nodes without pcu
+import mailmonitor # down nodes with pcu
from emailTxt import mailtxt
#reboot.verbose = 0
import sys
mailtxt.pcudown_one[1] % args, True, db='pcu_persistmessages')
loginbase = plc.siteId(host)
- m.send([policy.TECHEMAIL % loginbase])
+ m.send([const.TECHEMAIL % loginbase])
def pcu(self, host):
# TODO: It should be possible to diagnose the various conditions of
# the PCU here, and send different messages as appropriate.
- if self.fbnode['pcu'] == "PCU":
+ print "'%s'" % self.fbnode['pcu']
+ if self.fbnode['pcu'] == "PCU" or "PCUOK" in self.fbnode['pcu']:
self.action = "reboot.reboot('%s')" % host
pflags = PersistFlags(host, 2*60*60*24, db='pcu_persistflags')
+ pflags.resetRecentFlag('pcutried')
if not pflags.getRecentFlag('pcutried'):
pflags.setRecentFlag('pcutried')
try:
+ print "CALLING REBOOT!!!"
ret = reboot.reboot(host)
pflags.save()
return True
else:
+ print "GetRecentFlag()"
return False
else:
+ print "NO PCUOK"
self.action = "None"
return False
pflags.setRecentFlag('endrecord')
pflags.save()
- # Then in either case, run monitor.reboot()
- self.action = "monitor.reboot('%s')" % host
+ # Then in either case, run mailmonitor.reboot()
+ self.action = "mailmonitor.reboot('%s')" % host
try:
- return monitor.reboot(host)
+ return mailmonitor.reboot(host)
except Exception, e:
print traceback.print_exc(); print e
return False
if config.node: hostnames = [ config.node ]
else: hostnames = config.getListFromFile(config.nodelist)
+fb = database.dbLoad("findbad")
+
if config.nodeselect:
- hostnames = node_select(config.nodeselect)
+ hostnames = node_select(config.nodeselect, fb['nodes'].keys(), fb)
if config.findbad:
# rerun findbad with the nodes in the given nodes.
util.file.setFileFromList(file, hostnames)
os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
-fb = database.dbLoad("findbad")
# commands:
i = 1
count = 1
print "%-2d" % i, nodegroup_display(node, fb)
i += 1
- if i < int(config.skip): continue
+ if i-1 <= int(config.skip): continue
if config.stopselect:
dict_query = query_to_dict(config.stopselect)
import database
-from monitor_policy import *
import rt
import sys
return True
-def reboot2(hostname):
- l_nodes = api.GetNodes(hostname)
- if len(l_nodes) == 0:
- raise Exception("No such host: %s" % hostname)
-
- l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
- l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
-
- l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
- if len(l_nodes) == 0:
- raise Exception("Host removed via blacklist: %s" % hostname)
-
- ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None)
- if ad_dbTickets == None:
- raise Exception("Could not find cached dbTickets")
-
-
- args = {}
- args['hostname'] = "%s" % hostname
- args['hostname_list'] = "%s" % hostname
- args['loginbase'] = plc.siteId(hostname)
-
- m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
- mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
-
- #print "merge"
- merge = Merge( [node['hostname'] for node in l_nodes])
- record_list = merge.run()
- #print "rt"
- rt = RT(record_list, ad_dbTickets, l_ticket_blacklist)
- record_list = rt.run()
- #print "diagnose"
- diag = Diagnose(record_list)
- diagnose_out = diag.run()
- #print diagnose_out
- #print "action"
- action = Action(diagnose_out)
- action.run()
-
- return True
+#def reboot2(hostname):
+# l_nodes = api.GetNodes(hostname)
+# if len(l_nodes) == 0:
+# raise Exception("No such host: %s" % hostname)
+#
+# l_blacklist = database.if_cached_else(1, "l_blacklist", lambda : [])
+# l_ticket_blacklist = database.if_cached_else(1,"l_ticket_blacklist",lambda : [])
+#
+# l_nodes = filter(lambda x : not x['hostname'] in l_blacklist, l_nodes)
+# if len(l_nodes) == 0:
+# raise Exception("Host removed via blacklist: %s" % hostname)
+#
+# ad_dbTickets = database.if_cached_else_refresh(True, False, "ad_dbTickets", lambda : None)
+# if ad_dbTickets == None:
+# raise Exception("Could not find cached dbTickets")
+#
+#
+# args = {}
+# args['hostname'] = "%s" % hostname
+# args['hostname_list'] = "%s" % hostname
+# args['loginbase'] = plc.siteId(hostname)
+#
+# m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
+# mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
+#
+# #print "merge"
+# merge = Merge( [node['hostname'] for node in l_nodes])
+# record_list = merge.run()
+# #print "rt"
+# rt = RT(record_list, ad_dbTickets, l_ticket_blacklist)
+# record_list = rt.run()
+# #print "diagnose"
+# diag = Diagnose(record_list)
+# diagnose_out = diag.run()
+# #print diagnose_out
+# #print "action"
+# action = Action(diagnose_out)
+# action.run()
+#
+# return True
def main():
MonRecord.__init__(self, data)
return
-class Action(MonRecord):
- def __init__(self, host, data):
- self.host = host
- MonRecord.__init__(self, data)
- return
-
- def deltaDays(self, delta):
- t = datetime.fromtimestamp(self.__dict__['time'])
- d = t + timedelta(delta)
- self.__dict__['time'] = time.mktime(d.timetuple())
-
-
%define taglevel 6
%define release %{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}}
+%global python_sitearch %( python -c "from distutils.sysconfig import get_python_lib; print get_python_lib(1)" )
+
Summary: Monitor backend scripts for server
Name: %{name}
Requires: curl
Requires: coreutils
Requires: openssh-clients
-Requires: PLCWWW >= 4.2
-Requires: BootCD >= 4.2
+Requires: perl-libwww-perl
Requires: MySQL-python
Requires: rt3 == 3.4.1
+Requires: PLCWWW >= 4.2
+Requires: BootCD >= 4.2
+
%description
Scripts for polling PLC, the node, and PCU status. Also a collection of
command-line utilities for querying the status database.
echo " * TODO: Setting up Monitor account in local MyPLC"
# TODO:
+mkdir -p $RPM_BUILD_ROOT/%{python_sitearch}/%{name}
+install -D -m 755 monitor $RPM_BUILD_ROOT/%{python_sitearch}/%{name}
+install -D -m 755 threadpool.py $RPM_BUILD_ROOT/%{python_sitearch}/threadpool.py
+
+install -D -m 755 monitor-default.conf $RPM_BUILD_ROOT/etc/monitor.conf
cp $RPM_BUILD_ROOT/usr/share/%{name}/monitorconfig-default.py $RPM_BUILD_ROOT/usr/share/%{name}/monitorconfig.py
%clean
/var/lib/%{name}
/var/www/cgi-bin/monitor
%{_sysconfdir}/cron.d/%{name}.cron
+%{python_sitearch}/threadpool.py
+%{python_sitearch}/%{name}
%post
echo "Post processing"
import database
import time
import mailer
-from www.printbadnodes import cmpCategoryVal
+from unified_model import cmpCategoryVal
import sys
import emailTxt
import string
-from policy import get_ticket_id, print_stats, close_rt_backoff, reboot_node
from rt import is_host_in_rt_tickets
import plc
+def get_ticket_id(record):
+    """Return the RT ticket id stored in `record`, or None if there is none.
+
+    The 'ticket_id' key is preferred; 'found_rt_ticket' is the fallback.
+    A key counts only when it is present and its value is neither None nor
+    the empty string.
+    """
+    for key in ('ticket_id', 'found_rt_ticket'):
+        # Compare by value: `is not ""` tests object identity, which is not
+        # guaranteed to hold for equal strings (e.g. unicode or unpickled "").
+        if key in record and record[key] is not None and record[key] != "":
+            return record[key]
+    return None
+
# Time to enforce policy
POLSLEEP = 7200
return up
+# Close the RT ticket named by args['ticket_id'] through
+# mailer.closeTicketViaRT() when that key is present and its value is
+# neither "" nor None, and call plc.enableSlices() and
+# plc.enableSliceCreation() for args['hostname'].
+# NOTE(review): indentation is lost in this view of the patch, so it cannot
+# be seen whether the two plc.enable* calls are inside the `if` or run
+# unconditionally -- confirm against the real file.
+def close_rt_backoff(args):
+ if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
+ mailer.closeTicketViaRT(args['ticket_id'],
+ "Ticket CLOSED automatically by SiteAssist.")
+ plc.enableSlices(args['hostname'])
+ plc.enableSliceCreation(args['hostname'])
+ return
+
+def reboot_node(args):
+    """Run reboot.reboot_policy() for args['hostname'] and return its result."""
+    return reboot.reboot_policy(args['hostname'], True, config.debug)
class Action:
def __init__(self, diagnose_out):
#print_stats("sites_observed", stats)
#print_stats("sites_diagnosed", stats)
#print_stats("nodes_diagnosed", stats)
- print_stats("sites_emailed", stats)
+ self.print_stats("sites_emailed", stats)
#print_stats("nodes_actedon", stats)
print string.join(stats['allsites'], ",")
import plc
api = plc.getAuthAPI()
from unified_model import *
-from monitor_policy import MINUP
+from const import MINUP
round = 1
externalState = {'round': round, 'nodes': {}}
database.dbDump(config.dbname, externalState)
fb = database.dbLoad('findbad')
-hn2lb = database.dbLoad("plcdb_hn2lb")
def getnodesup(nodelist):
up = 0
import struct
import reboot
+import time
+from monitor import database
from unified_model import PersistFlags
esc = struct.pack('i', 27)
RED = esc + "[1;31m"
return "%(hostname)-42s %(boot_state)8s %(current)5s %(pcu)6s %(key)10.10s... %(kernel)33s %(lastupdate)12s, %(lc)s, %(uptime)s" % node
-from model import *
-import database
-
-def node_end_record(node):
- act_all = database.dbLoad("act_all")
- if node not in act_all:
- del act_all
- return False
-
- if len(act_all[node]) == 0:
- del act_all
- return False
-
- a = Action(node, act_all[node][0])
- a.delField('rt')
- a.delField('found_rt_ticket')
- a.delField('second-mail-at-oneweek')
- a.delField('second-mail-at-twoweeks')
- a.delField('first-found')
- rec = a.get()
- rec['action'] = ["close_rt"]
- rec['category'] = "UNKNOWN"
- rec['stage'] = "monitor-end-record"
- rec['time'] = time.time() - 7*60*60*24
- act_all[node].insert(0,rec)
- database.dbDump("act_all", act_all)
- del act_all
- return True
-
def datetime_fromstr(str):
if '-' in str:
try:
import plc
api = plc.getAuthAPI()
-import database
+from monitor import *
+#import database
import reboot
import time
from model import *
from nodecommon import *
+from unified_model import node_end_record, PersistFlags
import util.file
diff_time(plcnode['last_contact']), plcnode['key'])
def fb_print_nodeinfo(fbnode):
+ pf = PersistFlags(fbnode['hostname'], 1, db='node_persistflags')
+ fbnode['last_change'] = diff_time(pf.last_changed)
print " Checked: ",
if 'checked' in fbnode:
print "%11.11s " % diff_time(fbnode['checked'])
else:
print "Unknown"
- print "\t state | ssh | pcu | bootcd | category | kernel"
+ print "\t state | ssh | pcu | bootcd | category | last change | kernel"
if fbnode['bootcd']:
fbnode['bootcd'] = fbnode['bootcd'].split()[-1]
else:
fbnode['state'] = "none"
if len(fbnode['kernel'].split()) > 2:
fbnode['kernel'] = fbnode['kernel'].split()[2]
- print "\t %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode
+ print "\t %(state)5s | %(ssh)5.5s | %(pcu)5.5s | %(bootcd)6.6s | %(category)8.8s | %(last_change)11s | %(kernel)s" % fbnode
def act_print_nodeinfo(actnode, header):
if header[0]:
util.file.setFileFromList(file, config.args)
os.system("./findbad.py --cachenodes --debug=0 --dbname=findbad --increment --nodelist %s" % file)
-fb = database.dbLoad("findbad")
-try:
- act_all = database.dbLoad("act_all")
-except:
- act_all = {}
-
for node in config.args:
config.node = node
+ fb = database.dbLoad("findbad")
plc_nodeinfo = api.GetNodes({'hostname': config.node}, None)[0]
fb_nodeinfo = fb['nodes'][config.node]['values']
plc_print_nodeinfo(plc_nodeinfo)
+ fb_nodeinfo['hostname'] = node
fb_print_nodeinfo(fb_nodeinfo)
if fb_nodeinfo['pcu'] == "PCU":
pcu = reboot.get_pcu_values(fb_nodeinfo['plcnode']['pcu_ids'][0])
if pcu: pcu_print_info(pcu, config.node)
+ try:
+ act_all = database.dbLoad("act_all")
+ except:
+ act_all = {}
if config.node in act_all and len(act_all[config.node]) > 0:
header = [True]
import glob
import os
from reboot import pcu_name
+import reboot
import util.file
import time
import re
#fb = {}
-fb = database.dbLoad("findbad")
+fb = {}
fbpcu = {}
class NoKeyException(Exception): pass
return (nodenames, pcunames)
def node_select(str_query, nodelist=None, fbdb=None):
+ global fb
+
hostnames = []
if str_query is None: return hostnames
#print str_query
dict_query = query_to_dict(str_query)
#print dict_query
- global fb
if fbdb is not None:
fb = fbdb
fb = database.dbLoad("findbad")
fbpcu = database.dbLoad("findbadpcus")
+ reboot.fb = fbpcu
if config.nodelist:
nodelist = util.file.getListFromFile(config.nodelist)
pculist = None
if config.select is not None and config.pcuselect is not None:
- nodelist = node_select(config.select, nodelist)
+ nodelist = node_select(config.select, nodelist, fb)
nodelist, pculist = pcu_select(config.pcuselect, nodelist)
elif config.select is not None:
- nodelist = node_select(config.select, nodelist)
+ nodelist = node_select(config.select, nodelist, fb)
elif config.pcuselect is not None:
nodelist, pculist = pcu_select(config.pcuselect, nodelist)
if parser == None:
parser = OptionParser()
- parser.set_defaults(node=None, site=None, nodelist=None, nodeselect=False, nodegroup=None)
+ parser.set_defaults(node=None, site=None, nodelist=None, nodeselect=None, nodegroup=None)
parser.add_option("", "--node", dest="node", metavar="hostname",
help="Provide a single node to operate on")
parser.add_option("", "--site", dest="site", metavar="site name",
def parse_args(parser):
class obj: pass
- o = obj()
(options, args) = parser.parse_args()
+ o = obj()
o.__dict__.update(options.__dict__)
o.__dict__['args'] = args
+ #config.update(o)
config.updatemodule(config, o)
return config
import plc
api = plc.getAuthAPI()
from unified_model import *
-from monitor_policy import MINUP
+from const import MINUP
round = 1
externalState = {'round': round, 'nodes': {}}
import reboot
import database
import string
-from www.printbadnodes import cmpCategoryVal
+from unified_model import cmpCategoryVal
import config
DAT="./monitor.dat"
def print_stats(key, stats):
if key in stats: print "%20s : %d" % (key, stats[key])
-def get_ticket_id(record):
- if 'ticket_id' in record and record['ticket_id'] is not "" and record['ticket_id'] is not None:
- return record['ticket_id']
- elif 'found_rt_ticket' in record and \
- record['found_rt_ticket'] is not "" and \
- record['found_rt_ticket'] is not None:
- return record['found_rt_ticket']
- else:
- return None
class Merge(Thread):
def __init__(self, l_merge, toRT):
# TODO: create class for each action below,
# allow for lists of actions to be performed...
-def close_rt_backoff(args):
- if 'ticket_id' in args and (args['ticket_id'] != "" and args['ticket_id'] != None):
- mailer.closeTicketViaRT(args['ticket_id'],
- "Ticket CLOSED automatically by SiteAssist.")
- plc.enableSlices(args['hostname'])
- plc.enableSliceCreation(args['hostname'])
- return
-def reboot_node(args):
- host = args['hostname']
- return reboot.reboot_policy(host, True, config.debug)
def reset_nodemanager(args):
os.system("ssh root@%s /sbin/service nm restart" % nodename)
#!/usr/bin/python
-import database
-import config
+from monitor import database
+from monitor import config
import parser as parsermodule
from www.printbadnodes import *
import urllib
import threading, popen2
import array, struct
-#from socket import *
-import socket
import plc
import base64
from subprocess import PIPE, Popen
else:
return None
-import database
-fb =database.dbLoad("findbadpcus")
+#import database
+from monitor import database
+fb = None
def get_pcu_values(pcu_id):
- # TODO: obviously, this shouldn't be loaded each time...
-
+ global fb
+ if fb == None:
+ # this shouldn't be loaded each time...
+ fb = database.dbLoad("findbadpcus")
+
try:
values = fb['nodes']["id_%s" % pcu_id]['values']
except:
import Queue
import time
import re
-import comon
import database
from threading import *
import monitorconfig
import plc
api = plc.getAuthAPI()
from unified_model import *
-from monitor_policy import MINUP
+from const import MINUP
round = 1
externalState = {'round': round, 'sites': {}}
request.callback(request, result)
del self.workRequests[request.requestID]
except Queue.Empty:
- print "queue empty"
break
def wait(self):
TODO:
- * make 'automate.sh' be the real automate script for cron.
- * install the right version of RT tools as a dependency?
- * www/*.py need appropriate access to database.py, config.py, monitorconfig.py, etc.
- * build cmdamt with g++ prior to packaging
* install openssh-server, passwd, perl-libwww-perl (for rt), rt-3.4.1, MySQL-python
* had to mount -t devpts devpts /dev/pts to get ssh to work inside the
chroot. also, disable the pam modules in /etc/pam.d/sshd
- * threadpool package.
- * reboot.py loads findbadpcus unconditionally.
- * unified_model loads findbad unconditionally
- * nodequery loads findbad unconditionally
- * remove deps on www.printbadnodes
- * change findbad.py default db name
- * nodebad loads plc_hn2lb unconditionally
- * nodeinfo loads act_all unconditionally
* A setup script of some kind would be nice that walked through :
- writing monitorconfig.py
* fix BayTechCtrlCUnibe expect script.
+ * separate modules into different, logical categories, and create a python
+ module as part of the install:
+ command line,
+ configuration,
+ policy,
+ data model,
+ object interfaces.
Lower priority:
* Add a more structured, 'automate' library of scripts and means of making
availble with PLC.
Done:
+ * nodebad loads plc_hn2lb unconditionally
+ * nodeinfo loads act_all unconditionally
+ * change findbad.py default db name
+ * remove deps on www.printbadnodes
+ * reboot.py loads findbadpcus unconditionally.
+ * nodequery loads findbad unconditionally
+ * unified_model loads findbad unconditionally
+
+ * threadpool package.
+ * build cmdamt with g++ prior to packaging
+
+ * www/*.py need appropriate access to database.py, config.py, monitorconfig.py, etc.
+ - need to convert monitor.conf into monitorconf.sh and monitorconf.php
+
* pull out global configuration information from various files, like rt_db,
mailer.py, auth.py, and any others. Create a single configuration file
from which all others pull.
#!/usr/bin/python
-import database
+from monitor import database
import plc
api = plc.getAuthAPI()
import mailer
import time
-from nodecommon import *
+from model import *
from const import *
import util.file
import config
# condition/penalty is applied, move to the next phase.
-fb = database.dbLoad("findbad")
+#fb = database.dbLoad("findbad")
class RT(object):
def __init__(self, ticket_id = None):
self.hostname = hostname
self.ticket = None
self.target = target
- if hostname in fb['nodes']:
- self.data = fb['nodes'][hostname]['values']
- else:
- raise Exception("Hostname not in scan database")
+ #if hostname in fb['nodes']:
+ # self.data = fb['nodes'][hostname]['values']
+ #else:
+ # raise Exception("Hostname not in scan database")
def stageIswaitforever(self):
if 'waitforever' in self.data['stage']:
def _get_contacts_for_condition(self):
pass
+class Action(MonRecord):
+    """A MonRecord that also remembers the host it was recorded for."""
+    def __init__(self, host, data):
+        # host is stored before the base initialiser runs, as in the
+        # original ordering
+        self.host = host
+        MonRecord.__init__(self, data)
+
+    def deltaDays(self, delta):
+        """Shift this record's 'time' field by `delta` days."""
+        shifted = datetime.fromtimestamp(self.__dict__['time']) + timedelta(delta)
+        self.__dict__['time'] = time.mktime(shifted.timetuple())
+
+def node_end_record(node):
+    """Prepend a "monitor-end-record" entry to `node`'s history in act_all.
+
+    Returns False, changing nothing, when `node` has no entry or an empty
+    history in the "act_all" database.  Otherwise the newest record is
+    copied with its ticket and mail bookkeeping fields removed, marked with
+    a "close_rt" action, inserted at the front of the history, and the
+    database is written back; the result is then True.
+    """
+    act_all = database.dbLoad("act_all")
+    if node not in act_all or len(act_all[node]) == 0:
+        return False
+
+    a = Action(node, act_all[node][0])
+    for field in ('rt', 'found_rt_ticket', 'second-mail-at-oneweek',
+                  'second-mail-at-twoweeks', 'first-found'):
+        a.delField(field)
+
+    rec = a.get()
+    rec['action'] = ["close_rt"]
+    rec['category'] = "UNKNOWN"
+    rec['stage'] = "monitor-end-record"
+    # timestamp the record seven days in the past
+    rec['time'] = time.time() - 7*60*60*24
+    act_all[node].insert(0, rec)
+    database.dbDump("act_all", act_all)
+    return True
+
if __name__ == "__main__":
#r = RT()
#r.email("test", "body of test message", ['database@cs.princeton.edu'])
#!/usr/bin/python
-import soltesz
-from config import config
-from optparse import OptionParser
+from monitor import database
+from monitor import config
import string
-#from HyperText.HTML import A, BR, IMG, TABLE, TR, TH, TD, EM, quote_body
-#from HyperText.Documents import Document
-
import sys
categories = {}
def main(sitefilter, catfilter, statefilter, comonfilter, nodeonlyfilter):
global fb
- db = soltesz.dbLoad(config.dbname)
- fb = soltesz.dbLoad("findbadpcus")
+ db = database.dbLoad(config.dbname)
+ fb = database.dbLoad("findbadpcus")
## Field widths used for printing
maxFieldLengths = { 'nodename' : -45,
else:
mynodeonly = None
- parser = OptionParser()
- parser.set_defaults(cmpdays=False,
- comon="sshstatus",
- fields="nodename,ping,ssh,pcu,category,state,comonstats,kernel,bootcd",
- dbname="findbad", # -070724-1",
- cmpping=False,
- cmpdns=False,
- cmploginbase=False,
- cmpssh=False,
- cmpcategory=False,
- cmpstate=False)
- parser.add_option("", "--fields", dest="fields", help="")
- parser.add_option("", "--dbname", dest="dbname", help="")
- parser.add_option("", "--days", dest="cmpdays", action="store_true", help="")
- parser.add_option("", "--ping", dest="cmpping", action="store_true", help="")
- parser.add_option("", "--dns", dest="cmpdns", action="store_true", help="")
- parser.add_option("", "--ssh", dest="cmpssh", action="store_true", help="")
- parser.add_option("", "--loginbase",dest="cmploginbase",action="store_true", help="")
- parser.add_option("", "--category", dest="cmpcategory", action="store_true", help="")
- parser.add_option("", "--kernel", dest="cmpkernel", action="store_true", help="")
- parser.add_option("", "--state", dest="cmpstate", action="store_true", help="")
- parser.add_option("", "--comon", dest="comon", help="")
- config = config(parser)
- config.parse_args()
+ config.cmpdays=False
+ config.comon="sshstatus"
+ config.fields="nodename,ping,ssh,pcu,category,state,comonstats,kernel,bootcd"
+ config.dbname="findbad"
+ config.cmpping=False
+ config.cmpdns=False
+ config.cmploginbase=False
+ config.cmpssh=False
+ config.cmpcategory=False
+
print "Content-Type: text/html\r\n"
if mynodeonly == None:
print "<html><body>\n"
import cgi
import cgitb;
-import soltesz
+from monitor import database
import time
cgitb.enable()
rows = ""
-fb = soltesz.dbLoad("findbad")
+fb = database.dbLoad("findbad")
packed_values = []
for mynode in fb['nodes'].keys():
fbnode = fb['nodes'][mynode]['values']