From 3f2acbd8b7761dae4bb02fedd3557e1003cafb43 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Tue, 2 Jun 2009 21:30:28 +0000 Subject: [PATCH] added comonquery command-line tool. added flush and clear commands to the beginning of each web entry point in controllers.py; I think this will help address the IntegrityErrors seen here and at PLE. moved plccache to local functions to speed invocation of some calls (nodequery) added several tags to bootman.py to help with new 3.0 BootManager issue. moved bootman import in monitor/database/info/interface.py due to import error. still need to investigate this added extra RPM checks to node environment checks. This isn't put in the db but the log files can be queried over time. --- comonquery.py | 152 +++++++++++++++++++++++ monitor/bootman.py | 11 +- monitor/common.py | 3 +- monitor/database/info/interface.py | 2 +- monitor/model.py | 3 +- monitor/scanapi.py | 4 + monitor/wrapper/plccache.py | 3 + nodequery.py | 13 +- nodesets.py | 8 +- web/MonitorWeb/monitorweb/controllers.py | 16 ++- 10 files changed, 196 insertions(+), 19 deletions(-) create mode 100755 comonquery.py diff --git a/comonquery.py b/comonquery.py new file mode 100755 index 0000000..72e5d13 --- /dev/null +++ b/comonquery.py @@ -0,0 +1,152 @@ +#!/usr/bin/python + + +import sys +from monitor import database +from monitor.common import * +from monitor.model import Record +import glob +import os +import traceback + +import time +import re +import string + +from monitor.wrapper import plc +api = plc.getAuthAPI() + +from monitor.util import file +from monitor import config + +from monitor.sources import comon + +default_fields="name,resptime,sshstatus,date,uptime,lastcotop,cpuspeed,memsize,disksize" + +class NoKeyException(Exception): pass + +def daysdown_print_nodeinfo(co_nodeinfo, hostname): + co_nodeinfo['hostname'] = hostname + co_nodeinfo['daysdown'] = Record.getStrDaysDown(co_nodeinfo) + co_nodeinfo['intdaysdown'] = Record.getDaysDown(co_nodeinfo) + + print "%(intdaysdown)5s %(hostname)-44s | %(state)10.10s | %(daysdown)s" % co_nodeinfo + +def co_print_nodeinfo(co_nodeinfo, hostname, fields=None): + + # co_nodeinfo['bootstate'] : unknown pattern + co_nodeinfo['name'] = hostname + + if 'uptime' in co_nodeinfo and co_nodeinfo['uptime'] != "null": + co_nodeinfo['uptime'] = diff_time(time.time()-float(co_nodeinfo['uptime'])) + + if 'date' in co_nodeinfo and co_nodeinfo['date'] != "null": + co_nodeinfo['date'] = diff_time(float(co_nodeinfo['date'])) + + if fields == default_fields.split(','): + + print "%(name)-40s %(sshstatus)5.5s %(resptime)6.6s %(lastcotop)6.6s %(uptime)s" % co_nodeinfo + else: + format = "" + for f in fields: + format += "%%(%s)s " % f + print format % co_nodeinfo + +def main(): + + from monitor import parser as parsermodule + parser = parsermodule.getParser() + + parser.set_defaults(node=None, + select=None, + list=None, + dns=False, + listkeys=False, + pcuselect=None, + nodelist=None, + daysdown=None, + fields=default_fields) + parser.add_option("", "--daysdown", dest="daysdown", action="store_true", + help="List the node state and days down...") + + parser.add_option("", "--select", dest="select", metavar="key=value", + help="List all nodes with the given key=value pattern") + parser.add_option("", "--fields", dest="fields", metavar="key,list,...", + help="a list of keys to display for each entry.") + parser.add_option("", "--list", dest="list", action="store_true", + help="Write only the hostnames as output.") + parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", + help="A list of nodes to bring out of debug mode.") + parser.add_option("", "--listkeys", dest="listkeys", action="store_true", + help="A list of nodes to bring out of debug mode.") + + parser.add_option("", "--dns", dest="dns", action="store_true", + help="A convenience query for dns values") + + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) + + #if config.fromtime: + # fb = None + #else: + # fb = None + + # lastcotop measures whether cotop is actually running. this is a better + # metric than sshstatus, or other values from CoMon + + COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \ + "table=table_nodeview&formatcsv" + if config.dns: + config.fields = "name,dns1udp,dns1tcp,dns2udp,dns2tcp" + config.select = "dns1udp>0||dns1tcp>0||dns2udp>0||dns2tcp>0" + + if config.fields == "all": + cotop_url = COMON_COTOPURL + else: + cotop_url = COMON_COTOPURL + "&dumpcols='%s'" % config.fields + + if config.select: + cotop_url = cotop_url + "&select='%s'" % config.select + + if config.listkeys: + cotop_url = COMON_COTOPURL + "&limit=1" + + cotop = comon.Comon() + cohash = cotop.coget(cotop_url) + + if config.nodelist: + nodelist = file.getListFromFile(config.nodelist) + else: + # NOTE: list of nodes should come from comon query. + nodelist = cohash.keys() + + print "%(name)-40s %(sshstatus)5.5s %(resptime)6.6s %(lastcotop)6.6s %(uptime)s" % { + 'name' : 'hostname', + 'sshstatus' : 'sshstatus', + 'resptime' : 'resptime', + 'lastcotop' : 'lastcotop', + 'uptime' : 'uptime'} + for node in nodelist: + config.node = node + + if node not in cohash: continue + + co_nodeinfo = cohash[node] + + if config.listkeys: + print "Primary keys available in the comon object:" + for key in co_nodeinfo.keys(): + print "\t",key + sys.exit(0) + + if config.list: + print node + else: + if config.daysdown: + daysdown_print_nodeinfo(co_nodeinfo, node) + else: + fields = config.fields.split(",") + co_print_nodeinfo(co_nodeinfo, node, fields) + +if __name__ == "__main__": + main() diff --git a/monitor/bootman.py b/monitor/bootman.py index 531f883..4693315 100755 --- a/monitor/bootman.py +++ b/monitor/bootman.py @@ -430,11 +430,16 @@ class DebugInterface: "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done", ]: sequences.update({n : "restart_bootmanager_rins"}) # repair_node_keys - sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"}) + for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done", + "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done", + ]: + sequences.update({n: "repair_node_keys"}) # conn.restart_node('reinstall') for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", @@ -459,6 +464,7 @@ class DebugInterface: "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done", "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done", "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done", + "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done", ]: sequences.update({n: "restart_node_boot"}) @@ -748,7 +754,8 @@ def restore(sitehist, hostname, config=None, forced_action=None): if conn.compare_and_repair_nodekeys(): # the keys either are in sync or were forced in sync. # so try to reboot the node again. - conn.restart_bootmanager('reinstall') + # TODO: why was this originally 'reinstall' instead of 'boot'?? + conn.restart_bootmanager('boot') pass else: # there was some failure to synchronize the keys. diff --git a/monitor/common.py b/monitor/common.py index 9878d52..da174d8 100644 --- a/monitor/common.py +++ b/monitor/common.py @@ -4,7 +4,7 @@ import struct from monitor import reboot from monitor import util from monitor import database -from monitor.wrapper import plc, plccache +from monitor.wrapper import plc from datetime import datetime, timedelta from monitor.model import Message @@ -187,6 +187,7 @@ def get_nodeset(config): Given the config values passed in, return the set of hostnames that it evaluates to. """ + from monitor.wrapper import plccache api = plc.getAuthAPI() l_nodes = plccache.l_nodes diff --git a/monitor/database/info/interface.py b/monitor/database/info/interface.py index 47c7553..7a41b89 100644 --- a/monitor/database/info/interface.py +++ b/monitor/database/info/interface.py @@ -1,4 +1,3 @@ -from monitor import bootman # debug nodes from monitor import reboot from monitor.common import * @@ -162,6 +161,7 @@ class SiteInterface(HistorySiteRecord): self.db.message_status = "new" def runBootManager(self, hostname): + from monitor import bootman print "attempting BM reboot of %s" % hostname ret = "" try: diff --git a/monitor/model.py b/monitor/model.py index 2f2f5e3..5d0fc05 100755 --- a/monitor/model.py +++ b/monitor/model.py @@ -2,7 +2,7 @@ from monitor import database -from monitor.wrapper import plc, plccache +from monitor.wrapper import plc from monitor.wrapper import mailer import time @@ -413,6 +413,7 @@ class Target: class Record(object): def __init__(self, hostname, data): + from monitor.wrapper import plccache self.hostname = hostname self.data = data self.plcdb_hn2lb = plccache.plcdb_hn2lb diff --git a/monitor/scanapi.py b/monitor/scanapi.py index 667c504..35f24ac 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -212,6 +212,7 @@ class ScanNodeInternal(ScanInterface): echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",' echo ' "rpm_version":"'`rpm -q NodeManager`'",' + echo ' "rpm_versions":"'`rpm -q -a`'",' echo "}" EOF """) @@ -227,6 +228,7 @@ EOF """) 'fs_status' : '', 'dns_status' : '', 'rpm_version' : '', + 'rpm_versions' : '', 'princeton_comon_dir' : "", 'princeton_comon_running' : "", 'princeton_comon_procs' : "", 'ssh_portused' : None}) @@ -234,6 +236,8 @@ EOF """) print traceback.print_exc() sys.exit(1) + print "ALLVERSIONS: %s %s" % (nodename, values['rpm_versions']) + print "RPMVERSION: %s %s" % (nodename, values['rpm_version']) ### RUN SSH ###################### b_getbootcd_id = True diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py index f92fa85..fea4c72 100755 --- a/monitor/wrapper/plccache.py +++ b/monitor/wrapper/plccache.py @@ -62,6 +62,9 @@ plcdb_lb2hn = None plcdb_id2lb = None def init(): + import traceback + print "IMPORTING PLCCACHE: ", + traceback.print_stack() global l_sites global l_nodes global l_pcus diff --git a/nodequery.py b/nodequery.py index e9001a6..738e58d 100755 --- a/nodequery.py +++ b/nodequery.py @@ -13,11 +13,11 @@ import time import re import string -from monitor.wrapper import plc, plccache +from monitor.wrapper import plc api = plc.getAuthAPI() -from monitor.database.info.model import FindbadNodeRecord, FindbadPCURecord, session -from monitor import util +from monitor.database.info.model import HistoryNodeRecord, FindbadNodeRecord, FindbadPCURecord, session +from monitor.util import file as utilfile from monitor import config @@ -383,13 +383,12 @@ def main(): fb = None if config.nodelist: - nodelist = util.file.getListFromFile(config.nodelist) + nodelist = utilfile.getListFromFile(config.nodelist) else: # NOTE: list of nodes should come from findbad db. Otherwise, we # don't know for sure that there's a record in the db.. - plcnodes = plccache.l_nodes - nodelist = [ node['hostname'] for node in plcnodes ] - #nodelist = ['planetlab-1.cs.princeton.edu'] + fbquery = HistoryNodeRecord.query.all() + nodelist = [ n.hostname for n in fbquery ] pculist = None if config.select is not None and config.pcuselect is not None: diff --git a/nodesets.py b/nodesets.py index ea69d6b..6461dfb 100755 --- a/nodesets.py +++ b/nodesets.py @@ -3,8 +3,8 @@ import sys import os from sets import Set -import parser as parsermodule -import util.file +from monitor import parser as parsermodule +from monitor.util import file def main(): parser = parsermodule.getParser() @@ -17,8 +17,8 @@ def main(): f1 = config.args[0] f2 = config.args[1] - s1 = util.file.getListFromFile(f1) - s2 = util.file.getListFromFile(f2) + s1 = file.getListFromFile(f1) + s2 = file.getListFromFile(f2) s = nodesets(config.operation, s1, s2) diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index 9bdb912..e2fb9bd 100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -15,6 +15,7 @@ from monitor_xmlrpc import MonitorXmlrpcServer from monitor import reboot from monitor import scanapi +import time from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb @@ -155,7 +156,6 @@ def prep_node_for_display(node): class Root(controllers.RootController, MonitorXmlrpcServer): @expose(template="monitorweb.templates.welcome") def index(self): - import time # log.debug("Happy TurboGears Controller Responding For Duty") flash("Your application is now running") return dict(now=time.ctime()) @@ -173,7 +173,10 @@ class Root(controllers.RootController, MonitorXmlrpcServer): @expose(template="monitorweb.templates.nodelist") def node(self, filter='boot'): - import time + print "NODE------------------" + print "befor-len: ", len( [ i for i in session] ) + session.flush(); session.clear() + print "after-len: ", len( [ i for i in session] ) fbquery = FindbadNodeRecord.get_all_latest() query = [] filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0, @@ -428,7 +431,10 @@ class Root(controllers.RootController, MonitorXmlrpcServer): @expose(template="monitorweb.templates.pculist") def pcu(self, filter='all'): - import time + print "PCUVIEW------------------" + print "befor-len: ", len( [ i for i in session] ) + session.flush(); session.clear() + print "after-len: ", len( [ i for i in session] ) fbquery = FindbadPCURecord.get_all_latest() query = [] filtercount = {'ok' : 0, 'NetDown': 0, 'Not_Run' : 0, 'pending' : 0, 'all' : 0} @@ -475,6 +481,10 @@ class Root(controllers.RootController, MonitorXmlrpcServer): @expose(template="monitorweb.templates.sitelist") def site(self, filter='all'): + print "SITE------------------" + print "befor-len: ", len( [ i for i in session] ) + session.flush(); session.clear() + print "after-len: ", len( [ i for i in session] ) filtercount = {'good' : 0, 'down': 0, 'online':0, 'offline' : 0, 'new' : 0, 'pending' : 0, 'all' : 0} fbquery = HistorySiteRecord.query.all() query = [] -- 2.43.0