--- /dev/null
+#!/usr/bin/python
+
+
+import sys
+import time
+
+from monitor.common import *
+from monitor.model import Record
+
+from monitor.wrapper import plc
+api = plc.getAuthAPI()
+
+from monitor.util import file
+from monitor import config
+
+from monitor.sources import comon
+
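+# default CoMon columns shown for each node when --fields is not given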
+default_fields="name,resptime,sshstatus,date,uptime,lastcotop,cpuspeed,memsize,disksize"
+
+class NoKeyException(Exception): pass
+
+def daysdown_print_nodeinfo(co_nodeinfo, hostname):
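+ """Print a one-line summary for one CoMon record: days down, hostname, and state."""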
+ co_nodeinfo['hostname'] = hostname
+ co_nodeinfo['daysdown'] = Record.getStrDaysDown(co_nodeinfo)
+ co_nodeinfo['intdaysdown'] = Record.getDaysDown(co_nodeinfo)
+
+ print "%(intdaysdown)5s %(hostname)-44s | %(state)10.10s | %(daysdown)s" % co_nodeinfo
+
+def co_print_nodeinfo(co_nodeinfo, hostname, fields=None):
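+ """Print one CoMon record, using either the default column layout or a caller-supplied field list."""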
+
+ # NOTE: co_nodeinfo['bootstate'] has no known/stable pattern, so it is left untouched here.
+ co_nodeinfo['name'] = hostname
+
+ if 'uptime' in co_nodeinfo and co_nodeinfo['uptime'] != "null":
+ co_nodeinfo['uptime'] = diff_time(time.time()-float(co_nodeinfo['uptime']))
+
+ if 'date' in co_nodeinfo and co_nodeinfo['date'] != "null":
+ co_nodeinfo['date'] = diff_time(float(co_nodeinfo['date']))
+
+ if fields == default_fields.split(','):
+ print "%(name)-40s %(sshstatus)5.5s %(resptime)6.6s %(lastcotop)6.6s %(uptime)s" % co_nodeinfo
+ else:
+ # build a format string like "%(name)s %(uptime)s " from the requested fields
+ fmt = ""
+ for f in fields:
+ fmt += "%%(%s)s " % f
+ print fmt % co_nodeinfo
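+ # Example (illustrative): fields=['name','uptime'] builds "%(name)s %(uptime)s ",
+ # so a node prints as, e.g., "planetlab-1.cs.princeton.edu <uptime>".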
+
+def main():
+
+ from monitor import parser as parsermodule
+ parser = parsermodule.getParser()
+
+ parser.set_defaults(node=None,
+ select=None,
+ list=None,
+ dns=False,
+ listkeys=False,
+ pcuselect=None,
+ nodelist=None,
+ daysdown=None,
+ fields=default_fields)
+ parser.add_option("", "--daysdown", dest="daysdown", action="store_true",
+ help="List the node state and days down...")
+
+ parser.add_option("", "--select", dest="select", metavar="key=value",
+ help="List all nodes with the given key=value pattern")
+ parser.add_option("", "--fields", dest="fields", metavar="key,list,...",
+ help="a list of keys to display for each entry.")
+ parser.add_option("", "--list", dest="list", action="store_true",
+ help="Write only the hostnames as output.")
+ parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt",
+ help="A list of nodes to bring out of debug mode.")
+ parser.add_option("", "--listkeys", dest="listkeys", action="store_true",
+ help="List the keys available in the comon record, then exit.")
+
+ parser.add_option("", "--dns", dest="dns", action="store_true",
+ help="A convenience query for dns values")
+
+ parser = parsermodule.getParser(['defaults'], parser)
+ config = parsermodule.parse_args(parser)
+
+
+ # 'lastcotop' measures whether cotop is actually running; this is a better
+ # liveness metric than 'sshstatus' or the other values reported by CoMon.
+
+ COMON_COTOPURL= "http://summer.cs.princeton.edu/status/tabulator.cgi?" + \
+ "table=table_nodeview&formatcsv"
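+ # Illustrative example of a final query URL (parameter syntax assumed from the
+ # strings built below):
+ #   .../tabulator.cgi?table=table_nodeview&formatcsv&dumpcols='name,resptime'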
+ if config.dns:
+ config.fields = "name,dns1udp,dns1tcp,dns2udp,dns2tcp"
+ config.select = "dns1udp>0||dns1tcp>0||dns2udp>0||dns2tcp>0"
+
+ if config.fields == "all":
+ cotop_url = COMON_COTOPURL
+ else:
+ cotop_url = COMON_COTOPURL + "&dumpcols='%s'" % config.fields
+
+ if config.select:
+ cotop_url = cotop_url + "&select='%s'" % config.select
+
+ if config.listkeys:
+ cotop_url = COMON_COTOPURL + "&limit=1"
+
+ cotop = comon.Comon()
+ cohash = cotop.coget(cotop_url)
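+ # cohash maps hostname -> {column: value} (shape inferred from its use below)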
+
+ if config.nodelist:
+ nodelist = file.getListFromFile(config.nodelist)
+ else:
+ # NOTE: list of nodes should come from comon query.
+ nodelist = cohash.keys()
+
+ # print a header row, but only for the default column layout
+ if not config.list and not config.daysdown and config.fields == default_fields:
+ print "%(name)-40s %(sshstatus)5.5s %(resptime)6.6s %(lastcotop)6.6s %(uptime)s" % {
+ 'name' : 'hostname',
+ 'sshstatus' : 'sshstatus',
+ 'resptime' : 'resptime',
+ 'lastcotop' : 'lastcotop',
+ 'uptime' : 'uptime'}
+ for node in nodelist:
+ config.node = node
+
+ if node not in cohash: continue
+
+ co_nodeinfo = cohash[node]
+
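+ # --listkeys: print the column names available in the first record, then exit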
+ if config.listkeys:
+ print "Primary keys available in the comon object:"
+ for key in co_nodeinfo.keys():
+ print "\t",key
+ sys.exit(0)
+
+ if config.list:
+ print node
+ else:
+ if config.daysdown:
+ daysdown_print_nodeinfo(co_nodeinfo, node)
+ else:
+ fields = config.fields.split(",")
+ co_print_nodeinfo(co_nodeinfo, node, fields)
+
+if __name__ == "__main__":
+ main()
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
"bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
]:
sequences.update({n : "restart_bootmanager_rins"})
# repair_node_keys
- sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
+ for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
+ "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
+ ]:
+ sequences.update({n: "repair_node_keys"})
# conn.restart_node('reinstall')
for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
"bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
+ "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
]:
sequences.update({n: "restart_node_boot"})
if conn.compare_and_repair_nodekeys():
# the keys either are in sync or were forced in sync.
# so try to reboot the node again.
- conn.restart_bootmanager('reinstall')
+ # TODO: why was this originally 'reinstall' instead of 'boot'??
+ conn.restart_bootmanager('boot')
pass
else:
# there was some failure to synchronize the keys.
from monitor import reboot
from monitor import util
from monitor import database
-from monitor.wrapper import plc, plccache
+from monitor.wrapper import plc
from datetime import datetime, timedelta
from monitor.model import Message
Given the config values passed in, return the set of hostnames that it
evaluates to.
"""
+ from monitor.wrapper import plccache
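+    # NOTE: imported inside the function so that importing this module does not
+    # trigger plccache's initialization (cf. the instrumentation in plccache.init).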
api = plc.getAuthAPI()
l_nodes = plccache.l_nodes
-from monitor import bootman # debug nodes
from monitor import reboot
from monitor.common import *
self.db.message_status = "new"
def runBootManager(self, hostname):
+ from monitor import bootman
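+ # deferred import: keeps bootman from being loaded at module import time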
print "attempting BM reboot of %s" % hostname
ret = ""
try:
from monitor import database
-from monitor.wrapper import plc, plccache
+from monitor.wrapper import plc
from monitor.wrapper import mailer
import time
class Record(object):
def __init__(self, hostname, data):
+ from monitor.wrapper import plccache
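+ # lazy import, mirroring the other call sites that defer loading plccache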
self.hostname = hostname
self.data = data
self.plcdb_hn2lb = plccache.plcdb_hn2lb
echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",'
echo ' "princeton_comon_procs":"'`vps ax | grep $ID | grep -v grep | wc -l`'",'
echo ' "rpm_version":"'`rpm -q NodeManager`'",'
+ echo ' "rpm_versions":"'`rpm -q -a`'",'
echo "}"
EOF """)
'fs_status' : '',
'dns_status' : '',
'rpm_version' : '',
+ 'rpm_versions' : '',
'princeton_comon_dir' : "",
'princeton_comon_running' : "",
'princeton_comon_procs' : "", 'ssh_portused' : None})
 traceback.print_exc()
sys.exit(1)
+ print "ALLVERSIONS: %s %s" % (nodename, values['rpm_versions'])
+
print "RPMVERSION: %s %s" % (nodename, values['rpm_version'])
### RUN SSH ######################
b_getbootcd_id = True
plcdb_id2lb = None
def init():
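+ # temporary instrumentation: show which call path triggers plccache initialization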
+ import traceback
+ print "IMPORTING PLCCACHE: ",
+ traceback.print_stack()
global l_sites
global l_nodes
global l_pcus
import re
import string
-from monitor.wrapper import plc, plccache
+from monitor.wrapper import plc
api = plc.getAuthAPI()
-from monitor.database.info.model import FindbadNodeRecord, FindbadPCURecord, session
-from monitor import util
+from monitor.database.info.model import HistoryNodeRecord, FindbadNodeRecord, FindbadPCURecord, session
+from monitor.util import file as utilfile
from monitor import config
fb = None
if config.nodelist:
- nodelist = util.file.getListFromFile(config.nodelist)
+ nodelist = utilfile.getListFromFile(config.nodelist)
else:
# NOTE: list of nodes should come from findbad db. Otherwise, we
# don't know for sure that there's a record in the db..
- plcnodes = plccache.l_nodes
- nodelist = [ node['hostname'] for node in plcnodes ]
- #nodelist = ['planetlab-1.cs.princeton.edu']
+ fbquery = HistoryNodeRecord.query.all()
+ nodelist = [ n.hostname for n in fbquery ]
pculist = None
if config.select is not None and config.pcuselect is not None:
import sys
import os
from sets import Set
-import parser as parsermodule
-import util.file
+from monitor import parser as parsermodule
+from monitor.util import file
def main():
parser = parsermodule.getParser()
f1 = config.args[0]
f2 = config.args[1]
- s1 = util.file.getListFromFile(f1)
- s2 = util.file.getListFromFile(f2)
+ s1 = file.getListFromFile(f1)
+ s2 = file.getListFromFile(f2)
s = nodesets(config.operation, s1, s2)
from monitor import reboot
from monitor import scanapi
+import time
from monitor.wrapper.plccache import plcdb_id2lb as site_id2lb
from monitor.wrapper.plccache import plcdb_hn2lb as site_hn2lb
class Root(controllers.RootController, MonitorXmlrpcServer):
@expose(template="monitorweb.templates.welcome")
def index(self):
- import time
# log.debug("Happy TurboGears Controller Responding For Duty")
flash("Your application is now running")
return dict(now=time.ctime())
@expose(template="monitorweb.templates.nodelist")
def node(self, filter='boot'):
- import time
+ print "NODE------------------"
+ # flush pending changes and clear the session cache so this request re-reads from the DB
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
fbquery = FindbadNodeRecord.get_all_latest()
query = []
filtercount = {'down' : 0, 'boot': 0, 'debug' : 0, 'diagnose' : 0, 'disabled': 0,
@expose(template="monitorweb.templates.pculist")
def pcu(self, filter='all'):
- import time
+ print "PCUVIEW------------------"
+ # as above: clear the session cache so stale records are not reused
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
fbquery = FindbadPCURecord.get_all_latest()
query = []
filtercount = {'ok' : 0, 'NetDown': 0, 'Not_Run' : 0, 'pending' : 0, 'all' : 0}
@expose(template="monitorweb.templates.sitelist")
def site(self, filter='all'):
+ print "SITE------------------"
+ # as above: clear the session cache so stale records are not reused
+ print "before-len: ", len( [ i for i in session] )
+ session.flush(); session.clear()
+ print "after-len: ", len( [ i for i in session] )
filtercount = {'good' : 0, 'down': 0, 'online':0, 'offline' : 0, 'new' : 0, 'pending' : 0, 'all' : 0}
fbquery = HistorySiteRecord.query.all()
query = []