From bbdd1222ad57a915bbb3d872a1cf1da759ef85e3 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Thu, 18 Dec 2008 00:57:49 +0000 Subject: [PATCH] modified *list templates with abbreviated information. consolidated *view templates into a single template, pcuview. should rename it. updated findbad/findbadpcu to update the global round only after data collection is complete. this solves the 'no information' errors when a new scan is started. --- findbad.py | 222 ++++++++------- findbadpcu.py | 60 ++-- nodebad.py | 6 + nodecommon.py | 14 + nodequery.py | 37 ++- pcubad.py | 2 + pcucontrol/models/racadm.py | 6 +- pcucontrol/reboot.py | 70 ++++- web/MonitorWeb/monitorweb/controllers.py | 187 ++++++++++--- .../monitorweb/static/css/style.css | 14 +- .../monitorweb/templates/actionlist.kid | 3 +- .../monitorweb/templates/nodelist.kid | 17 +- .../monitorweb/templates/nodeview.kid | 45 ++- .../monitorweb/templates/pculist.kid | 18 +- .../monitorweb/templates/pcuview.kid | 262 ++++++++++-------- .../monitorweb/templates/sitelist.kid | 22 +- .../monitorweb/templates/siteview.kid | 24 +- 17 files changed, 660 insertions(+), 349 deletions(-) diff --git a/findbad.py b/findbad.py index c7449d2..4d1beed 100755 --- a/findbad.py +++ b/findbad.py @@ -19,6 +19,7 @@ from monitor.wrapper import plc, plccache from nodequery import verify,query_to_dict,node_select import traceback +from nodecommon import nmap_port_status #print "starting sqlfindbad.py" # QUERY all nodes. @@ -35,6 +36,19 @@ round = 1 global_round = round count = 0 +def collectNMAP(nodename, cohash): + #### RUN NMAP ############################### + values = {} + nmap = util.command.CMD() + print "nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename + (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,80,806 %s | grep Host:" % nodename) + # NOTE: an empty / error value for oval, will still work. 
+ (values['port_status'], continue_probe) = nmap_port_status(oval) + + values['date_checked'] = datetime.now() + + return (nodename, values) + def collectPingAndSSH(nodename, cohash): ### RUN PING ###################### ping = command.CMD() @@ -45,9 +59,9 @@ def collectPingAndSSH(nodename, cohash): if oval == "": # An error occurred - values['ping'] = "NOPING" + values['ping_status'] = False else: - values['ping'] = "PING" + values['ping_status'] = True try: for port in [22, 806]: @@ -55,13 +69,13 @@ def collectPingAndSSH(nodename, cohash): (oval, errval) = ssh.run_noexcept2(""" <<\EOF echo "{" - echo ' "kernel":"'`uname -a`'",' + echo ' "kernel_version":"'`uname -a`'",' echo ' "bmlog":"'`ls /tmp/bm.log`'",' - echo ' "bootcd":"'`cat /mnt/cdrom/bootme/ID`'",' - echo ' "nm":"'`ps ax | grep nm.py | grep -v grep`'",' - echo ' "readonlyfs":"'`touch /var/log/monitor 2>&1`'",' - echo ' "dns":"'`host boot.planet-lab.org 2>&1`'",' - echo ' "princeton_comon":"'`ls -d /vservers/princeton_comon`'",' + echo ' "bootcd_version":"'`cat /mnt/cdrom/bootme/ID`'",' + echo ' "nm_status":"'`ps ax | grep nm.py | grep -v grep`'",' + echo ' "fs_status":"'`touch /var/log/monitor 2>&1`'",' + echo ' "dns_status":"'`host boot.planet-lab.org 2>&1`'",' + echo ' "princeton_comon_dir":"'`ls -d /vservers/princeton_comon`'",' ID=`grep princeton_comon /etc/passwd | awk -F : '{if ( $3 > 500 ) { print $3}}'` echo ' "princeton_comon_running":"'`ls -d /proc/virtual/$ID`'",' @@ -69,20 +83,20 @@ def collectPingAndSSH(nodename, cohash): echo "}" EOF """) - values['ssherror'] = errval + values['ssh_error'] = errval if len(oval) > 0: #print "OVAL: %s" % oval values.update(eval(oval)) - values['sshport'] = port + values['ssh_portused'] = port break else: - values.update({'kernel': "", 'bmlog' : "", 'bootcd' : '', - 'nm' : '', - 'readonlyfs' : '', - 'dns' : '', - 'princeton_comon' : "", + values.update({'kernel_version': "", 'bmlog' : "", 'bootcd_version' : '', + 'nm_status' : '', + 'fs_status' : '', + 
'dns_status' : '', + 'princeton_comon_dir' : "", 'princeton_comon_running' : "", - 'princeton_comon_procs' : "", 'sshport' : None}) + 'princeton_comon_procs' : "", 'ssh_portused' : None}) except: print traceback.print_exc() sys.exit(1) @@ -94,79 +108,79 @@ EOF """) #errval = "" #(oval, errval) = ssh.run_noexcept('echo `uname -a ; ls /tmp/bm.log`') - oval = values['kernel'] + oval = values['kernel_version'] if "2.6.17" in oval or "2.6.2" in oval: - values['ssh'] = 'SSH' - values['category'] = 'PROD' + values['ssh_status'] = True + values['observed_category'] = 'PROD' if "bm.log" in values['bmlog']: - values['state'] = 'DEBUG' + values['observed_status'] = 'DEBUG' else: - values['state'] = 'BOOT' + values['observed_status'] = 'BOOT' elif "2.6.12" in oval or "2.6.10" in oval: - values['ssh'] = 'SSH' - values['category'] = 'OLDPROD' + values['ssh_status'] = True + values['observed_category'] = 'OLDPROD' if "bm.log" in values['bmlog']: - values['state'] = 'DEBUG' + values['observed_status'] = 'DEBUG' else: - values['state'] = 'BOOT' + values['observed_status'] = 'BOOT' # NOTE: on 2.6.8 kernels, with 4.2 bootstrapfs, the chroot command fails. I have no idea why. elif "2.4" in oval or "2.6.8" in oval: b_getbootcd_id = False - values['ssh'] = 'SSH' - values['category'] = 'OLDBOOTCD' - values['state'] = 'DEBUG' + values['ssh_status'] = True + values['observed_category'] = 'OLDBOOTCD' + values['observed_status'] = 'DEBUG' elif oval != "": - values['ssh'] = 'SSH' - values['category'] = 'UNKNOWN' + values['ssh_status'] = True + values['observed_category'] = 'UNKNOWN' if "bm.log" in values['bmlog']: - values['state'] = 'DEBUG' + values['observed_status'] = 'DEBUG' else: - values['state'] = 'BOOT' + values['observed_status'] = 'BOOT' else: # An error occurred. 
b_getbootcd_id = False - values['ssh'] = 'NOSSH' - values['category'] = 'ERROR' - values['state'] = 'DOWN' + values['ssh_status'] = False + values['observed_category'] = 'ERROR' + values['observed_status'] = 'DOWN' val = errval.strip() - values['ssherror'] = val - values['kernel'] = "" + values['ssh_error'] = val + values['kernel_version'] = "" - #values['kernel'] = val + #values['kernel_version'] = val if b_getbootcd_id: # try to get BootCD for all nodes that are not 2.4 nor inaccessible #(oval, errval) = ssh.run_noexcept('cat /mnt/cdrom/bootme/ID') - oval = values['bootcd'] + oval = values['bootcd_version'] if "BootCD" in oval: - values['bootcd'] = oval + values['bootcd_version'] = oval if "v2" in oval and \ ( nodename is not "planetlab1.cs.unc.edu" and \ nodename is not "planetlab2.cs.unc.edu" ): - values['category'] = 'OLDBOOTCD' + values['observed_category'] = 'OLDBOOTCD' else: - values['bootcd'] = "" + values['bootcd_version'] = "" else: - values['bootcd'] = "" + values['bootcd_version'] = "" # TODO: get bm.log for debug nodes. 
# 'zcat /tmp/bm.log' #(oval, errval) = ssh.run_noexcept('ps ax | grep nm.py | grep -v grep') - oval = values['nm'] + oval = values['nm_status'] if "nm.py" in oval: - values['nm'] = "Y" + values['nm_status'] = "Y" else: - values['nm'] = "N" + values['nm_status'] = "N" continue_slice_check = True #(oval, errval) = ssh.run_noexcept('ls -d /vservers/princeton_comon') - oval = values['princeton_comon'] - if "princeton_comon" in oval: - values['princeton_comon'] = True + oval = values['princeton_comon_dir'] + if "princeton_comon_dir" in oval: + values['princeton_comon_dir'] = True else: - values['princeton_comon'] = False + values['princeton_comon_dir'] = False continue_slice_check = False if continue_slice_check: @@ -189,9 +203,9 @@ EOF """) if nodename in cohash: - values['comonstats'] = cohash[nodename] + values['comon_stats'] = cohash[nodename] else: - values['comonstats'] = {'resptime': '-1', + values['comon_stats'] = {'resptime': '-1', 'uptime': '-1', 'sshstatus': '-1', 'lastcotop': '-1', @@ -208,7 +222,11 @@ EOF """) except: traceback.print_exc() plc_lock.release() - values['plcnode'] = d_node + values['plc_node_stats'] = d_node + + ##### NMAP ################### + (n, v) = collectNMAP(nodename, None) + values.update(v) ### GET PLC PCU ###################### site_id = -1 @@ -220,7 +238,7 @@ EOF """) site_id = d_node['site_id'] - values['pcu'] = d_pcu + values['plc_pcuid'] = d_pcu ### GET PLC SITE ###################### plc_lock.acquire() @@ -234,8 +252,8 @@ EOF """) traceback.print_exc() plc_lock.release() - values['plcsite'] = d_site - values['date_checked'] = time.time() + values['plc_site_stats'] = d_site + values['date_checked'] = datetime.now() except: print traceback.print_exc() @@ -248,9 +266,9 @@ def recordPingAndSSH(request, result): try: if values is not None: - fbsync = FindbadNodeRecordSync.findby_or_create(hostname="global", - if_new_set={'round' : global_round}) - global_round = fbsync.round + #fbsync = 
FindbadNodeRecordSync.findby_or_create(hostname="global", + # if_new_set={'round' : global_round}) + #global_round = fbsync.round fbnodesync = FindbadNodeRecordSync.findby_or_create(hostname=nodename, if_new_set={'round' : global_round}) @@ -262,43 +280,38 @@ def recordPingAndSSH(request, result): fbrec = FindbadNodeRecord.findby_or_create( round=global_round, hostname=nodename) - before = fbrec.to_dict() - print "BEFORE, ", before - fbrec.flush() - time.sleep(2) - print "Setting VALUES" - fbrec.set( date_checked=datetime.fromtimestamp(values['date_checked']), - loginbase=values['loginbase'], - kernel_version=values['kernel'], - bootcd_version=values['bootcd'], - nm_status=values['nm'], - fs_status=values['readonlyfs'], - dns_status=values['dns'], - princeton_comon_dir=values['princeton_comon'], - princeton_comon_running=values['princeton_comon_running'], - princeton_comon_procs=values['princeton_comon_procs'], - plc_node_stats = values['plcnode'], - plc_site_stats = values['plcsite'], - plc_pcuid = values['pcu'], - comon_stats = values['comonstats'], - ping_status = (values['ping'] == "PING"), - ssh_portused = values['sshport'], - ssh_status = (values['ssh'] == "SSH"), - ssh_error = values['ssherror'], - observed_status = values['state'], - observed_category = values['category']) - after = fbrec.to_dict() - print "AFTER , ", after - - for v in before.keys(): - if before[v] == after[v]: - print "SAME FOR KEY %s" % v - print "%s : %s\t%s" % ( v, before[v], after[v] ) + + fbrec.set( **values ) + #date_checked=values['date_checked'], + #loginbase=values['loginbase'], + #kernel_version=values['kernel_version'], + #bootcd_version=values['bootcd_version'], + #nm_status=values['nm_status'], + #fs_status=values['fs_status'], + #dns_status=values['dns_status'], + #princeton_comon_dir=values['princeton_comon_dir'], + #princeton_comon_running=values['princeton_comon_running'], + #princeton_comon_procs=values['princeton_comon_procs'], + #plc_node_stats = 
values['plc_node_stats'], + #plc_site_stats = values['plc_site_stats'], + #plc_pcuid = values['plc_pcuid'], + #comon_stats = values['comon_stats'], + #ping_status = values['ping_status'], + #ssh_portused = values['ssh_portused'], + #ssh_status = values['ssh_status'], + #ssh_error = values['ssh_error'], + #observed_status = values['observed_status'], + #observed_category = values['observed_category']) + + #for v in before.keys(): + # if before[v] == after[v]: + # print "SAME FOR KEY %s" % v + # print "%s : %s\t%s" % ( v, before[v], after[v] ) fbrec.flush() fbnodesync.round = global_round fbnodesync.flush() - fbsync.flush() + #fbsync.flush() count += 1 print "%d %s %s" % (count, nodename, values) @@ -312,6 +325,16 @@ def handle_exception(request, result): for i in result: print "Result: %s" % i +def externalprobe(hostname): + try: + (nodename, values) = collectNMAP(hostname, {}) + recordPingAndSSH(None, (nodename, values)) + session.flush() + return True + except: + print traceback.print_exc() + return False + def probe(hostname): try: (nodename, values) = collectPingAndSSH(hostname, {}) @@ -335,7 +358,7 @@ def checkAndRecordState(l_nodes, cohash): node_round = fbnodesync.round fbnodesync.flush() - if node_round < global_round: + if node_round < global_round or config.force: # recreate node stats when refreshed #print "%s" % nodename req = threadpool.WorkRequest(collectPingAndSSH, [nodename, cohash], {}, @@ -378,9 +401,6 @@ def main(): if config.increment: # update global round number to force refreshes across all nodes global_round += 1 - fbsync.round = global_round - - fbsync.flush() cotop = comon.Comon() # lastcotop measures whether cotop is actually running. 
this is a better @@ -417,6 +437,11 @@ def main(): checkAndRecordState(l_nodes, cohash) + if config.increment: + # update global round number to force refreshes across all nodes + fbsync.round = global_round + fbsync.flush() + return 0 @@ -425,13 +450,16 @@ if __name__ == '__main__': parser = parsermodule.getParser(['nodesets']) - parser.set_defaults( increment=False, dbname="findbad", cachenodes=False) + parser.set_defaults( increment=False, dbname="findbad", cachenodes=False, + force=False,) parser.add_option("", "--cachenodes", action="store_true", help="Cache node lookup from PLC") parser.add_option("", "--dbname", dest="dbname", metavar="FILE", help="Specify the name of the database to which the information is saved") parser.add_option("-i", "--increment", action="store_true", dest="increment", help="Increment round number to force refresh or retry") + parser.add_option("", "--force", action="store_true", dest="force", + help="Force probe without incrementing global 'round'.") parser = parsermodule.getParser(['defaults'], parser) diff --git a/findbadpcu.py b/findbadpcu.py index 468107d..0d06d1e 100755 --- a/findbadpcu.py +++ b/findbadpcu.py @@ -20,25 +20,13 @@ from monitor import database from monitor import util from monitor.wrapper import plc, plccache from nodequery import pcu_select +from nodecommon import nmap_port_status plc_lock = threading.Lock() global_round = 1 errorState = {} count = 0 -def nmap_port_status(status): - ps = {} - l_nmap = status.split() - ports = l_nmap[4:] - - continue_probe = False - for port in ports: - results = port.split('/') - ps[results[0]] = results[1] - if results[1] == "open": - continue_probe = True - return (ps, continue_probe) - def get_pcu(pcuname): plc_lock.acquire() try: @@ -176,7 +164,16 @@ def collectPingAndSSH(pcuname, cohash): if b_except or not continue_probe: return (None, None, None) - + #### RUN NMAP ############################### + if continue_probe: + nmap = util.command.CMD() + print "nmap -oG - -P0 
-p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats']) + (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])) + # NOTE: an empty / error value for oval, will still work. + (values['port_status'], continue_probe) = nmap_port_status(oval) + else: + values['port_status'] = None + #### COMPLETE ENTRY ####################### values['entry_complete'] = [] @@ -203,7 +200,8 @@ def collectPingAndSSH(pcuname, cohash): # If there are no nodes associated with this PCU, then we cannot continue. if len(values['plc_pcu_stats']['node_ids']) == 0: continue_probe = False - values['entry_complete'] += ['NoNodeIds'] + values['entry_complete'] += ['nodeids'] + #### DNS and IP MATCH ####################### if values['plc_pcu_stats']['hostname'] is not None and values['plc_pcu_stats']['hostname'] is not "" and \ @@ -230,19 +228,11 @@ def collectPingAndSSH(pcuname, cohash): values['plc_pcu_stats']['hostname'] = "No_entry_in_DB" continue_probe = False - #### RUN NMAP ############################### - if continue_probe: - nmap = util.command.CMD() - (oval,eval) = nmap.run_noexcept("nmap -oG - -P0 -p22,23,80,443,5869,9100,16992 %s | grep Host:" % reboot.pcu_name(values['plc_pcu_stats'])) - # NOTE: an empty / error value for oval, will still work. 
- (values['port_status'], continue_probe) = nmap_port_status(oval) - else: - values['port_status'] = None - ###### DRY RUN ############################ if 'node_ids' in values['plc_pcu_stats'] and len(values['plc_pcu_stats']['node_ids']) > 0: - rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], values, continue_probe, 1, True) + rb_ret = reboot.reboot_test_new(values['plc_pcu_stats']['nodenames'][0], + values, 1, True) else: rb_ret = "Not_Run" # No nodes to test" @@ -268,15 +258,15 @@ def recordPingAndSSH(request, result): if values is not None: pcu_id = int(nodename) - fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, - if_new_set={'round': global_round}) - global_round = fbsync.round + #fbsync = FindbadPCURecordSync.findby_or_create(plc_pcuid=0, + # if_new_set={'round': global_round}) + #global_round = fbsync.round fbnodesync = FindbadPCURecordSync.findby_or_create(plc_pcuid=pcu_id, if_new_set={'round' : global_round}) fbrec = FindbadPCURecord( date_checked=datetime.fromtimestamp(values['date_checked']), - round=fbsync.round, + round=global_round, plc_pcuid=pcu_id, plc_pcu_stats=values['plc_pcu_stats'], dns_status=values['dns_status'], @@ -287,7 +277,7 @@ def recordPingAndSSH(request, result): fbnodesync.round = global_round fbnodesync.flush() - fbsync.flush() + #fbsync.flush() fbrec.flush() count += 1 @@ -379,7 +369,7 @@ def main(): l_pcus = [pcu for pcu in sets.Set(pcus)] elif config.nodelist == None and config.pcuid == None: - print "Calling API GetPCUs() : refresh(%s)" % config.refresh + print "Calling API GetPCUs() : cachecalls(%s)" % config.cachecalls l_pcus = [pcu['pcu_id'] for pcu in l_pcus] elif config.nodelist is not None: l_pcus = util.file.getListFromFile(config.nodelist) @@ -391,11 +381,15 @@ def main(): if config.increment: # update global round number to force refreshes across all nodes global_round += 1 - fbsync.round = global_round - fbsync.flush() checkAndRecordState(l_pcus, cohash) + if config.increment: + # 
update global round number to force refreshes across all nodes + fbsync.round = global_round + fbsync.flush() + session.flush() + return 0 diff --git a/nodebad.py b/nodebad.py index baa016c..f9f6edf 100755 --- a/nodebad.py +++ b/nodebad.py @@ -14,6 +14,7 @@ from monitor import config from monitor.wrapper import plc,plccache from monitor.const import MINUP from monitor.database.info.model import FindbadNodeRecord, HistoryNodeRecord +from monitor.database.dborm import mon_session as session from monitor.model import * @@ -54,6 +55,10 @@ def checkAndRecordState(l_nodes, l_plcnodes): print traceback.print_exc() continue + if not noderec: + print "none object for %s"% nodename + continue + node_state = noderec.observed_status if noderec.plc_node_stats: boot_state = noderec.plc_node_stats['boot_state'] @@ -80,6 +85,7 @@ def checkAndRecordState(l_nodes, l_plcnodes): # replace with another operations that also commits all pending ops, such # as session.commit() or flush() or something print HistoryNodeRecord.query.count() + session.flush() return True diff --git a/nodecommon.py b/nodecommon.py index 082550b..051cd61 100644 --- a/nodecommon.py +++ b/nodecommon.py @@ -122,6 +122,20 @@ def getvalue(fb, path): return None return values +def nmap_port_status(status): + ps = {} + l_nmap = status.split() + ports = l_nmap[4:] + + continue_probe = False + for port in ports: + results = port.split('/') + ps[results[0]] = results[1] + if results[1] == "open": + continue_probe = True + return (ps, continue_probe) + + def nodegroup_display(node, fbdata, conf=None): node['current'] = get_current_state(fbdata) diff --git a/nodequery.py b/nodequery.py index 48a5f73..bcebf15 100755 --- a/nodequery.py +++ b/nodequery.py @@ -256,7 +256,7 @@ def query_to_dict(query): def pcu_in(fbdata): #if 'plcnode' in fbdata: if 'plc_node_stats' in fbdata: - if 'pcu_ids' in fbdata['plc_node_stats']: + if fbdata['plc_node_stats'] and 'pcu_ids' in fbdata['plc_node_stats']: if 
len(fbdata['plc_node_stats']['pcu_ids']) > 0: return True return False @@ -275,19 +275,28 @@ def pcu_select(str_query, nodelist=None): dict_query = query_to_dict(str_query) print "dict_query", dict_query - - for noderec in fbquery: - if nodelist is not None: - if noderec.hostname not in nodelist: continue - - fb_nodeinfo = noderec.to_dict() - if pcu_in(fb_nodeinfo): - pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=get(fb_nodeinfo, 'plc_node_stats.pcu_ids')[0]).first() - if pcurec: - pcuinfo = pcurec.to_dict() - if verify(dict_query, pcuinfo): - nodenames.append(noderec.hostname) - pcunames.append(pcuinfo['plc_pcuid']) + print 'length %s' % len(fbpcuquery.all()) + + for pcurec in fbpcuquery: + pcuinfo = pcurec.to_dict() + if verify(dict_query, pcuinfo): + #nodenames.append(noderec.hostname) + #print 'appending %s' % pcuinfo['plc_pcuid'] + pcunames.append(pcuinfo['plc_pcuid']) + + #for noderec in fbquery: + # if nodelist is not None: + # if noderec.hostname not in nodelist: continue +# +# fb_nodeinfo = noderec.to_dict() +# if pcu_in(fb_nodeinfo): +# pcurec = FindbadPCURecord.get_latest_by(plc_pcuid=get(fb_nodeinfo, +# 'plc_node_stats.pcu_ids')[0]).first() +# if pcurec: +# pcuinfo = pcurec.to_dict() +# if verify(dict_query, pcuinfo): +# nodenames.append(noderec.hostname) +# pcunames.append(pcuinfo['plc_pcuid']) return (nodenames, pcunames) def node_select(str_query, nodelist=None, fb=None): diff --git a/pcubad.py b/pcubad.py index b31599f..6ca478f 100755 --- a/pcubad.py +++ b/pcubad.py @@ -11,6 +11,7 @@ from pcucontrol import reboot from monitor import parser as parsermodule from monitor import config from monitor.database.info.model import HistoryPCURecord, FindbadPCURecord +from monitor.database.dborm import mon_session as session from monitor.wrapper import plc,plccache from monitor.const import MINUP @@ -93,6 +94,7 @@ def checkAndRecordState(l_pcus, l_plcpcus): # replace with another operations that also commits all pending ops, such # as session.commit() or 
flush() or something print HistoryPCURecord.query.count() + session.flush() return True diff --git a/pcucontrol/models/racadm.py b/pcucontrol/models/racadm.py index 8dec875..f4e69dc 100755 --- a/pcucontrol/models/racadm.py +++ b/pcucontrol/models/racadm.py @@ -100,7 +100,9 @@ def racadm_reboot(host, username, password, dryrun, state="powercycle"): from optparse import OptionParser parser = OptionParser() -parser.set_defaults(ip="", user="", password="", state="powercycle") +parser.set_defaults(ip="", user="", password="", dryrun=False, state="powercycle") +parser.add_option("-d", "", dest="dryrun", action="store_true", + help="enable dryrun tests. no action is taken") parser.add_option("-r", "", dest="ip", metavar="nodename.edu", help="A single node name to add to the nodegroup") parser.add_option("-u", "", dest="user", metavar="username", @@ -117,6 +119,6 @@ if __name__ == '__main__': options.user is not "" and \ options.password is not "": - racadm_reboot(options.ip, options.user, options.password, False, options.state) + racadm_reboot(options.ip, options.user, options.password, options.dryrun, options.state) else: parser.print_help() diff --git a/pcucontrol/reboot.py b/pcucontrol/reboot.py index 04fe4da..decaf1d 100755 --- a/pcucontrol/reboot.py +++ b/pcucontrol/reboot.py @@ -120,6 +120,7 @@ class Transport: HTTP = 3 HTTPS = 4 IPAL = 5 + DRAC = 6 TELNET_TIMEOUT = 120 @@ -231,7 +232,10 @@ class PCUControl(Transport,PCUModel,PCURecord): PCURecord.__init__(self, plc_pcu_record) type = None if self.port_status: - if '22' in supported_ports and self.port_status['22'] == "open": + # NOTE: prefer racadm port over ssh + if '5869' in supported_ports and self.port_status['5869'] == "open": + type = Transport.DRAC# DRAC cards user this port. 
+ elif '22' in supported_ports and self.port_status['22'] == "open": type = Transport.SSH elif '23' in supported_ports and self.port_status['23'] == "open": type = Transport.TELNET @@ -240,9 +244,6 @@ class PCUControl(Transport,PCUModel,PCURecord): type = Transport.HTTPS elif '80' in supported_ports and self.port_status['80'] == "open": type = Transport.HTTP - elif '5869' in supported_ports and self.port_status['5869'] == "open": - # For DRAC cards. Racadm opens this port. - type = Transport.HTTP elif '9100' in supported_ports and self.port_status['9100'] == "open": type = Transport.IPAL elif '16992' in supported_ports and self.port_status['16992'] == "open": @@ -343,7 +344,13 @@ class IPAL(PCUControl): def run(self, node_port, dryrun): if self.type == Transport.IPAL: - return self.run_ipal(node_port, dryrun) + ret = self.run_ipal(node_port, dryrun) + if ret != 0: + ret2 = self.run_telnet(node_port, dryrun) + if ret2 != 0: + return ret + return ret2 + return ret elif self.type == Transport.TELNET: return self.run_telnet(node_port, dryrun) else: @@ -636,10 +643,53 @@ class IntelAMT(PCUControl): return cmd.system(cmd_str, self.TELNET_TIMEOUT) class DRAC(PCUControl): + supported_ports = [22,443,5869] def run(self, node_port, dryrun): + if self.type == Transport.DRAC: + print "trying racadm_reboot..." + return racadm_reboot(self.host, self.username, self.password, node_port, dryrun) + elif self.type == Transport.SSH: + return self.run_ssh(node_port, dryrun) + else: + raise ExceptionNoTransport("No implementation for open ports") - print "trying racadm_reboot..." 
- racadm_reboot(self.host, self.username, self.password, node_port, dryrun) + def run_ssh(self, node_port, dryrun): + ssh_options="-o StrictHostKeyChecking=no "+\ + "-o PasswordAuthentication=yes "+\ + "-o PubkeyAuthentication=no" + s = pxssh.pxssh() + if not s.login(self.host, self.username, self.password, ssh_options, + original_prompts="Dell", login_timeout=TELNET_TIMEOUT): + raise ExceptionPassword("Invalid Password") + + print "logging in..." + s.send("\r\n\r\n") + try: + # Testing Reboot ? + #index = s.expect(["DRAC 5", "[%s]#" % self.username ]) + # NOTE: be careful to escape any characters used by 're.compile' + index = s.expect(["\$", "\[%s\]#" % self.username ]) + print "INDEX:", index + if dryrun: + if index == 0: + s.send("racadm getsysinfo") + elif index == 1: + s.send("getsysinfo") + else: + if index == 0: + s.send("racadm serveraction powercycle") + elif index == 1: + s.send("serveraction powercycle") + + s.send("exit") + + except pexpect.EOF: + raise ExceptionPrompt("EOF before expected Prompt") + except pexpect.TIMEOUT: + print s + raise ExceptionPrompt("Timeout before expected Prompt") + + s.close() return 0 @@ -1080,7 +1130,7 @@ class ePowerSwitchOld(PCUControl): return 0 class ManualPCU(PCUControl): - supported_ports = [22,23,80,443,9100,16992] + supported_ports = [22,23,80,443] def run(self, node_port, dryrun): if not dryrun: @@ -1291,7 +1341,7 @@ def racadm_reboot(host, username, password, port, dryrun): logger.debug("runcmd raised exception %s" % err) if verbose: logger.debug(err) - return -1 + return err def pcu_name(pcu): if pcu['hostname'] is not None and pcu['hostname'] is not "": @@ -1372,6 +1422,8 @@ class Unknown(PCUControl): supported_ports = [22,23,80,443,5869,9100,16992] def model_to_object(modelname): + if modelname is None: + return ManualPCU if "AMT" in modelname: return IntelAMT elif "BayTech" in modelname: diff --git a/web/MonitorWeb/monitorweb/controllers.py b/web/MonitorWeb/monitorweb/controllers.py index e5d0da2..a3e3021 
100644 --- a/web/MonitorWeb/monitorweb/controllers.py +++ b/web/MonitorWeb/monitorweb/controllers.py @@ -1,5 +1,6 @@ import turbogears as tg from turbogears import controllers, expose, flash, exception_handler +from turbogears import widgets from cherrypy import request, response import cherrypy # from monitorweb import model @@ -20,28 +21,52 @@ from monitorweb.templates.links import * import findbad -def format_ports(pcu): + +def query_to_dict(query): + """ take a url query string and chop it up """ + val = {} + query_fields = query.split('&') + for f in query_fields: + (k,v) = urllib.splitvalue(f) + val[k] = v + + return val + +def format_ports(data, pcumodel=None): retval = [] - if pcu.port_status and len(pcu.port_status.keys()) > 0 : - obj = reboot.model_to_object(pcu.plc_pcu_stats['model']) - for port in obj.supported_ports: + filtered_length=0 + + if pcumodel: + supported_ports=reboot.model_to_object(pcumodel).supported_ports + else: + # ports of a production node + supported_ports=[22,80,806] + + if data and len(data.keys()) > 0 : + for port in supported_ports: try: - state = pcu.port_status[str(port)] + state = data[str(port)] except: state = "unknown" + + if state == "filtered": + filtered_length += 1 retval.append( (port, state) ) if retval == []: retval = [( "Closed/Filtered", "state" )] + if filtered_length == len(supported_ports): + retval = [( "All Filtered", "state" )] + return retval def format_pcu_shortstatus(pcu): status = "error" if pcu: if pcu.reboot_trial_status == str(0): - status = "ok" + status = "Ok" elif pcu.reboot_trial_status == "NetDown" or pcu.reboot_trial_status == "Not_Run": status = pcu.reboot_trial_status else: @@ -56,21 +81,44 @@ def prep_pcu_for_display(pcu): except: pcu.loginbase = "unknown" - pcu.ports = format_ports(pcu) + pcu.ports = format_ports(pcu.port_status, pcu.plc_pcu_stats['model']) pcu.status = format_pcu_shortstatus(pcu) + #print pcu.entry_complete + pcu.entry_complete_str = pcu.entry_complete + 
#pcu.entry_complete_str += "".join([ f[0] for f in pcu.entry_complete.split() ]) + if pcu.dns_status == "NOHOSTNAME": + pcu.dns_short_status = 'NoHost' + elif pcu.dns_status == "DNS-OK": + pcu.dns_short_status = 'Ok' + elif pcu.dns_status == "DNS-NOENTRY": + pcu.dns_short_status = 'NoEntry' + elif pcu.dns_status == "NO-DNS-OR-IP": + pcu.dns_short_status = 'NoHostOrIP' + elif pcu.dns_status == "DNS-MISMATCH": + pcu.dns_short_status = 'Mismatch' + +class NodeWidget(widgets.Widget): + pass + def prep_node_for_display(node): if node.plc_pcuid: pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() if pcu: node.pcu_status = pcu.reboot_trial_status + node.pcu_short_status = format_pcu_shortstatus(pcu) + node.pcu = pcu + prep_pcu_for_display(node.pcu) else: + node.pcu_short_status = "none" node.pcu_status = "nodata" - node.pcu_short_status = format_pcu_shortstatus(pcu) + node.pcu = None else: node.pcu_status = "nopcu" node.pcu_short_status = "none" + node.pcu = None + if node.kernel_version: node.kernel = node.kernel_version.split()[2] @@ -81,7 +129,20 @@ def prep_node_for_display(node): node.loginbase = site_id2lb[node.plc_node_stats['site_id']] except: node.loginbase = "unknown" - + + if node.loginbase: + node.site = HistorySiteRecord.by_loginbase(node.loginbase) + + node.history = HistoryNodeRecord.by_hostname(node.hostname) + + if node.port_status: + node.ports = format_ports(node.port_status) + try: + exists = node.plc_node_stats['last_contact'] + except: + node.plc_node_stats = {'last_contact' : None} + + class Root(controllers.RootController): @expose(template="monitorweb.templates.welcome") @@ -91,7 +152,7 @@ class Root(controllers.RootController): flash("Your application is now running") return dict(now=time.ctime()) - @expose(template="monitorweb.templates.nodeview") + @expose(template="monitorweb.templates.pcuview") def nodeview(self, hostname=None): nodequery=[] if hostname: @@ -100,7 +161,7 @@ class Root(controllers.RootController): 
prep_node_for_display(node) nodequery += [node] - return dict(nodequery=nodequery) + return self.pcuview(None, hostname) # dict(nodequery=nodequery) @expose(template="monitorweb.templates.nodelist") def node(self, filter='BOOT'): @@ -116,7 +177,7 @@ class Root(controllers.RootController): if node.observed_status != 'DOWN': filtercount[node.observed_status] += 1 else: - if node.plc_node_stats['last_contact'] != None: + if node.plc_node_stats and node.plc_node_stats['last_contact'] != None: filtercount[node.observed_status] += 1 else: filtercount['neverboot'] += 1 @@ -129,7 +190,7 @@ class Root(controllers.RootController): else: query.append(node) elif filter == "neverboot": - if node.plc_node_stats['last_contact'] == None: + if not node.plc_node_stats or node.plc_node_stats['last_contact'] == None: query.append(node) elif filter == "pending": # TODO: look in message logs... @@ -137,15 +198,22 @@ class Root(controllers.RootController): elif filter == "all": query.append(node) - return dict(now=time.ctime(), query=query, fc=filtercount) + widget = NodeWidget(template='monitorweb.templates.node_template') + return dict(now=time.ctime(), query=query, fc=filtercount, nodewidget=widget) def nodeaction_handler(self, tg_exceptions=None): """Handle any kind of error.""" refurl = request.headers.get("Referer",link("pcu")) print refurl + # TODO: do this more intelligently... 
- if len(urllib.splitquery(refurl)) > 1: - pcuid = urllib.splitvalue(urllib.splitquery(refurl)[1])[1] + uri_fields = urllib.splitquery(refurl) + if uri_fields[1] is not None: + val = query_to_dict(uri_fields[1]) + if 'pcuid' in val: + pcuid = val['pcuid'] + elif 'hostname' in val: + pcuid = FindbadNodeRecord.get_latest_by(hostname=val['hostname']).first().plc_pcuid else: pcuid=None @@ -155,7 +223,6 @@ class Root(controllers.RootController): print pcuid return self.pcuview(pcuid, **dict(exceptions=tg_exceptions)) - #return dict(pcuquery=[], nodequery=[], exceptions=tg_exceptions) def nodeaction(self, **data): for item in data.keys(): @@ -167,8 +234,11 @@ class Root(controllers.RootController): flash("No hostname given in submitted data") return - if 'submit' in data: - action = data['submit'] + if 'submit' in data or 'type' in data: + try: + action = data['submit'] + except: + action = data['type'] else: flash("No submit action given in submitted data") return @@ -178,43 +248,86 @@ class Root(controllers.RootController): ret = reboot.reboot_str(str(hostname)) print ret if ret: raise RuntimeError("Error using PCU: " + ret) + flash("Reboot appeared to work. All at most 5 minutes. 
Run ExternalScan to check current status.") - elif action == "ExternalProbe": - raise RuntimeError("THIS IS A PROBLEM") - - elif action == "DeepProbe": + elif action == "ExternalScan": + findbad.externalprobe(str(hostname)) + flash("External Scan Successful!") + elif action == "InternalScan": findbad.probe(str(hostname)) + flash("Internal Scan Successful!") else: # unknown action - flash("Unknown action given") + raise RuntimeError("Unknown action given") return # TODO: add form validation @expose(template="monitorweb.templates.pcuview") @exception_handler(nodeaction_handler,"isinstance(tg_exceptions,RuntimeError)") - def pcuview(self, pcuid=None, **data): + def pcuview(self, loginbase=None, pcuid=None, hostname=None, **data): + sitequery=[] pcuquery=[] nodequery=[] - if 'submit' in data.keys(): + exceptions = None + + for key in data: + print key, data[key] + + if 'submit' in data.keys() or 'type' in data.keys(): + if hostname: data['hostname'] = hostname self.nodeaction(**data) if 'exceptions' in data: exceptions = data['exceptions'] - else: - exceptions = None - if pcuid: + if loginbase: + sitequery = [HistorySiteRecord.by_loginbase(loginbase)] + pcus = {} + for plcnode in site_lb2hn[loginbase]: + for node in FindbadNodeRecord.get_latest_by(hostname=plcnode['hostname']): + # NOTE: reformat some fields. 
+ prep_node_for_display(node) + nodequery += [node] + if node.plc_pcuid: # not None + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() + prep_pcu_for_display(pcu) + pcus[node.plc_pcuid] = pcu + + for pcuid_key in pcus: + pcuquery += [pcus[pcuid_key]] + + if pcuid and hostname is None: + print "pcuid: %s" % pcuid for pcu in FindbadPCURecord.get_latest_by(plc_pcuid=pcuid): # NOTE: count filter prep_pcu_for_display(pcu) pcuquery += [pcu] - for nodename in pcu.plc_pcu_stats['nodenames']: - print "query for %s" % nodename - node = FindbadNodeRecord.get_latest_by(hostname=nodename).first() - print "%s" % node - if node: - prep_node_for_display(node) - nodequery += [node] - return dict(pcuquery=pcuquery, nodequery=nodequery, exceptions=exceptions) + if 'site_id' in pcu.plc_pcu_stats: + sitequery = [HistorySiteRecord.by_loginbase(pcu.loginbase)] + + if 'nodenames' in pcu.plc_pcu_stats: + for nodename in pcu.plc_pcu_stats['nodenames']: + print "query for %s" % nodename + q = FindbadNodeRecord.get_latest_by(hostname=nodename) + node = q.first() + print "%s" % node.port_status + print "%s" % node.to_dict() + print "%s" % len(q.all()) + if node: + prep_node_for_display(node) + nodequery += [node] + + if hostname and pcuid is None: + for node in FindbadNodeRecord.get_latest_by(hostname=hostname): + # NOTE: reformat some fields. 
+ prep_node_for_display(node) + sitequery = [node.site] + nodequery += [node] + if node.plc_pcuid: # not None + pcu = FindbadPCURecord.get_latest_by(plc_pcuid=node.plc_pcuid).first() + prep_pcu_for_display(pcu) + pcuquery += [pcu] + + return dict(sitequery=sitequery, pcuquery=pcuquery, nodequery=nodequery, exceptions=exceptions) @expose(template="monitorweb.templates.pculist") def pcu(self, filter='all'): diff --git a/web/MonitorWeb/monitorweb/static/css/style.css b/web/MonitorWeb/monitorweb/static/css/style.css index 7bb4078..df07184 100644 --- a/web/MonitorWeb/monitorweb/static/css/style.css +++ b/web/MonitorWeb/monitorweb/static/css/style.css @@ -57,12 +57,18 @@ a.info span{display: none} a.info:hover span{ /*the span will display just on :hover state*/ display:block; position:absolute; - top:2em; left:2em; width:15em; + top:1em; left:-7em; width: 100%; border:1px solid #AAA; color:#DDD; background-color:black; text-align: center} +div#legend a:hover span {display: block; + float: left; width: 30em; + padding: 5px; margin: 5px; z-index: 100; + color: #333; background: white; + font: 10px Verdana, sans-serif; text-align: left;} + div#links a:hover span {display: block; /*position: absolute; top: 200px; left: 0; width: 125px;*/ /*position: relative; top: 0px; left: 40; width: 30em;*/ @@ -84,14 +90,14 @@ a.right { float: right; } #portfiltered { background-color: gold; } #dns-DNS-OK { background-color: lightgreen; } -#dns-NOHOSTNAME { background-color: white; } +/*#dns-NOHOSTNAME { background-color: white; }*/ #dns-DNS-MISMATCH { background-color: gold; } #dns-DNS-NOENTRY { background-color: indianred; } #dns-NO-DNS-OR-IP { background-color: indianred; } #status-NetDown { background-color: lightgrey; } #status-Not_Run { background-color: lightgrey; } -#status-ok { background-color: darkseagreen; } +#status-Ok { background-color: darkseagreen; } #status-0 { background-color: darkseagreen; } #status-error { background-color: indianred; } #status-none { 
background-color: white; } @@ -196,8 +202,6 @@ h2 { span.code { font-size: 120%; /*font-weight: bold;*/ - margin: 20 20 20 20; - padding: 20 20 20 20; } #status_block { diff --git a/web/MonitorWeb/monitorweb/templates/actionlist.kid b/web/MonitorWeb/monitorweb/templates/actionlist.kid index 843906a..eb79269 100644 --- a/web/MonitorWeb/monitorweb/templates/actionlist.kid +++ b/web/MonitorWeb/monitorweb/templates/actionlist.kid @@ -4,6 +4,7 @@ layout_params['page_title'] = "Monitor Node View" from monitor.util import diff_time from monitor import config from time import mktime +from links import * def zabbix_event_ack_link(eventid): return "http://" + config.MONITOR_HOSTNAME + "/zabbix/acknow.php?eventid=" + str(eventid) @@ -40,7 +41,7 @@ def zabbix_event_ack_link(eventid): - ${node[0]} + ${node[0]} diff --git a/web/MonitorWeb/monitorweb/templates/nodelist.kid b/web/MonitorWeb/monitorweb/templates/nodelist.kid index 669f02f..5b4e7c3 100644 --- a/web/MonitorWeb/monitorweb/templates/nodelist.kid +++ b/web/MonitorWeb/monitorweb/templates/nodelist.kid @@ -28,26 +28,13 @@ from links import * - Site - Hostname - ping - - pcu - status - kernel - last_contact + ${nodewidget.display(node=None, header=True)} - ${node.loginbase} - - - - - - + ${nodewidget.display(node=node, header=None)} diff --git a/web/MonitorWeb/monitorweb/templates/nodeview.kid b/web/MonitorWeb/monitorweb/templates/nodeview.kid index 354761c..dc2820e 100644 --- a/web/MonitorWeb/monitorweb/templates/nodeview.kid +++ b/web/MonitorWeb/monitorweb/templates/nodeview.kid @@ -2,6 +2,8 @@ Hostname ping - pcu kernel + last_change last_contact @@ -34,16 +36,47 @@ from links import * ${node.hostname} - - ${node.pcu_short_status} - - ${node.pcu_short_status} + -

PCU Status

+

Controlling PCU

+ + + + + + + + + + + + + + + + + + +
PCU NameModelTest Results
+ + ${pcu_name(pcu.plc_pcu_stats)} + + + +

Actions Taken

diff --git a/web/MonitorWeb/monitorweb/templates/pculist.kid b/web/MonitorWeb/monitorweb/templates/pculist.kid index 99ad41a..f6043dd 100644 --- a/web/MonitorWeb/monitorweb/templates/pculist.kid +++ b/web/MonitorWeb/monitorweb/templates/pculist.kid @@ -29,8 +29,7 @@ from links import * Site PCU Name - Missing Fields - DNS Status + Config Port Status Test Results Model @@ -40,7 +39,13 @@ from links import * - ${node.loginbase} + +
+ ${node.loginbase} + + +
+
${pcu_name(node.plc_pcu_stats)} @@ -48,9 +53,8 @@ from links import *
- - - + + 80 @@ -58,7 +62,7 @@ from links import * diff --git a/web/MonitorWeb/monitorweb/templates/sitelist.kid b/web/MonitorWeb/monitorweb/templates/sitelist.kid index 50b296e..a9b7685 100644 --- a/web/MonitorWeb/monitorweb/templates/sitelist.kid +++ b/web/MonitorWeb/monitorweb/templates/sitelist.kid @@ -1,6 +1,8 @@ Site name - Status - Slices (created / max) - Nodes (online / registered) + Enabled + Penalty + Slices/Max + Nodes/Total + Last Change - ${site.loginbase} - + +
+ ${site.loginbase} + + +
+ + + n/a ${site.slices_used}/${site.slices_total} ${site.nodes_up} / ${site.nodes_total} + diff --git a/web/MonitorWeb/monitorweb/templates/siteview.kid b/web/MonitorWeb/monitorweb/templates/siteview.kid index 039a2b7..0999b31 100644 --- a/web/MonitorWeb/monitorweb/templates/siteview.kid +++ b/web/MonitorWeb/monitorweb/templates/siteview.kid @@ -2,6 +2,7 @@ Site name - Status Enabled - Slices (created / max) - Nodes (online / registered) + Penalty + Slices/Max + Nodes/Total + Status @@ -25,15 +27,19 @@ from links import * ${site.loginbase} - - - ${site.slices_used}/${site.slices_total} - ${site.nodes_up} / ${site.nodes_total} + + n/a + ${site.slices_used}/${site.slices_total} + ${site.nodes_up} / ${site.nodes_total} +

Node List

- +

+ There are no registered nodes for this PCU. +

+
@@ -49,7 +55,7 @@ from links import * + your.host.org -- 2.43.0
- your.host.org ${node.pcu_short_status}