From: Stephen Soltesz Date: Thu, 9 Oct 2008 20:58:59 +0000 (+0000) Subject: just commit everything... X-Git-Tag: Monitor-2.0-0~84 X-Git-Url: http://git.onelab.eu/?p=monitor.git;a=commitdiff_plain;h=b548c69db3d1f302b4d0d08377f0231eb3c4fd58 just commit everything... --- diff --git a/bootman.py b/bootman.py index ff2a6d5..e7a47c3 100755 --- a/bootman.py +++ b/bootman.py @@ -505,6 +505,7 @@ def reboot(hostname, config=None, forced_action=None): ('nodehostname' , 'Configured node hostname does not resolve'), ('implementerror', 'Implementation Error'), ('readonlyfs' , '[Errno 30] Read-only file system'), + ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"), ('noinstall' , 'notinstalled'), ('bziperror' , 'bzip2: Data integrity error when decompressing.'), ('noblockdev' , "No block devices detected."), @@ -514,6 +515,7 @@ def reboot(hostname, config=None, forced_action=None): ('hardwarerequirefail' , 'Hardware requirements not met'), ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'), ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"), + ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"), ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'), ('modulefail' , 'Unable to get list of system modules'), ('writeerror' , 'write error: No space left on device'), @@ -583,6 +585,7 @@ def reboot(hostname, config=None, forced_action=None): "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done", "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done", 
"bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done", ]: sequences.update({n : "restart_bootmanager_rins"}) diff --git a/config.py b/config.py index b37e04a..0cde6f3 100644 --- a/config.py +++ b/config.py @@ -50,6 +50,7 @@ if not config.imported: options = Options() update_section(options, 'commandline', True) update_section(options, 'monitorconfig') + update_section(options, 'monitordatabase') #for i in dir(config): # if "__" not in i: diff --git a/moncommands.py b/moncommands.py index 869cc96..1b67570 100644 --- a/moncommands.py +++ b/moncommands.py @@ -66,16 +66,14 @@ class CMD: o_value = "" e_value = "" - #print "reading from f_out" - if len(lout) > 0: o_value = f_out.read() - #print "reading from f_err" - if len(lerr) > 0: e_value = f_err.read() + o_value = f_out.read() + e_value = f_err.read() #print "striping output" o_value = o_value.strip() e_value = e_value.strip() - #print "OUTPUT", o_value, e_value + #print "OUTPUT -%s-%s-" % (o_value, e_value) #print "closing files" f_out.close() diff --git a/monitor/database.py b/monitor/database.py index 3b5bd65..e127791 100644 --- a/monitor/database.py +++ b/monitor/database.py @@ -3,8 +3,8 @@ import sys import pickle noserial=False try: - from PHPSerialize import * - from PHPUnserialize import * + from util.PHPSerialize import * + from util.PHPUnserialize import * except: #print >>sys.stderr, "PHPSerial db type not allowed." 
noserial=True @@ -15,7 +15,7 @@ import config import config as monitorconfig DEBUG= 0 -PICKLE_PATH=monitorconfig.MONITOR_DATA_ROOT +PICKLE_PATH=config.MONITOR_DATA_ROOT def dbLoad(name, type=None): @@ -90,6 +90,7 @@ class SPickle: Otherwise, it's normal mode, if the file doesn't exist, raise error Load the file """ + print "loading %s" % name if config.debug: if self.exists("debug.%s" % name, type): diff --git a/nodebad.py b/nodebad.py index 0130c3e..8d7650c 100755 --- a/nodebad.py +++ b/nodebad.py @@ -12,6 +12,14 @@ import threadpool import syncplcdb from nodequery import verify,query_to_dict,node_select from nodecommon import * +from datetime import datetime,timedelta +import config + +from sqlobject import connectionForURI,sqlhub +connection = connectionForURI(config.sqlobjecturi) +sqlhub.processConnection = connection +from infovacuum.model_findbadrecord import * +from infovacuum.model_historyrecord import * import plc api = plc.getAuthAPI() @@ -19,129 +27,71 @@ from unified_model import * from const import MINUP round = 1 -externalState = {'round': round, 'nodes': {}} count = 0 def main(config): - global externalState - externalState = database.if_cached_else(1, config.dbname, lambda : externalState) - if config.increment: - # update global round number to force refreshes across all nodes - externalState['round'] += 1 l_nodes = syncplcdb.create_plcdb() l_plcnodes = database.dbLoad("l_plcnodes") - l_nodes = get_nodeset(config) - #if config.node: - # l_nodes = [config.node] - ##else: - # l_nodes = [node['hostname'] for node in l_plcnodes] checkAndRecordState(l_nodes, l_plcnodes) def checkAndRecordState(l_nodes, l_plcnodes): - global externalState global count - global_round = externalState['round'] for nodename in l_nodes: - if nodename not in externalState['nodes']: - externalState['nodes'][nodename] = {'round': 0, 'values': []} - - node_round = externalState['nodes'][nodename]['round'] - if node_round < global_round: - # do work - values = 
collectStatusAndState(nodename, l_plcnodes) - global_round = externalState['round'] - externalState['nodes'][nodename]['values'] = values - externalState['nodes'][nodename]['round'] = global_round + d_node = None + for node in l_plcnodes: + if node['hostname'] == nodename: + d_node = node + break + if not d_node: + continue + + try: + pf = HistoryNodeRecord.by_hostname(nodename) + except: + pf = HistoryNodeRecord(hostname=nodename) + + pf.last_checked = datetime.now() + + try: + # Find the most recent record + noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==nodename, + orderBy='date_checked').reversed()[0] + except: + # or create an empty one. + noderec = FindbadNodeRecord(hostname=nodename) + + node_state = noderec.observed_status + if noderec.plc_node_stats: + boot_state = noderec.plc_node_stats['boot_state'] else: - count += 1 - - if count % 20 == 0: - database.dbDump(config.dbname, externalState) - - database.dbDump(config.dbname, externalState) - -fb = database.dbLoad('findbad') - -def getnodesup(nodelist): - up = 0 - for node in nodelist: - if node['hostname'] in fb['nodes'].keys(): - try: - if fb['nodes'][node['hostname']]['values']['state'] == "BOOT": - up = up + 1 - except: - pass - return up - -def get(fb, path): - indexes = path.split("/") - values = fb - for index in indexes: - if index in values: - values = values[index] + boot_state = "unknown" + + if node_state == "BOOT": + if pf.status != "good": + pf.last_changed = datetime.now() + pf.status = "good" + elif node_state == "DEBUG": + if pf.status != boot_state: + pf.last_changed = datetime.now() + pf.status = boot_state else: - return None - return values + if pf.status != "down": + pf.last_changed = datetime.now() + pf.status = "down" -def collectStatusAndState(nodename, l_plcnodes): - global count - - d_node = None - for node in l_plcnodes: - if node['hostname'] == nodename: - d_node = node - break - if not d_node: - return None - - pf = PersistFlags(nodename, 1, 
db='node_persistflags') - - if not pf.checkattr('last_changed'): - pf.last_changed = time.time() - - pf.last_checked = time.time() - - if not pf.checkattr('status'): - pf.status = "unknown" - - state_path = "nodes/" + nodename + "/values/state" - bootstate_path = "nodes/" + nodename + "/values/plcnode/boot_state" - - if get(fb, state_path) == "BOOT": - if pf.status != "good": pf.last_changed = time.time() - pf.status = "good" - elif get(fb, state_path) == "DEBUG": - bs = get(fb, bootstate_path) - if pf.status != bs: pf.last_changed = time.time() - pf.status = bs - else: - if pf.status != "down": pf.last_changed = time.time() - pf.status = "down" - - count += 1 - print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(pf.last_changed)) - # updated by other modules - #pf.enabled = - #pf.suspended = - - pf.save() + count += 1 + print "%d %35s %s since(%s)" % (count, nodename, pf.status, diff_time(time.mktime(pf.last_changed.timetuple()))) return True if __name__ == '__main__': import parser as parsermodule parser = parsermodule.getParser(['nodesets']) - parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, - increment=False, dbname="nodebad", cachenodes=False) - - parser.add_option("", "--dbname", dest="dbname", metavar="FILE", - help="Specify the name of the database to which the information is saved") - parser.add_option("-i", "--increment", action="store_true", dest="increment", - help="Increment round number to force refresh or retry") + parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False) parser = parsermodule.getParser(['defaults'], parser) config = parsermodule.parse_args(parser) @@ -151,6 +101,4 @@ if __name__ == '__main__': import traceback print traceback.print_exc() print "Exception: %s" % err - print "Saving data... exitting." 
- database.dbDump(config.dbname, externalState) sys.exit(0) diff --git a/nodequery.py b/nodequery.py index 16c0bad..691ead5 100755 --- a/nodequery.py +++ b/nodequery.py @@ -13,13 +13,22 @@ import os from reboot import pcu_name import reboot import util.file +import traceback import time import re +import config + +from sqlobject import connectionForURI,sqlhub +connection = connectionForURI(config.sqlobjecturi) +sqlhub.processConnection = connection +from infovacuum.model.findbadrecord import * + #fb = {} fb = None fbpcu = None +import string class NoKeyException(Exception): pass @@ -31,20 +40,26 @@ def daysdown_print_nodeinfo(fbnode, hostname): print "%(intdaysdown)5s %(hostname)-44s | %(state)10.10s | %(daysdown)s" % fbnode def fb_print_nodeinfo(fbnode, hostname, fields=None): - fbnode['hostname'] = hostname - fbnode['checked'] = diff_time(fbnode['checked']) - if fbnode['bootcd']: - fbnode['bootcd'] = fbnode['bootcd'].split()[-1] + #fbnode['hostname'] = hostname + #fbnode['checked'] = diff_time(fbnode['checked']) + if fbnode['bootcd_version']: + fbnode['bootcd_version'] = fbnode['bootcd_version'].split()[-1] else: - fbnode['bootcd'] = "unknown" + fbnode['bootcd_version'] = "unknown" fbnode['pcu'] = color_pcu_state(fbnode) if not fields: - if 'ERROR' in fbnode['category']: - fbnode['kernel'] = "" + if ( fbnode['observed_status'] is not None and \ + 'DOWN' in fbnode['observed_status'] ) or \ + fbnode['kernel_version'] is None: + fbnode['kernel_version'] = "" else: - fbnode['kernel'] = fbnode['kernel'].split()[2] - fbnode['boot_state'] = fbnode['plcnode']['boot_state'] + fbnode['kernel_version'] = fbnode['kernel_version'].split()[2] + + if fbnode['plc_node_stats'] is not None: + fbnode['boot_state'] = fbnode['plc_node_stats']['boot_state'] + else: + fbnode['boot_state'] = "unknown" try: if len(fbnode['nodegroups']) > 0: @@ -53,7 +68,7 @@ def fb_print_nodeinfo(fbnode, hostname, fields=None): #print "ERROR!!!!!!!!!!!!!!!!!!!!!" 
pass - print "%(hostname)-45s | %(checked)11.11s | %(boot_state)5.5s| %(state)8.8s | %(ssh)5.5s | %(pcu)6.6s | %(bootcd)6.6s | %(category)8.8s | %(kernel)s" % fbnode + print "%(hostname)-45s | %(date_checked)11.11s | %(boot_state)5.5s| %(observed_status)8.8s | %(ssh_status)5.5s | %(pcu)6.6s | %(bootcd_version)6.6s | %(kernel_version)s" % fbnode else: format = "" for f in fields: @@ -133,6 +148,65 @@ def verifyType(constraints, data): return con_or_true +def verifyDBrecord(constraints, record): + """ + constraints is a list of key, value pairs. + # [ {... : ...}==AND , ... , ... , ] == OR + """ + def has_key(obj, key): + try: + x = obj.__getattribute__(key) + return True + except: + return False + + def get_val(obj, key): + try: + return obj.__getattribute__(key) + except: + return None + + def get(obj, path): + indexes = path.split("/") + value = get_val(obj,indexes[0]) + if value is not None and len(indexes) > 1: + for key in indexes[1:]: + if key in value: + value = value[key] + else: + raise NoKeyException(key) + return value + + #print constraints, record + + con_or_true = False + for con in constraints: + #print "con: %s" % con + if len(con.keys()) == 0: + con_and_true = False + else: + con_and_true = True + + for key in con.keys(): + #print "looking at key: %s" % key + if has_key(record, key): + value_re = re.compile(con[key]) + if type([]) == type(get(record,key)): + local_or_true = False + for val in get(record,key): + local_or_true = local_or_true | (value_re.search(val) is not None) + con_and_true = con_and_true & local_or_true + else: + if get(record,key) is not None: + con_and_true = con_and_true & (value_re.search(get(record,key)) is not None) + else: + print "missing key %s" % key, + pass + + con_or_true = con_or_true | con_and_true + + return con_or_true + def verify(constraints, data): """ constraints is a list of key, value pairs. 
@@ -156,12 +230,11 @@ def verify(constraints, data): local_or_true = local_or_true | (value_re.search(val) is not None) con_and_true = con_and_true & local_or_true else: - con_and_true = con_and_true & (value_re.search(data[key]) is not None) + if data[key] is not None: + con_and_true = con_and_true & (value_re.search(data[key]) is not None) elif key not in data: print "missing key %s" % key, pass - #print "missing key %s" % key - #con_and_true = False con_or_true = con_or_true | con_and_true @@ -239,18 +312,21 @@ def node_select(str_query, nodelist=None, fbdb=None): for node in fb['nodes'].keys(): if nodelist is not None: if node not in nodelist: continue - - fb_nodeinfo = fb['nodes'][node]['values'] - if fb_nodeinfo == []: - #print node, "has lost values" + try: + fb_noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node, + orderBy='date_checked').reversed()[0] + except: continue - #sys.exit(1) - fb_nodeinfo['pcu'] = color_pcu_state(fb_nodeinfo) - fb_nodeinfo['hostname'] = node - if 'plcnode' in fb_nodeinfo: - fb_nodeinfo.update(fb_nodeinfo['plcnode']) + + fb_nodeinfo = fb_noderec.toDict() + + #fb_nodeinfo['pcu'] = color_pcu_state(fb_nodeinfo) + #if 'plcnode' in fb_nodeinfo: + # fb_nodeinfo.update(fb_nodeinfo['plcnode']) + + #if verifyDBrecord(dict_query, fb_nodeinfo): if verify(dict_query, fb_nodeinfo): #print node #fb_nodeinfo hostnames.append(node) @@ -300,6 +376,7 @@ def main(): os.chdir("..") fb = archive.load(file[:-4]) else: + fbnodes = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname, orderBy='date_checked',distinct=True).reversed() fb = database.dbLoad("findbad") fbpcu = database.dbLoad("findbadpcus") @@ -329,7 +406,13 @@ def main(): if node not in fb['nodes']: continue - fb_nodeinfo = fb['nodes'][node]['values'] + try: + # Find the most recent record + fb_noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node, + orderBy='date_checked').reversed()[0] + except: + print traceback.print_exc() + pass #fb_nodeinfo = 
fb['nodes'][node]['values'] if config.list: print node @@ -337,6 +420,7 @@ def main(): if config.daysdown: daysdown_print_nodeinfo(fb_nodeinfo, node) else: + fb_nodeinfo = fb_noderec.toDict() if config.select: if config.fields: fields = config.fields.split(",") diff --git a/pcubad.py b/pcubad.py index c782b9a..38cf897 100755 --- a/pcubad.py +++ b/pcubad.py @@ -14,31 +14,32 @@ import syncplcdb from nodequery import verify,query_to_dict,node_select import parser as parsermodule from nodecommon import * +from datetime import datetime,timedelta +import config + +from sqlobject import connectionForURI,sqlhub +connection = connectionForURI(config.sqlobjecturi) +sqlhub.processConnection = connection +from infovacuum.model_findbadrecord import * +from infovacuum.model_historyrecord import * import plc api = plc.getAuthAPI() from unified_model import * from const import MINUP -round = 1 -externalState = {'round': round, 'nodes': {}} -count = 0 def main(config): - global externalState - externalState = database.if_cached_else(1, config.dbname, lambda : externalState) - if config.increment: - # update global round number to force refreshes across all pcus - externalState['round'] += 1 l_plcpcus = database.if_cached_else_refresh(1, 1, "pculist", lambda : plc.GetPCUs()) - l_pcu = None + l_pcus = None if config.pcu: for pcu in l_plcpcus: - if pcu['hostname'] == config.pcu or pcu['ip'] == config.pcu: + if ( pcu['hostname'] is not None and config.pcu in pcu['hostname'] ) or \ + ( pcu['ip'] is not None and config.pcu in pcu['ip'] ): l_pcus = [pcu['pcu_id']] - if not l_pcu: + if not l_pcus: print "ERROR: could not find pcu %s" % config.pcu sys.exit(1) else: @@ -46,108 +47,68 @@ def main(config): checkAndRecordState(l_pcus, l_plcpcus) -def checkAndRecordState(l_pcus, l_plcpcus): - global externalState - global count - global_round = externalState['round'] +hn2lb = database.dbLoad("plcdb_hn2lb") +def checkAndRecordState(l_pcus, l_plcpcus): + count = 0 for pcuname in l_pcus: - if 
pcuname not in externalState['nodes']: - externalState['nodes'][pcuname] = {'round': 0, 'values': []} - - pcu_round = externalState['nodes'][pcuname]['round'] - if pcu_round < global_round: - # do work - values = collectStatusAndState(pcuname, l_plcpcus) - global_round = externalState['round'] - externalState['nodes'][pcuname]['values'] = values - externalState['nodes'][pcuname]['round'] = global_round - else: - count += 1 - - if count % 20 == 0: - database.dbDump(config.dbname, externalState) - database.dbDump(config.dbname, externalState) - -fbpcu = database.dbLoad('findbadpcus') -hn2lb = database.dbLoad("plcdb_hn2lb") - -def get(fb, path): - indexes = path.split("/") - values = fb - for index in indexes: - if index in values: - values = values[index] + d_pcu = None + for pcu in l_plcpcus: + if pcu['pcu_id'] == pcuname: + d_pcu = pcu + break + if not d_pcu: + continue + + try: + pf = HistoryPCURecord.by_pcuid(d_pcu['pcu_id']) + except: + pf = HistoryPCURecord(plc_pcuid=pcuname) + + pf.last_checked = datetime.now() + + try: + # Find the most recent record + pcurec = FindbadPCURecord.select(FindbadPCURecord.q.plc_pcuid==pcuname, + orderBy='date_checked').reversed()[0] + except: + # don't have the info to create a new entry right now, so continue. 
+ continue + + pcu_state = pcurec.reboot_trial_status + current_state = pcu_state + + if current_state == 0 or current_state == "0": + if pf.status != "good": + pf.last_changed = datetime.now() + pf.status = "good" + elif current_state == 'NetDown': + if pf.status != "netdown": + pf.last_changed = datetime.now() + pf.status = "netdown" + elif current_state == 'Not_Run': + if pf.status != "badconfig": + pf.last_changed = datetime.now() + pf.status = "badconfig" else: - return None - return values - -def collectStatusAndState(pcuname, l_plcpcus): - global count - - d_pcu = None - for pcu in l_plcpcus: - if pcu['pcu_id'] == pcuname: - d_pcu = pcu - break - if not d_pcu: - return None - - pf = PersistFlags(pcuname, 1, db='pcu_persistflags') - - if not pf.checkattr('last_changed'): - pf.last_changed = time.time() - - pf.last_checked = time.time() - - if not pf.checkattr('valid'): - pf.valid = "unknown" - pf.last_valid = 0 - - if not pf.checkattr('status'): - pf.status = "unknown" - - state_path = "nodes/id_" + str(pcuname) + "/values/reboot" - bootstate_path = "nodes/id_" + str(pcuname) + "/values/plcpcu/boot_state" - - current_state = get(fbpcu, state_path) - if current_state == 0: - if pf.status != "good": pf.last_changed = time.time() - pf.status = "good" - elif current_state == 'NetDown': - if pf.status != "netdown": pf.last_changed = time.time() - pf.status = "netdown" - elif current_state == 'Not_Run': - if pf.status != "badconfig": pf.last_changed = time.time() - pf.status = "badconfig" - else: - if pf.status != "error": pf.last_changed = time.time() - pf.status = "error" - - count += 1 - print "%d %35s %s since(%s)" % (count, pcu_name(d_pcu), pf.status, diff_time(pf.last_changed)) - # updated by other modules - #pf.enabled = - #pf.suspended = + if pf.status != "error": + pf.last_changed = datetime.now() + pf.status = "error" - pf.save() + count += 1 + print "%d %35s %s since(%s)" % (count, pcu_name(d_pcu), pf.status, 
diff_time(time.mktime(pf.last_changed.timetuple()))) return True if __name__ == '__main__': parser = parsermodule.getParser() - parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, - increment=False, dbname="pcubad", cachepcus=False) + parser.set_defaults(filename=None, pcu=None, pcuselect=False, pcugroup=None, cachepcus=False) parser.add_option("", "--pcu", dest="pcu", metavar="hostname", help="Provide a single pcu to operate on") parser.add_option("", "--pculist", dest="pculist", metavar="file.list", help="Provide a list of files to operate on") - parser.add_option("", "--dbname", dest="dbname", metavar="FILE", - help="Specify the name of the database to which the information is saved") - parser.add_option("-i", "--increment", action="store_true", dest="increment", - help="Increment round number to force refresh or retry") config = parsermodule.parse_args(parser) try: @@ -156,6 +117,4 @@ if __name__ == '__main__': import traceback print traceback.print_exc() print "Exception: %s" % err - print "Saving data... exitting." 
- database.dbDump(config.dbname, externalState) sys.exit(0) diff --git a/reboot.py b/reboot.py index e876a76..f3f7f32 100755 --- a/reboot.py +++ b/reboot.py @@ -1235,6 +1235,8 @@ def reboot_policy(nodename, continue_probe, dryrun): def reboot_test(nodename, values, continue_probe, verbose, dryrun): rb_ret = "" + if 'plc_pcu_stats' in values: + values.update(values['plc_pcu_stats']) try: # DataProbe iPal (many sites) diff --git a/sitebad.py b/sitebad.py index f55a4d3..750572a 100755 --- a/sitebad.py +++ b/sitebad.py @@ -11,22 +11,21 @@ import comon import threadpool import syncplcdb from nodequery import verify,query_to_dict,node_select +from datetime import datetime,timedelta +import config + +from sqlobject import connectionForURI,sqlhub +connection = connectionForURI(config.sqlobjecturi) +sqlhub.processConnection = connection +from infovacuum.model.findbadrecord import * +from infovacuum.model.historyrecord import * import plc api = plc.getAuthAPI() from unified_model import * from const import MINUP -round = 1 -externalState = {'round': round, 'sites': {}} -count = 0 - def main(config): - global externalState - externalState = database.if_cached_else(1, config.dbname, lambda : externalState) - if config.increment: - # update global round number to force refreshes across all nodes - externalState['round'] += 1 l_nodes = syncplcdb.create_plcdb() l_plcsites = database.dbLoad("l_plcsites") @@ -38,83 +37,52 @@ def main(config): checkAndRecordState(l_sites, l_plcsites) -def checkAndRecordState(l_sites, l_plcsites): - global externalState - global count - global_round = externalState['round'] - - for sitename in l_sites: - if sitename not in externalState['sites']: - externalState['sites'][sitename] = {'round': 0, 'values': []} - - site_round = externalState['sites'][sitename]['round'] - if site_round < global_round: - # do work - values = collectStatusAndState(sitename, l_plcsites) - global_round = externalState['round'] - externalState['sites'][sitename]['values'] = 
values - externalState['sites'][sitename]['round'] = global_round - else: - count += 1 - - if count % 20 == 0: - database.dbDump(config.dbname, externalState) - - database.dbDump(config.dbname, externalState) - -fb = database.dbLoad('findbad') -lb2hn = database.dbLoad("plcdb_lb2hn") - def getnodesup(nodelist): up = 0 for node in nodelist: - if node['hostname'] in fb['nodes'].keys(): + try: + noderec = FindbadNodeRecord.select(FindbadNodeRecord.q.hostname==node['hostname'], + orderBy='date_checked').reversed()[0] + if noderec.observed_status == "BOOT": + up = up + 1 + except: + pass + return up + +def checkAndRecordState(l_sites, l_plcsites): + count = 0 + lb2hn = database.dbLoad("plcdb_lb2hn") + for sitename in l_sites: + d_site = None + for site in l_plcsites: + if site['login_base'] == sitename: + d_site = site + break + if not d_site: + continue + + if sitename in lb2hn: try: - if fb['nodes'][node['hostname']]['values']['state'] == "BOOT": - up = up + 1 + pf = HistorySiteRecord.by_loginbase(sitename) except: - pass - return up + pf = HistorySiteRecord(loginbase=sitename) -def collectStatusAndState(sitename, l_plcsites): - global count - - d_site = None - for site in l_plcsites: - if site['login_base'] == sitename: - d_site = site - break - if not d_site: - return None - - if sitename in lb2hn: - pf = PersistFlags(sitename, 1, db='site_persistflags') - - if not pf.checkattr('last_changed'): - pf.last_changed = time.time() - - pf.last_checked = time.time() - pf.nodes_total = len(lb2hn[sitename]) - pf.slices_used = len(d_site['slice_ids']) - pf.nodes_up = getnodesup(lb2hn[sitename]) - if not pf.checkattr('status'): - pf.status = "unknown" - - if pf.nodes_up >= MINUP: - if pf.status != "good": pf.last_changed = time.time() - pf.status = "good" - else: - if pf.status != "down": pf.last_changed = time.time() - pf.status = "down" - - count += 1 - print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, - pf.nodes_total, pf.nodes_up, 
pf.status) - # updated by other modules - #pf.enabled = - #pf.suspended = - - pf.save() + pf.last_checked = datetime.now() + + pf.slices_used = len(d_site['slice_ids']) + pf.nodes_total = len(lb2hn[sitename]) + pf.nodes_up = getnodesup(lb2hn[sitename]) + + if pf.nodes_up >= MINUP: + if pf.status != "good": pf.last_changed = datetime.now() + pf.status = "good" + else: + if pf.status != "down": pf.last_changed = datetime.now() + pf.status = "down" + + count += 1 + print "%d %15s slices(%2s) nodes(%2s) up(%2s) %s" % (count, sitename, pf.slices_used, + pf.nodes_total, pf.nodes_up, pf.status) return True @@ -122,17 +90,14 @@ if __name__ == '__main__': import parser as parsermodule parser = parsermodule.getParser() - parser.set_defaults(filename=None, node=None, site=None, nodeselect=False, nodegroup=None, - increment=False, dbname="sitebad", cachenodes=False) + parser.set_defaults(filename=None, node=None, site=None, + nodeselect=False, nodegroup=None, cachenodes=False) + parser.add_option("", "--site", dest="site", metavar="login_base", help="Provide a single site to operate on") parser.add_option("", "--sitelist", dest="sitelist", metavar="file.list", help="Provide a list of files to operate on") - parser.add_option("", "--dbname", dest="dbname", metavar="FILE", - help="Specify the name of the database to which the information is saved") - parser.add_option("-i", "--increment", action="store_true", dest="increment", - help="Increment round number to force refresh or retry") config = parsermodule.parse_args(parser) try: @@ -141,6 +106,4 @@ if __name__ == '__main__': import traceback print traceback.print_exc() print "Exception: %s" % err - print "Saving data... exitting." 
- database.dbDump(config.dbname, externalState) sys.exit(0) diff --git a/www/gadgets/sitemonitor.py b/www/gadgets/sitemonitor.py index e92a9cc..a52fec5 100755 --- a/www/gadgets/sitemonitor.py +++ b/www/gadgets/sitemonitor.py @@ -102,6 +102,7 @@ def main(): if form.has_key('loginbase'): loginbase = form.getvalue('loginbase') + loginbase = loginbase.rstrip("_") else: loginbase = "undefined" @@ -114,7 +115,10 @@ def main(): r = TR() if loginbase not in lb2hn: - value = ("Select 'Edit settings' to enter your Site's loginbase.", "") + value = ("""Select 'Edit settings' to enter your Site's loginbase.

+ The loginbase is the unchangeable portion of your slicename. + For instance, your slice follows the pattern loginbase_slicename.<br>

+ If this hint is unclear, then you can find your loginbase by visiting 'My Site' at 'PlanetLab.org'""", "") r = TR(TD(value[0])) t.append(r) else: diff --git a/www/runlevels.py b/www/runlevels.py index fe44423..a61426e 100755 --- a/www/runlevels.py +++ b/www/runlevels.py @@ -66,6 +66,8 @@ vals = {} vals['ssh'] = get_value('ssh') vals['state'] = get_value('state') vals['nm'] = get_value('nm') +vals['dns'] = None +vals['readonlyfs'] = None vals['plcnode/last_contact'] = None vals['comonstats/uptime'] = None vals['princeton_comon'] = get_value('princeton_comon') @@ -82,7 +84,19 @@ for mynode in fb['nodes'].keys(): row = [] row.append(mynode) add=True - for key in ['ssh', 'state', 'plcnode/last_contact', 'nm', 'princeton_comon', 'princeton_comon_running', 'princeton_comon_procs', 'comonstats/uptime']: + if 'readonlyfs' in fbnode: + if 'Read-only file system' in fbnode['readonlyfs']: + fbnode['readonlyfs'] = 'Y' + else: + fbnode['readonlyfs'] = '_' + + if 'dns' in fbnode: + if 'boot.planet-lab.org has address' in fbnode['dns']: + fbnode['dns'] = '_' + else: + fbnode['dns'] = 'N' + + for key in ['ssh', 'state', 'plcnode/last_contact', 'readonlyfs', 'dns', 'nm', 'princeton_comon', 'princeton_comon_running', 'princeton_comon_procs', 'comonstats/uptime']: if get(fbnode, key) is None: row.append('nokey') else: @@ -116,7 +130,7 @@ packed_values.sort(rowcmp) t = TABLE(border=1) r = TR() -for value in ['num', 'host', 'ssh', 'state', 'last
contact', 'NM', 'comon
dir', 'comon
vserver', 'comon
procs']: +for value in ['num', 'host', 'ssh', 'state', 'last
contact', 'readonlyfs', 'dns', 'NM', 'comon
dir', 'comon
vserver', 'comon
procs']: r.append(TD(value)) t.append(r)