From f4f26439ae2db33f8f9a55e1a3350f6ed4f78278 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Wed, 13 Apr 2011 19:31:43 +0000 Subject: [PATCH] Many small updates and fixes: better logging in plc.py --- Monitor.spec | 10 +- commands/bootman.py | 3 +- commands/checksync.py | 4 +- commands/nodebad.py | 307 +++++++++--------- commands/policy.py | 15 +- commands/shconfig.py | 2 +- config.d/init-bootman-sequence.py | 2 + cron.d/copy-logs.sh | 18 - monitor/bootman.py | 3 +- monitor/common.py | 11 + monitor/generic.py | 4 +- monitor/wrapper/plc.py | 38 ++- monitor/wrapper/plccache.py | 19 +- .../monitorweb/static/images/favicon.ico | Bin 1081 -> 571 bytes 14 files changed, 237 insertions(+), 199 deletions(-) delete mode 100755 cron.d/copy-logs.sh diff --git a/Monitor.spec b/Monitor.spec index 61fe0f1..32ecb44 100644 --- a/Monitor.spec +++ b/Monitor.spec @@ -350,10 +350,12 @@ chkconfig --add monitor chkconfig monitor on %post runlevelagent -chkconfig --add monitor-runlevelagent -chkconfig monitor-runlevelagent on -if [ "$PL_BOOTCD" != "1" ] ; then - service monitor-runlevelagent restart +if [ -f /etc/planetlab/node_id ] ; then + chkconfig --add monitor-runlevelagent + chkconfig monitor-runlevelagent on + if [ "$PL_BOOTCD" != "1" ] ; then + service monitor-runlevelagent restart + fi fi diff --git a/commands/bootman.py b/commands/bootman.py index 347199d..930c8fc 100755 --- a/commands/bootman.py +++ b/commands/bootman.py @@ -13,6 +13,7 @@ import traceback import subprocess from sets import Set from monitor.bootman import * +from monitor.util import file # MAIN ------------------------------------------------------------------- @@ -41,7 +42,7 @@ def main(): config = parsermodule.parse_args(parser) if config.nodelist: - nodes = config.getListFromFile(config.nodelist) + nodes = file.getListFromFile(config.nodelist) elif config.node: nodes = [ config.node ] else: diff --git a/commands/checksync.py b/commands/checksync.py index d92d60f..494f5f7 100755 --- a/commands/checksync.py +++ b/commands/checksync.py @@ -20,7 +20,7 @@ if True: -if True: +if False: fbquery = HistoryNodeRecord.query.all() hostnames = [ n.hostname for n in fbquery ] @@ -35,7 +35,7 @@ if True: session.flush() -if True: +if False: fbquery = HistoryPCURecord.query.all() pcus = [ n.plc_pcuid for n in fbquery ] diff --git a/commands/nodebad.py b/commands/nodebad.py index dc86664..d1b2d35 100755 --- a/commands/nodebad.py +++ b/commands/nodebad.py @@ -6,9 +6,9 @@ import string import time from datetime import datetime,timedelta -from monitor.query import verify,query_to_dict,node_select from monitor.common import * +from monitor.query import verify,query_to_dict,node_select from monitor import config from monitor.wrapper import plc,plccache @@ -23,164 +23,171 @@ api = plc.getAuthAPI() round = 1 count = 0 def main(): - main2(config) + main2(config) def main2(config): - l_plcnodes = plccache.l_nodes - l_nodes = get_nodeset(config) - - checkAndRecordState(l_nodes, l_plcnodes) + l_plcnodes = plccache.l_nodes + l_nodes = get_nodeset(config) + + checkAndRecordState(l_nodes, l_plcnodes) # Node states: def check_node_state(rec, node): - node_state = rec.observed_status - if rec.plc_node_stats: - print rec.plc_node_stats - boot_state = rec.plc_node_stats['boot_state'] - run_level = rec.plc_node_stats['run_level'] - last_contact = rec.plc_node_stats['last_contact'] - node.plc_nodeid = rec.plc_node_stats['node_id'] - else: - boot_state = "unknown" - last_contact = None - - if boot_state == 'disable': boot_state = 'disabled' - if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot' - - if len(rec.plc_node_stats['pcu_ids']) > 0: - node.haspcu = True - else: - node.haspcu = False - - node.firewall = rec.firewall - node.plc_siteid = rec.plc_node_stats['site_id'] - - # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need - # 'translations' into the node.status state - # 'BOOT' is a permanent state, but we want it to have a bit of - # hysteresis (less than 0.5 days) - ################################################################# - # "Initialize" the findbad states into nodebad status if they are not already set - - if node_state == 'DOWN': - if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \ - node.status != 'disabled': - # NOTE: if changed less than 2 months, then we can allow this. - # otherwise, apply 'down' status after greater than 2 months (below). - - print "changed status from %s to %s" % (node.status, boot_state) - node.status = boot_state - node.last_changed = datetime.now() - - if node.status not in ['offline', 'down', 'disabled']: - print "changed status from %s to offline" % node.status - node.status = 'offline' - node.last_changed = datetime.now() - - if node_state == 'DEBUG': - if boot_state != 'disabled' and boot_state != 'safeboot': - print "changed status from %s to failboot" % (node.status) - current_status = "failboot" - else: - print "changed status from %s to %s" % (node.status, boot_state) - current_status = boot_state - - if current_status != node.status and \ - current_status in ['failboot', 'disabled', 'safeboot']: - - node.status = current_status - node.last_changed = datetime.now() - - if node_state == 'BOOT' and node.status != 'online' and node.status != 'good': - print "changed status from %s to online" % node.status - node.status = 'online' - node.last_changed = datetime.now() - - ################################################################# - # Switch temporary hystersis states into their 'firm' states. - # online -> good after half a day - # offline -> down after two days - # failboot -> down after 30 days - # safeboot -> failboot after 60 days - # disabled -> down after 60 days - - if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5): - print "changed status from %s to good" % node.status - node.status = 'good' - # NOTE: do not reset last_changed, or you lose how long it's been up. - - if node.status == 'offline' and changed_greaterthan(node.last_changed, 2): - print "changed status from %s to down" % node.status - node.status = 'down' - # NOTE: do not reset last_changed, or you lose how long it's been down. - - if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30): - print "changed status from %s to down" % node.status - node.status = 'down' - # NOTE: do not reset last_changed, or you lose how long it's been down. - - if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60): - print "changed status from %s to down" % node.status - # NOTE: change an admin mode back into failboot after two months. - node.status = 'failboot' - node.last_changed = datetime.now() - - # extreme cases of offline nodes - if ( boot_state == 'disabled' or last_contact == None ) and \ - changed_greaterthan(node.last_changed, 2*30) and \ - node.status != 'down': - print "changed status from %s to down" % node.status - node.status = 'down' - node.last_changed = datetime.now() + node_state = rec.observed_status + if rec.plc_node_stats: + print rec.plc_node_stats + boot_state = rec.plc_node_stats['boot_state'] + run_level = rec.plc_node_stats['run_level'] + last_contact = rec.plc_node_stats['last_contact'] + node.plc_nodeid = rec.plc_node_stats['node_id'] + else: + boot_state = "unknown" + last_contact = None + + if boot_state == 'disable': boot_state = 'disabled' + if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot' + + if rec.plc_node_stats and len(rec.plc_node_stats['pcu_ids']) > 0: + node.haspcu = True + else: + node.haspcu = False + + node.firewall = rec.firewall + node.plc_siteid = rec.plc_node_stats['site_id'] + + # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need + # 'translations' into the node.status state + # 'BOOT' is a permanent state, but we want it to have a bit of + # hysteresis (less than 0.5 days) + ################################################################# + # "Initialize" the findbad states into nodebad status if they are not already set + + if node_state == 'DOWN': + if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \ + node.status != 'disabled': + # NOTE: if changed less than 2 months, then we can allow this. + # otherwise, apply 'down' status after greater than 2 months (below). + + print "changed status from %s to %s" % (node.status, boot_state) + node.status = boot_state + node.last_changed = datetime.now() + + if node.status not in ['offline', 'down', 'disabled']: + print "changed status from %s to offline" % node.status + node.status = 'offline' + node.last_changed = datetime.now() + + if node_state == 'DEBUG': + if boot_state != 'disabled' and boot_state != 'safeboot': + print "changed status from %s to failboot" % (node.status) + current_status = "failboot" + else: + print "changed status from %s to %s" % (node.status, boot_state) + current_status = boot_state + + if current_status != node.status and \ + current_status in ['failboot', 'disabled', 'safeboot']: + + node.status = current_status + node.last_changed = datetime.now() + + if node_state == 'BOOT' and node.status != 'online' and node.status != 'good': + print "changed status from %s to online" % node.status + node.status = 'online' + node.last_changed = datetime.now() + + ################################################################# + # Switch temporary hystersis states into their 'firm' states. + # online -> good after half a day + # offline -> down after two days + # failboot -> down after 30 days + # safeboot -> failboot after 60 days + # disabled -> down after 60 days + + if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5): + print "changed status from %s to good" % node.status + node.status = 'good' + # NOTE: do not reset last_changed, or you lose how long it's been up. + + if node.status == 'offline' and changed_greaterthan(node.last_changed, 2): + print "changed status from %s to down" % node.status + node.status = 'down' + # NOTE: do not reset last_changed, or you lose how long it's been down. + + if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30): + print "changed status from %s to down" % node.status + node.status = 'down' + # NOTE: do not reset last_changed, or you lose how long it's been down. + + if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60): + print "changed status from %s to down" % node.status + # NOTE: change an admin mode back into failboot after two months. + node.status = 'failboot' + node.last_changed = datetime.now() + + # extreme cases of offline nodes + if ( boot_state == 'disabled' or last_contact == None ) and \ + changed_greaterthan(node.last_changed, 2*30) and \ + node.status != 'down': + print "changed status from %s to down" % node.status + node.status = 'down' + node.last_changed = datetime.now() def checkAndRecordState(l_nodes, l_plcnodes): - global count - - for nodename in l_nodes: - - nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, - if_new_set={'status' : 'offline', - 'last_changed' : datetime.now()}) - nodehist.last_checked = datetime.now() - - try: - # Find the most recent record - noderec = FindbadNodeRecord.get_latest_by(hostname=nodename) - except: - print "COULD NOT FIND %s" % nodename - import traceback - email_exception() - print traceback.print_exc() - continue - - if not noderec: - print "none object for %s"% nodename - continue - - check_node_state(noderec, nodehist) - - count += 1 - print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple()))) - - # NOTE: this commits all pending operations to the DB. Do not remove. - session.flush() - - return True + global count + + for nodename in l_nodes: + + nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, + if_new_set={'status' : 'offline', + 'last_changed' : datetime.now()}) + nodehist.last_checked = datetime.now() + + try: + # Find the most recent record + noderec = FindbadNodeRecord.get_latest_by(hostname=nodename) + except: + print "COULD NOT FIND %s" % nodename + import traceback + email_exception() + print traceback.print_exc() + continue + + if not noderec: + print "none object for %s"% nodename + continue + + try: + check_node_state(noderec, nodehist) + except: + print "check_node_state failed %s" % nodename + import traceback + email_exception(nodename) + print traceback.print_exc() + continue + + count += 1 + print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple()))) + + # NOTE: this commits all pending operations to the DB. Do not remove. + session.flush() + + return True if __name__ == '__main__': - from monitor import parser as parsermodule - parser = parsermodule.getParser(['nodesets']) - parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False) - parser = parsermodule.getParser(['defaults'], parser) - config = parsermodule.parse_args(parser) - - try: - main2(config) - except Exception, err: - import traceback - print traceback.print_exc() - print "Exception: %s" % err - sys.exit(0) + from monitor import parser as parsermodule + parser = parsermodule.getParser(['nodesets']) + parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False) + parser = parsermodule.getParser(['defaults'], parser) + config = parsermodule.parse_args(parser) + + try: + main2(config) + except Exception, err: + import traceback + print traceback.print_exc() + print "Exception: %s" % err + sys.exit(0) diff --git a/commands/policy.py b/commands/policy.py index 992e578..30b522a 100755 --- a/commands/policy.py +++ b/commands/policy.py @@ -78,12 +78,13 @@ def main(hostnames, sitenames): node_count = 1 site_count = 1 #print "hosts: %s" % hostnames + print "apply-policy" for i,host in enumerate(hostnames): try: lb = plccache.plcdb_hn2lb[host] except: print "unknown host in plcdb_hn2lb %s" % host - email_exception(host) + email_exception("%s %s" % (i,host)) continue nodeblack = BlacklistRecord.get_by(hostname=host) @@ -105,7 +106,7 @@ def main(hostnames, sitenames): not found_within(recent_actions, 'online_notice', 0.5): # NOTE: chronicly flapping nodes will not get 'online' notices # since, they are never up long enough to be 'good'. - # NOTE: searching for down_notice proves that the node has + # NOTE: searching for down_notice proves that the node has # gone through a 'down' state first, rather than just # flapping through: good, offline, online, ... # @@ -139,7 +140,7 @@ def main(hostnames, sitenames): sitehist.attemptReboot(host) print "send message for host %s try_reboot" % host - if not fbpcu.test_is_ok() and \ + if False and not fbpcu.test_is_ok() and \ not found_within(recent_actions, 'pcuerror_notice', 3.0): args = {} @@ -159,7 +160,7 @@ def main(hostnames, sitenames): # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1) # will be false for a day after the above condition is satisfied - if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ + if False and nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.5) and \ not nodehist.firewall and \ found_between(recent_actions, 'try_reboot', 3.5, 1) and \ @@ -198,11 +199,11 @@ def main(hostnames, sitenames): sitehist.sendMessage('down_notice', hostname=host) print "send message for host %s down" % host - if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5): + #if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5): # send down node notice #email_exception(host, "firewall_notice") - sitehist.sendMessage('firewall_notice', hostname=host) - print "send message for host %s down" % host + # sitehist.sendMessage('firewall_notice', hostname=host) + # print "send message for host %s down" % host node_count = node_count + 1 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S') diff --git a/commands/shconfig.py b/commands/shconfig.py index ba2f5e5..0c599ab 100755 --- a/commands/shconfig.py +++ b/commands/shconfig.py @@ -4,5 +4,5 @@ from monitor import config for attr in dir(config): val = config.__getattribute__(attr) - if attr[0].isupper() and attr[1].isupper(): + if (attr[0].isupper() and attr[1].isupper()) or ('email' in attr): print '%s="%s" ' % (attr, val) diff --git a/config.d/init-bootman-sequence.py b/config.d/init-bootman-sequence.py index 59e0e8b..f261693 100755 --- a/config.d/init-bootman-sequence.py +++ b/config.d/init-bootman-sequence.py @@ -29,6 +29,7 @@ def getSequences(): "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done", "bminit-cfg-auth-getplc-implementerror-update-debug-done", "bminit-cfg-auth-authfail2-protoerror2-debug-done", + "bminit-cfg-auth-protoerror-protoerror2-exception-debug-validate-done", ]: sequences.update({n : "restart_bootmanager_boot"}) @@ -62,6 +63,7 @@ def getSequences(): "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done", "bminit-cfg-auth-getplc-update-installinit-validate-exception-missingkernel-debug-validate-done", "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-nospace-debug-validate-done", + "bminit-cfg-auth-getplc-update-installinit-validate-netcfg-disk-update4-update3-rebuildinitrd-update3-implementerror-nospace-debug-validate-done", "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-nospace-nospace-nospace-nospace-nospace-nospace-nospace-nospace-implementerror-nospace-debug-validate-done", ]: sequences.update({n : "restart_bootmanager_rins"}) diff --git a/cron.d/copy-logs.sh b/cron.d/copy-logs.sh deleted file mode 100755 index 5c13a00..0000000 --- a/cron.d/copy-logs.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -cd /usr/share/monitor -source agent.sh &> /dev/null - -rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/php.log /var/lib/monitor/httpd-log -rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*-* /var/lib/monitor/httpd-log -rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*error* /var/lib/monitor/httpd-log - -rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/*-filesystem* /var/lib/monitor/filesystem -rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm - -rsync -qv -az -e ssh root@amber.cs.princeton.edu:/vservers/db-current/var/log/*-filesystem* /var/lib/monitor/filesystem -rsync -qv -az -e ssh root@amber.cs.princeton.edu:/vservers/db-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm - -rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/*-filesystem* /var/lib/monitor/filesystem -rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm -rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/bm/ /var/lib/monitor/bmlogs/ diff --git a/monitor/bootman.py b/monitor/bootman.py index eac2761..2070e00 100755 --- a/monitor/bootman.py +++ b/monitor/bootman.py @@ -291,7 +291,7 @@ class PlanetLabSession: # COPY Rpyc files to host #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args - cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args + cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args if self.verbose: print cmd print cmd # TODO: Add timeout @@ -449,6 +449,7 @@ class DebugInterface: def getDiskSteps(self): steps = [ + ('scsierror2' , 'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'), ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'), ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'), ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'), diff --git a/monitor/common.py b/monitor/common.py index 2eb2bb7..5cf8151 100644 --- a/monitor/common.py +++ b/monitor/common.py @@ -282,3 +282,14 @@ def found_within(recent_actions, action_type, within): print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) ) return False + +class Time: + @classmethod + def dt_to_ts(cls, dt): + t = time.mktime(dt.timetuple()) + return t + + @classmethod + def ts_to_dt(cls, ts): + d = datetime.fromtimestamp(ts) + return d diff --git a/monitor/generic.py b/monitor/generic.py index 657c865..c1680d2 100644 --- a/monitor/generic.py +++ b/monitor/generic.py @@ -38,6 +38,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes): lb2hn = {} dsn = {} hn2lb = {} + exclude = [] for id in id2lb: if id2lb[id] not in lb2hn: lb2hn[id2lb[id]] = [] @@ -48,6 +49,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes): login_base = id2lb[node['site_id']] else: print >>sys.stderr, "%s has a foreign site_id %s" % (node['hostname'], node['site_id']) + exclude.append(node['hostname']) continue for i in id2lb: print i, " ", id2lb[i] @@ -66,7 +68,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes): dsn[login_base][hostname]['monitor'] = {} hn2lb[hostname] = login_base - return (dsn, hn2lb, lb2hn) + return (dsn, hn2lb, lb2hn, exclude) class Time: diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py index 97200d9..00632bf 100644 --- a/monitor/wrapper/plc.py +++ b/monitor/wrapper/plc.py @@ -28,6 +28,21 @@ except: # NOTE: this host is used by default when there are no auth files. XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/" +global_log_api = True +logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(name)s : %(message)s', + datefmt='%s %Y-%m-%dT%H:%M:%S', + filename='/usr/share/monitor/myops-api-log.log', + filemode='a') +apilog = logging.getLogger("api") + +def log_api_call(name, *params): + logstr = "%s(" %name + for x in params: + logstr += "%s," % x + logstr = logstr[:-1] + ")" + if global_log_api: apilog.debug(logstr) + logger = logging.getLogger("monitor") class Auth: @@ -75,7 +90,11 @@ class PLC: raise AssertionError("method does not exist") try: - return lambda *params : method(self.auth, *params) + def call_method(aut, *params): + if global_log_api: log_api_call(name, *params) + return method(aut, *params) + return lambda *params : call_method(self.auth, *params) + #return lambda *params : method(self.auth, *params) except xmlrpclib.ProtocolError: traceback.print_exc() global_error_count += 1 @@ -361,7 +380,7 @@ def suspendSiteSlices(loginbase): try: if not debug: if not isSliceExempt(slice): - api.AddSliceAttribute(auth.auth, slice, "enabled", "0") + api.AddSliceTag(auth.auth, slice, "enabled", "0") except Exception, exc: logger.info("suspendSlices: %s" % exc) @@ -389,11 +408,11 @@ def enableSiteSlices(loginbase): if len(slice_list) == 0: return slice_id = slice_list[0]['slice_id'] - l_attr = api.GetSliceAttributes(auth.auth, {'slice_id': slice_id}, None) + l_attr = api.GetSliceTags(auth.auth, {'slice_id': slice_id}, None) for attr in l_attr: - if "enabled" == attr['name'] and attr['value'] == "0": + if "enabled" == attr['tagname'] and attr['value'] == "0": logger.info("Deleted enable=0 attribute from slice %s" % slice) - api.DeleteSliceAttribute(auth.auth, attr['slice_attribute_id']) + api.DeleteSliceTag(auth.auth, attr['slice_tag_id']) except Exception, exc: logger.info("enableSiteSlices: %s" % exc) print "exception: %s" % exc @@ -411,7 +430,7 @@ def enableSlices(nodename): # api = xmlrpclib.Server(auth.server, verbose=False) # for slice in slices(siteId(nodename)): # logger.info("Suspending slice %s" % slice) -# api.SliceAttributeAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"}) +# api.SliceTagAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"}) # def enableSiteSliceCreation(loginbase): if isPendingSite(loginbase): @@ -427,7 +446,8 @@ def enableSiteSliceCreation(loginbase): site = api.GetSites(auth.auth, loginbase)[0] if site['enabled'] == False: logger.info("\tcalling UpdateSite(%s, enabled=True)" % loginbase) - api.UpdateSite(auth.auth, loginbase, {'enabled': True}) + if not isSiteExempt(loginbase): + api.UpdateSite(auth.auth, loginbase, {'enabled': True}) except Exception, exc: print "ERROR: enableSiteSliceCreation: %s" % exc logger.info("ERROR: enableSiteSliceCreation: %s" % exc) @@ -444,9 +464,9 @@ def areSlicesEnabled(site): return None for slice in slice_list: slice_id = slice['slice_id'] - l_attr = api.GetSliceAttributes({'slice_id': slice_id}) + l_attr = api.GetSliceTags({'slice_id': slice_id}) for attr in l_attr: - if "enabled" == attr['name'] and attr['value'] == "0": + if "enabled" == attr['tagname'] and attr['value'] == "0": return False except Exception, exc: diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py index 60dbd22..4778a7d 100755 --- a/monitor/wrapper/plccache.py +++ b/monitor/wrapper/plccache.py @@ -5,9 +5,9 @@ from monitor.wrapper import plc from monitor.generic import * from monitor.database.info.model import * from monitor import database +from monitor import config import profile - l_sites = None l_nodes = None l_pcus = None @@ -16,7 +16,7 @@ plcdb_hn2lb = None plcdb_lb2hn = None plcdb_id2lb = None -class CachedPLC(PLC): +class CachedPLC(plc.PLC): def _param_to_str(self, name, *params): fields = len(params) @@ -98,11 +98,13 @@ def init(): print >>sys.stderr, "building id2lb" (d_sites,id2lb) = dsites_from_lsites_id(l_sites) print >>sys.stderr, "building lb2hn" - (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes) + (plcdb, hn2lb, lb2hn, exclude) = dsn_from_dsln(d_sites, id2lb, l_nodes) plcdb_hn2lb = hn2lb plcdb_lb2hn = lb2hn plcdb_id2lb = id2lb + + l_nodes = filter(lambda x: x['hostname'] not in exclude, l_nodes) return @@ -146,6 +148,13 @@ def deleteExtra(l_plc, objectClass=PlcSite, dbKey='loginbase', plcKey='login_bas dbobj = objectClass.get_by(**{dbKey : obj}) dbobj.delete() +def conv(s): + # strip non-ascii characters to prvent errors + r = s + if type(s) in (str,unicode): + r = "".join([x for x in s if ord(x) < 128]) + return r + def sync(): l_sites = plc.api.GetSites({'peer_id':None}, ['login_base', 'site_id', 'abbreviated_name', 'latitude', @@ -172,8 +181,8 @@ def sync(): dbpcu = PlcPCU2.findby_or_create(pcu_id=pcu['pcu_id']) dbpcu.date_checked = datetime.now() for key in pcu.keys(): - print >>sys.stderr, "setting %s = %s" % (key, pcu[key]) - setattr(dbpcu, key, pcu[key]) + print >>sys.stderr, "setting %s = %s" % (key, conv(pcu[key])) + setattr(dbpcu, key, conv(pcu[key])) deleteExtra(l_pcus, PlcPCU2, 'pcu_id', 'pcu_id') deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id') diff --git a/web/MonitorWeb/monitorweb/static/images/favicon.ico b/web/MonitorWeb/monitorweb/static/images/favicon.ico index 332557bc307647601389c14939be0671c62efcd7..eb03967aeab2e606e2723f5bbd3fe27e2f78abd2 100644 GIT binary patch literal 571 zcmbV}S5H#`0EKT61c^q}ihJN5xEB>v3JT&X?mcjiKKKhni*J|22Llo{KxvT9ixli$ zS6WH~5mI;{u@wVt32CvnM5q?pWBV8Ua&mHB&dWK)B}F-$vJ4`FnB)_2%87Cfp#cX~ zRSh3%wKa9dx=%)Fy|AfCM~QUJje3d9)ZA=t5gU{elje)btT(@@rK{_$Z-tH;A^lO( zQ?2NIFYj%jI-2ATnVgm>tQ2JtQw|B$MX4N=uK#muduy9rsqRv$dvtBI(O}c-I$KQr zB8#h~+a+rEHCjF5&YueBFKJ&$>W0h>#URuJUe)(er3bAZsJt+YOlkwrj=>a&7y?ma zFlJ<6VqhKz-ob8bci1~<_n39i-`ziK_XX`Ephscx!@@-A-*Lejir6M$|AQ_j(mjoD zamCO#6zq+`8HF>RV;BHjzz<*$7!G47?tmBY0fF(bKMeoWGy{ZyiK*!cAc_Qu9si50 zzsW3u;1k)2cH$EvwdN&XP$8H(e{tsW*Cc=arE{n9vKLeevhF`Pv3t+qohg-TkKHcG zIhfBaU6etto$gxSR1M;3OpD44r Wi@E9P=@#CruUuAU9-Ey;s3` z%*@QbzP_%muFlTRuXR$eu&|(@pvlR}#>U39w6vL-ncLgj_sv?eva+bCsJgnky}iAU zFbC)7=g`p5RFNg6++fty)Q5+M+-+j?^z?2p6YHcgZ*Feo?Dc7BY3=j=*}=5d*4FCk z>e$%WkZD%6mVNm6_)$?&^5)=2M@NQ3Cg0!R!<&WI$-#kvfuvL_*WKs;|Nrdl>}FEk`SkLhzMN`o zagv9Jfw4^C=I!{dq?(?k?eOvV@$SCE%$}mDz_6m>Xz`*|g{`&g*;^N}L z#?NMGZT|oN|Nj2#@%%wTLjL~$`T6<%{r&p;`}Xzq`~Cj#^77Hr)zs3`*VNL(wyegx zwdmZ{ok<+e(bVDL;o;)r_4W1k_V)Mo_07%AQ&Us!?(OO6>Few3%*VrGV`a+9-gXJ=SgSXx?J$iKM${{4J^g{GvV9UL4aBO}Yu&APj}r>CXM&&=!Z@#5p;>gwsi zzrDi2z?Pb!A^sIZa%Ew3Wn>_CX>@2HRA^-&M@dak04x9i000mG5C8xO{s5aaVb~md%O-2S^lHv64-i!($A}fp9Pe9y&;49CUHQ z;Fz%>4uYhycVZEqMJMv5!5PGul{9Hmczdy=sTptC(pZr~t0;$iDIKRaU;LNRWRnT0wg0}PMk>e#R#Md8k