Many small updates and fixes:

author Stephen Soltesz <soltesz@cs.princeton.edu>

Wed, 13 Apr 2011 19:31:43 +0000 (19:31 +0000)

committer Stephen Soltesz <soltesz@cs.princeton.edu>

Wed, 13 Apr 2011 19:31:43 +0000 (19:31 +0000)
author Stephen Soltesz <soltesz@cs.princeton.edu>
Wed, 13 Apr 2011 19:31:43 +0000 (19:31 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Wed, 13 Apr 2011 19:31:43 +0000 (19:31 +0000)
diff --git a/Monitor.spec b/Monitor.spec

index 61fe0f1..32ecb44 100644 (file)
--- a/Monitor.spec
+++ b/Monitor.spec
@@ -350,10 +350,12 @@ chkconfig --add monitor
  chkconfig monitor on
  
  %post runlevelagent
-chkconfig --add monitor-runlevelagent
-chkconfig monitor-runlevelagent on
-if [ "$PL_BOOTCD" != "1" ] ; then
-       service monitor-runlevelagent restart
+if [ -f /etc/planetlab/node_id ] ; then
+    chkconfig --add monitor-runlevelagent
+    chkconfig monitor-runlevelagent on
+    if [ "$PL_BOOTCD" != "1" ] ; then
+        service monitor-runlevelagent restart
+    fi
  fi
  
  
diff --git a/commands/bootman.py b/commands/bootman.py

index 347199d..930c8fc 100755 (executable)
--- a/commands/bootman.py
+++ b/commands/bootman.py
@@ -13,6 +13,7 @@ import traceback
  import subprocess
  from sets import Set
  from monitor.bootman import *
+from monitor.util import file 
  
  # MAIN -------------------------------------------------------------------
  
@@ -41,7 +42,7 @@ def main():
         config = parsermodule.parse_args(parser)
  
         if config.nodelist:
-               nodes = config.getListFromFile(config.nodelist)
+               nodes = file.getListFromFile(config.nodelist)
         elif config.node:
                 nodes = [ config.node ]
         else:
diff --git a/commands/checksync.py b/commands/checksync.py

index d92d60f..494f5f7 100755 (executable)
--- a/commands/checksync.py
+++ b/commands/checksync.py
@@ -20,7 +20,7 @@ if True:
  
  
  
-if True:
+if False:
      fbquery = HistoryNodeRecord.query.all()
      hostnames = [ n.hostname for n in fbquery ]
  
@@ -35,7 +35,7 @@ if True:
      session.flush()
  
  
-if True:
+if False:
      fbquery = HistoryPCURecord.query.all()
      pcus = [ n.plc_pcuid for n in fbquery ]
  
diff --git a/commands/nodebad.py b/commands/nodebad.py

index dc86664..d1b2d35 100755 (executable)
--- a/commands/nodebad.py
+++ b/commands/nodebad.py
@@ -6,9 +6,9 @@ import string
  import time
  from datetime import datetime,timedelta
  
-from monitor.query import verify,query_to_dict,node_select
  
  from monitor.common import *
+from monitor.query import verify,query_to_dict,node_select
  
  from monitor import config
  from monitor.wrapper import plc,plccache
@@ -23,164 +23,171 @@ api = plc.getAuthAPI()
  round = 1
  count = 0
  def main():
-       main2(config)
+    main2(config)
  
  def main2(config):
  
-       l_plcnodes = plccache.l_nodes
-       l_nodes = get_nodeset(config)
-       
-       checkAndRecordState(l_nodes, l_plcnodes)
+    l_plcnodes = plccache.l_nodes
+    l_nodes = get_nodeset(config)
+    
+    checkAndRecordState(l_nodes, l_plcnodes)
  
  # Node states:
  
  def check_node_state(rec, node):
  
-       node_state = rec.observed_status
-       if rec.plc_node_stats:
-               print rec.plc_node_stats
-               boot_state = rec.plc_node_stats['boot_state']
-               run_level = rec.plc_node_stats['run_level']
-               last_contact = rec.plc_node_stats['last_contact']
-               node.plc_nodeid = rec.plc_node_stats['node_id']
-       else:
-               boot_state = "unknown"
-               last_contact = None
-
-       if boot_state == 'disable': boot_state = 'disabled'
-       if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot'
-
-       if len(rec.plc_node_stats['pcu_ids']) > 0:
-               node.haspcu = True
-       else:
-               node.haspcu = False
-
-       node.firewall = rec.firewall
-       node.plc_siteid = rec.plc_node_stats['site_id']
-
-       # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
-       #                       'translations' into the node.status state
-       #               'BOOT' is a permanent state, but we want it to have a bit of
-       #                       hysteresis (less than 0.5 days)
-       #################################################################
-       # "Initialize" the findbad states into nodebad status if they are not already set
-
-       if node_state == 'DOWN':
-               if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \
-                       node.status != 'disabled':
-                       # NOTE: if changed less than 2 months, then we can allow this. 
-                       # otherwise, apply 'down' status after greater than 2 months (below).
-
-                       print "changed status from %s to %s" % (node.status, boot_state)
-                       node.status = boot_state
-                       node.last_changed = datetime.now()
-
-               if node.status not in ['offline', 'down', 'disabled']:
-                       print "changed status from %s to offline" % node.status
-                       node.status = 'offline'
-                       node.last_changed = datetime.now()
-
-       if node_state == 'DEBUG':
-               if boot_state != 'disabled' and boot_state != 'safeboot':
-                       print "changed status from %s to failboot" % (node.status)
-                       current_status = "failboot"
-               else:
-                       print "changed status from %s to %s" % (node.status, boot_state)
-                       current_status = boot_state
-
-               if current_status != node.status and \
-                       current_status in ['failboot', 'disabled', 'safeboot']:
-
-                       node.status = current_status
-                       node.last_changed = datetime.now()
-
-       if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
-               print "changed status from %s to online" % node.status
-               node.status = 'online'
-               node.last_changed = datetime.now()
-
-       #################################################################
-       # Switch temporary hystersis states into their 'firm' states.
-       #         online -> good                after half a day
-       #         offline -> down               after two days
-       #         failboot -> down  after 30 days
-       #         safeboot -> failboot after 60 days
-       #         disabled -> down              after 60 days
-
-       if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
-               print "changed status from %s to good" % node.status
-               node.status = 'good'
-               # NOTE: do not reset last_changed, or you lose how long it's been up.
-
-       if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
-               print "changed status from %s to down" % node.status
-               node.status = 'down'
-               # NOTE: do not reset last_changed, or you lose how long it's been down.
-
-       if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
-               print "changed status from %s to down" % node.status
-               node.status = 'down'
-               # NOTE: do not reset last_changed, or you lose how long it's been down.
-
-       if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
-               print "changed status from %s to down" % node.status
-               # NOTE: change an admin mode back into failboot after two months.
-               node.status = 'failboot'
-               node.last_changed = datetime.now()
-
-       # extreme cases of offline nodes
-       if ( boot_state == 'disabled' or last_contact == None ) and \
-                       changed_greaterthan(node.last_changed, 2*30) and \
-                       node.status != 'down':
-               print "changed status from %s to down" % node.status
-               node.status = 'down'
-               node.last_changed = datetime.now()
+    node_state = rec.observed_status
+    if rec.plc_node_stats:
+        print rec.plc_node_stats
+        boot_state = rec.plc_node_stats['boot_state']
+        run_level = rec.plc_node_stats['run_level']
+        last_contact = rec.plc_node_stats['last_contact']
+        node.plc_nodeid = rec.plc_node_stats['node_id']
+    else:
+        boot_state = "unknown"
+        last_contact = None
+
+    if boot_state == 'disable': boot_state = 'disabled'
+    if boot_state == 'diag' or boot_state == 'diagnose': boot_state = 'safeboot'
+
+    if rec.plc_node_stats and len(rec.plc_node_stats['pcu_ids']) > 0:
+        node.haspcu = True
+    else:
+        node.haspcu = False
+
+    node.firewall = rec.firewall
+    node.plc_siteid = rec.plc_node_stats['site_id']
+
+    # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
+    #             'translations' into the node.status state
+    #        'BOOT' is a permanent state, but we want it to have a bit of
+    #            hysteresis (less than 0.5 days)
+    #################################################################
+    # "Initialize" the findbad states into nodebad status if they are not already set
+
+    if node_state == 'DOWN':
+        if boot_state == 'disabled' and changed_lessthan(node.last_changed, 60) and \
+            node.status != 'disabled':
+            # NOTE: if changed less than 2 months, then we can allow this. 
+            # otherwise, apply 'down' status after greater than 2 months (below).
+
+            print "changed status from %s to %s" % (node.status, boot_state)
+            node.status = boot_state
+            node.last_changed = datetime.now()
+
+        if node.status not in ['offline', 'down', 'disabled']:
+            print "changed status from %s to offline" % node.status
+            node.status = 'offline'
+            node.last_changed = datetime.now()
+
+    if node_state == 'DEBUG':
+        if boot_state != 'disabled' and boot_state != 'safeboot':
+            print "changed status from %s to failboot" % (node.status)
+            current_status = "failboot"
+        else:
+            print "changed status from %s to %s" % (node.status, boot_state)
+            current_status = boot_state
+
+        if current_status != node.status and \
+            current_status in ['failboot', 'disabled', 'safeboot']:
+
+            node.status = current_status
+            node.last_changed = datetime.now()
+
+    if node_state == 'BOOT' and node.status != 'online' and node.status != 'good':
+        print "changed status from %s to online" % node.status
+        node.status = 'online'
+        node.last_changed = datetime.now()
+
+    #################################################################
+    # Switch temporary hysteresis states into their 'firm' states.
+    #      online -> good        after half a day
+    #      offline -> down        after two days
+    #      failboot -> down  after 30 days
+    #      safeboot -> failboot after 60 days
+    #      disabled -> down        after 60 days
+
+    if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
+        print "changed status from %s to good" % node.status
+        node.status = 'good'
+        # NOTE: do not reset last_changed, or you lose how long it's been up.
+
+    if node.status == 'offline' and changed_greaterthan(node.last_changed, 2):
+        print "changed status from %s to down" % node.status
+        node.status = 'down'
+        # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+    if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
+        print "changed status from %s to down" % node.status
+        node.status = 'down'
+        # NOTE: do not reset last_changed, or you lose how long it's been down.
+
+    if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
+        print "changed status from %s to down" % node.status
+        # NOTE: change an admin mode back into failboot after two months.
+        node.status = 'failboot'
+        node.last_changed = datetime.now()
+
+    # extreme cases of offline nodes
+    if ( boot_state == 'disabled' or last_contact == None ) and \
+            changed_greaterthan(node.last_changed, 2*30) and \
+            node.status != 'down':
+        print "changed status from %s to down" % node.status
+        node.status = 'down'
+        node.last_changed = datetime.now()
  
  def checkAndRecordState(l_nodes, l_plcnodes):
-       global count
-
-       for nodename in l_nodes:
-
-               nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
-                                                       if_new_set={'status' : 'offline', 
-                                                                               'last_changed' : datetime.now()})
-               nodehist.last_checked = datetime.now()
-
-               try:
-                       # Find the most recent record
-                       noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
-               except:
-                       print "COULD NOT FIND %s" % nodename
-                       import traceback
-                       email_exception()
-                       print traceback.print_exc()
-                       continue
-
-               if not noderec:
-                       print "none object for %s"% nodename
-                       continue
-
-               check_node_state(noderec, nodehist)
-
-               count += 1
-               print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
-
-       # NOTE: this commits all pending operations to the DB.  Do not remove. 
-       session.flush()
-
-       return True
+    global count
+
+    for nodename in l_nodes:
+
+        nodehist = HistoryNodeRecord.findby_or_create(hostname=nodename, 
+                            if_new_set={'status' : 'offline', 
+                                        'last_changed' : datetime.now()})
+        nodehist.last_checked = datetime.now()
+
+        try:
+            # Find the most recent record
+            noderec = FindbadNodeRecord.get_latest_by(hostname=nodename)
+        except:
+            print "COULD NOT FIND %s" % nodename
+            import traceback
+            email_exception()
+            print traceback.print_exc()
+            continue
+
+        if not noderec:
+            print "none object for %s"% nodename
+            continue
+
+        try:
+            check_node_state(noderec, nodehist)
+        except:
+            print "check_node_state failed %s" % nodename
+            import traceback
+            email_exception(nodename)
+            print traceback.print_exc()
+            continue
+
+        count += 1
+        print "%d %35s %s since(%s)" % (count, nodename, nodehist.status, diff_time(time.mktime(nodehist.last_changed.timetuple())))
+
+    # NOTE: this commits all pending operations to the DB.  Do not remove. 
+    session.flush()
+
+    return True
  
  if __name__ == '__main__':
-       from monitor import parser as parsermodule
-       parser = parsermodule.getParser(['nodesets'])
-       parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
-       parser = parsermodule.getParser(['defaults'], parser)
-       config = parsermodule.parse_args(parser)
-
-       try:
-               main2(config)
-       except Exception, err:
-               import traceback
-               print traceback.print_exc()
-               print "Exception: %s" % err
-               sys.exit(0)
+    from monitor import parser as parsermodule
+    parser = parsermodule.getParser(['nodesets'])
+    parser.set_defaults(filename=None, node=None, nodeselect=False, nodegroup=None, cachenodes=False)
+    parser = parsermodule.getParser(['defaults'], parser)
+    config = parsermodule.parse_args(parser)
+
+    try:
+        main2(config)
+    except Exception, err:
+        import traceback
+        print traceback.print_exc()
+        print "Exception: %s" % err
+        sys.exit(0)
diff --git a/commands/policy.py b/commands/policy.py

index 992e578..30b522a 100755 (executable)
--- a/commands/policy.py
+++ b/commands/policy.py
@@ -78,12 +78,13 @@ def main(hostnames, sitenames):
         node_count = 1
         site_count = 1
         #print "hosts: %s" % hostnames
+       print "apply-policy"
         for i,host in enumerate(hostnames):
                 try:
                         lb = plccache.plcdb_hn2lb[host]
                 except:
                         print "unknown host in plcdb_hn2lb %s" % host
-                       email_exception(host)
+                       email_exception("%s %s" % (i,host))
                         continue
  
                 nodeblack = BlacklistRecord.get_by(hostname=host)
@@ -105,7 +106,7 @@ def main(hostnames, sitenames):
                         not found_within(recent_actions, 'online_notice', 0.5):
                                 # NOTE: chronicly flapping nodes will not get 'online' notices
                                 #               since, they are never up long enough to be 'good'.
-                           # NOTE: searching for down_notice proves that the node has
+                               # NOTE: searching for down_notice proves that the node has
                                 #               gone through a 'down' state first, rather than just
                                 #               flapping through: good, offline, online, ...
                                 #       
@@ -139,7 +140,7 @@ def main(hostnames, sitenames):
  
                                 sitehist.attemptReboot(host)
                                 print "send message for host %s try_reboot" % host
-                               if not fbpcu.test_is_ok() and \
+                               if False and not fbpcu.test_is_ok() and \
                                         not found_within(recent_actions, 'pcuerror_notice', 3.0):
  
                                         args = {}
@@ -159,7 +160,7 @@ def main(hostnames, sitenames):
  
                 # NOTE: non-intuitive is that found_between(try_reboot, 3.5, 1)
                 #               will be false for a day after the above condition is satisfied
-               if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
+               if False and nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
                         changed_greaterthan(nodehist.last_changed,1.5) and \
                         not nodehist.firewall and \
                         found_between(recent_actions, 'try_reboot', 3.5, 1) and \
@@ -198,11 +199,11 @@ def main(hostnames, sitenames):
                                         sitehist.sendMessage('down_notice', hostname=host)
                                         print "send message for host %s down" % host
  
-                               if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
+                               #if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
                                         # send down node notice
                                         #email_exception(host, "firewall_notice")
-                                       sitehist.sendMessage('firewall_notice', hostname=host)
-                                       print "send message for host %s down" % host
+                               #       sitehist.sendMessage('firewall_notice', hostname=host)
+                               #       print "send message for host %s down" % host
  
                 node_count = node_count + 1
                 print "time: ", time.strftime('%Y-%m-%d %H:%M:%S')
diff --git a/commands/shconfig.py b/commands/shconfig.py

index ba2f5e5..0c599ab 100755 (executable)
--- a/commands/shconfig.py
+++ b/commands/shconfig.py
@@ -4,5 +4,5 @@ from monitor import config
  
  for attr in dir(config):
         val = config.__getattribute__(attr)
-       if attr[0].isupper() and attr[1].isupper():
+       if (attr[0].isupper() and attr[1].isupper()) or ('email' in attr):
                 print '%s="%s" ' % (attr, val)
diff --git a/config.d/init-bootman-sequence.py b/config.d/init-bootman-sequence.py

index 59e0e8b..f261693 100755 (executable)
--- a/config.d/init-bootman-sequence.py
+++ b/config.d/init-bootman-sequence.py
@@ -29,6 +29,7 @@ def getSequences():
                                 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                                 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
                                 "bminit-cfg-auth-authfail2-protoerror2-debug-done",
+                "bminit-cfg-auth-protoerror-protoerror2-exception-debug-validate-done",
                                 ]:
                         sequences.update({n : "restart_bootmanager_boot"})
  
@@ -62,6 +63,7 @@ def getSequences():
                                 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-missingkernel-debug-validate-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-nospace-debug-validate-done",
+                "bminit-cfg-auth-getplc-update-installinit-validate-netcfg-disk-update4-update3-rebuildinitrd-update3-implementerror-nospace-debug-validate-done",
                                 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-nospace-nospace-nospace-nospace-nospace-nospace-nospace-nospace-implementerror-nospace-debug-validate-done",
                                 ]:
                         sequences.update({n : "restart_bootmanager_rins"})
diff --git a/cron.d/copy-logs.sh b/cron.d/copy-logs.sh

deleted file mode 100755 (executable)

index 5c13a00..0000000
--- a/cron.d/copy-logs.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-cd /usr/share/monitor
-source agent.sh &> /dev/null
-
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/php.log /var/lib/monitor/httpd-log
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*-* /var/lib/monitor/httpd-log
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/httpd/*error* /var/lib/monitor/httpd-log
-
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/*-filesystem* /var/lib/monitor/filesystem
-rsync -qv -az -e ssh root@chloe.cs.princeton.edu:/vservers/www-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm
-
-rsync -qv -az -e ssh root@amber.cs.princeton.edu:/vservers/db-current/var/log/*-filesystem* /var/lib/monitor/filesystem
-rsync -qv -az -e ssh root@amber.cs.princeton.edu:/vservers/db-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm
-
-rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/*-filesystem* /var/lib/monitor/filesystem
-rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/*-checkrpm* /var/lib/monitor/checkrpm
-rsync -qv -az -e ssh root@janine.cs.princeton.edu:/vservers/boot-current/var/log/bm/ /var/lib/monitor/bmlogs/
diff --git a/monitor/bootman.py b/monitor/bootman.py

index eac2761..2070e00 100755 (executable)
--- a/monitor/bootman.py
+++ b/monitor/bootman.py
@@ -291,7 +291,7 @@ class PlanetLabSession:
  
                 # COPY Rpyc files to host
                 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
-               cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
+               cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
                 if self.verbose: print cmd
                 print cmd
                 # TODO: Add timeout
@@ -449,6 +449,7 @@ class DebugInterface:
  
         def getDiskSteps(self):
                 steps = [
+                       ('scsierror2' , 'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'),
                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
diff --git a/monitor/common.py b/monitor/common.py

index 2eb2bb7..5cf8151 100644 (file)
--- a/monitor/common.py
+++ b/monitor/common.py
@@ -282,3 +282,14 @@ def found_within(recent_actions, action_type, within):
         print "%s NOT found_within %s in recent_actions" % (action_type, timedelta(within) )
         return False
         
+
+class Time:
+    @classmethod
+    def dt_to_ts(cls, dt):
+        t = time.mktime(dt.timetuple())
+        return t
+
+    @classmethod
+    def ts_to_dt(cls, ts):
+        d = datetime.fromtimestamp(ts)
+        return d
diff --git a/monitor/generic.py b/monitor/generic.py

index 657c865..c1680d2 100644 (file)
--- a/monitor/generic.py
+++ b/monitor/generic.py
@@ -38,6 +38,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
         lb2hn = {}
         dsn = {}
         hn2lb = {}
+       exclude = []
         for id in id2lb:
                 if id2lb[id] not in lb2hn:
                         lb2hn[id2lb[id]] = []
@@ -48,6 +49,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
                         login_base = id2lb[node['site_id']]
                 else:
                         print >>sys.stderr, "%s has a foreign site_id %s" % (node['hostname'], node['site_id'])
+                       exclude.append(node['hostname'])
                         continue
                         for i in id2lb:
                                 print i, " ", id2lb[i]
@@ -66,7 +68,7 @@ def dsn_from_dsln(d_sites, id2lb, l_nodes):
                 dsn[login_base][hostname]['monitor'] = {}
  
                 hn2lb[hostname] = login_base
-       return (dsn, hn2lb, lb2hn)
+       return (dsn, hn2lb, lb2hn, exclude)
  
  
  class Time:
diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py

index 97200d9..00632bf 100644 (file)
--- a/monitor/wrapper/plc.py
+++ b/monitor/wrapper/plc.py
@@ -28,6 +28,21 @@ except:
         # NOTE: this host is used by default when there are no auth files.
         XMLRPC_SERVER="https://boot.planet-lab.org/PLCAPI/"
  
+global_log_api = True
+logging.basicConfig(level=logging.DEBUG,
+                    format='%(asctime)s %(levelname)s %(name)s : %(message)s',
+                    datefmt='%s %Y-%m-%dT%H:%M:%S',
+                    filename='/usr/share/monitor/myops-api-log.log',
+                    filemode='a')
+apilog = logging.getLogger("api")
+
+def log_api_call(name, *params):
+    logstr = "%s(" %name
+    for x in params:
+        logstr += "%s," % x
+    logstr = logstr[:-1] + ")"
+    if global_log_api: apilog.debug(logstr)
+
  logger = logging.getLogger("monitor")
         
  class Auth:
@@ -75,7 +90,11 @@ class PLC:
                         raise AssertionError("method does not exist")
  
                 try:
-                       return lambda *params : method(self.auth, *params)
+                       def call_method(aut, *params):
+                               if global_log_api: log_api_call(name, *params)
+                               return method(aut, *params)
+                       return lambda *params : call_method(self.auth, *params)
+                       #return lambda *params : method(self.auth, *params)
                 except xmlrpclib.ProtocolError:
                         traceback.print_exc()
                         global_error_count += 1
@@ -361,7 +380,7 @@ def suspendSiteSlices(loginbase):
                 try:
                         if not debug:
                             if not isSliceExempt(slice):
-                                   api.AddSliceAttribute(auth.auth, slice, "enabled", "0")
+                                   api.AddSliceTag(auth.auth, slice, "enabled", "0")
                 except Exception, exc:
                         logger.info("suspendSlices:  %s" % exc)
  
@@ -389,11 +408,11 @@ def enableSiteSlices(loginbase):
                                 if len(slice_list) == 0:
                                         return
                                 slice_id = slice_list[0]['slice_id']
-                               l_attr = api.GetSliceAttributes(auth.auth, {'slice_id': slice_id}, None)
+                               l_attr = api.GetSliceTags(auth.auth, {'slice_id': slice_id}, None)
                                 for attr in l_attr:
-                                       if "enabled" == attr['name'] and attr['value'] == "0":
+                                       if "enabled" == attr['tagname'] and attr['value'] == "0":
                                                 logger.info("Deleted enable=0 attribute from slice %s" % slice)
-                                               api.DeleteSliceAttribute(auth.auth, attr['slice_attribute_id'])
+                                               api.DeleteSliceTag(auth.auth, attr['slice_tag_id'])
                 except Exception, exc:
                         logger.info("enableSiteSlices: %s" % exc)
                         print "exception: %s" % exc
@@ -411,7 +430,7 @@ def enableSlices(nodename):
  #      api = xmlrpclib.Server(auth.server, verbose=False)
  #      for slice in  slices(siteId(nodename)):
  #              logger.info("Suspending slice %s" % slice)
-#              api.SliceAttributeAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"})
+#              api.SliceTagAdd(auth.auth, slice, "plc_slice_state", {"state" : "suspended"})
  #
  def enableSiteSliceCreation(loginbase):
         if isPendingSite(loginbase):
@@ -427,7 +446,8 @@ def enableSiteSliceCreation(loginbase):
                         site = api.GetSites(auth.auth, loginbase)[0]
                         if site['enabled'] == False:
                                 logger.info("\tcalling UpdateSite(%s, enabled=True)" % loginbase)
-                               api.UpdateSite(auth.auth, loginbase, {'enabled': True})
+                               if not isSiteExempt(loginbase):
+                                       api.UpdateSite(auth.auth, loginbase, {'enabled': True})
         except Exception, exc:
                 print "ERROR: enableSiteSliceCreation:  %s" % exc
                 logger.info("ERROR: enableSiteSliceCreation:  %s" % exc)
@@ -444,9 +464,9 @@ def areSlicesEnabled(site):
                         return None
                 for slice in slice_list:
                         slice_id = slice['slice_id']
-                       l_attr = api.GetSliceAttributes({'slice_id': slice_id})
+                       l_attr = api.GetSliceTags({'slice_id': slice_id})
                         for attr in l_attr:
-                               if "enabled" == attr['name'] and attr['value'] == "0":
+                               if "enabled" == attr['tagname'] and attr['value'] == "0":
                                         return False
  
         except Exception, exc:
diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py

index 60dbd22..4778a7d 100755 (executable)
--- a/monitor/wrapper/plccache.py
+++ b/monitor/wrapper/plccache.py
@@ -5,9 +5,9 @@ from monitor.wrapper import plc
  from monitor.generic import *
  from monitor.database.info.model import *
  from monitor import database
+from monitor import config
  import profile
  
-
  l_sites = None
  l_nodes = None
  l_pcus = None
@@ -16,7 +16,7 @@ plcdb_hn2lb = None
  plcdb_lb2hn = None
  plcdb_id2lb = None
  
-class CachedPLC(PLC):
+class CachedPLC(plc.PLC):
  
         def _param_to_str(self, name, *params):
                 fields = len(params)
@@ -98,11 +98,13 @@ def init():
         print >>sys.stderr, "building id2lb"
         (d_sites,id2lb) = dsites_from_lsites_id(l_sites)
         print >>sys.stderr, "building lb2hn"
-       (plcdb, hn2lb, lb2hn) = dsn_from_dsln(d_sites, id2lb, l_nodes)
+       (plcdb, hn2lb, lb2hn, exclude) = dsn_from_dsln(d_sites, id2lb, l_nodes)
  
         plcdb_hn2lb = hn2lb
         plcdb_lb2hn = lb2hn
         plcdb_id2lb = id2lb
+
+       l_nodes = filter(lambda x: x['hostname'] not in exclude, l_nodes)
         
         return
  
@@ -146,6 +148,13 @@ def deleteExtra(l_plc, objectClass=PlcSite, dbKey='loginbase', plcKey='login_bas
                 dbobj = objectClass.get_by(**{dbKey : obj})
                 dbobj.delete()
  
+def conv(s):
+    # strip non-ascii characters to prevent errors
+    r = s
+    if type(s) in (str,unicode):
+        r = "".join([x for x in s if ord(x) < 128])
+    return r
+
  def sync():
         l_sites = plc.api.GetSites({'peer_id':None}, 
                                                 ['login_base', 'site_id', 'abbreviated_name', 'latitude', 
@@ -172,8 +181,8 @@ def sync():
                 dbpcu = PlcPCU2.findby_or_create(pcu_id=pcu['pcu_id'])
                 dbpcu.date_checked = datetime.now()
                 for key in pcu.keys():
-                       print >>sys.stderr, "setting %s  = %s" % (key, pcu[key])
-                       setattr(dbpcu, key, pcu[key])
+                       print >>sys.stderr, "setting %s  = %s" % (key, conv(pcu[key]))
+                       setattr(dbpcu, key, conv(pcu[key]))
  
         deleteExtra(l_pcus, PlcPCU2, 'pcu_id', 'pcu_id')
         deleteExtra(l_pcus, HistoryPCURecord, 'plc_pcuid', 'pcu_id')
diff --git a/web/MonitorWeb/monitorweb/static/images/favicon.ico b/web/MonitorWeb/monitorweb/static/images/favicon.ico

index 332557b..eb03967 100644 (file)

Binary files a/web/MonitorWeb/monitorweb/static/images/favicon.ico and b/web/MonitorWeb/monitorweb/static/images/favicon.ico differ
author	Stephen Soltesz <soltesz@cs.princeton.edu>
	Wed, 13 Apr 2011 19:31:43 +0000 (19:31 +0000)
committer	Stephen Soltesz <soltesz@cs.princeton.edu>
	Wed, 13 Apr 2011 19:31:43 +0000 (19:31 +0000)
Monitor.spec		patch \| blob \| history
commands/bootman.py		patch \| blob \| history
commands/checksync.py		patch \| blob \| history
commands/nodebad.py		patch \| blob \| history
commands/policy.py		patch \| blob \| history
commands/shconfig.py		patch \| blob \| history
config.d/init-bootman-sequence.py		patch \| blob \| history
cron.d/copy-logs.sh	[deleted file]	patch \| blob \| history
monitor/bootman.py		patch \| blob \| history
monitor/common.py		patch \| blob \| history
monitor/generic.py		patch \| blob \| history
monitor/wrapper/plc.py		patch \| blob \| history
monitor/wrapper/plccache.py		patch \| blob \| history
web/MonitorWeb/monitorweb/static/images/favicon.ico		patch \| blob \| history