changed 'monitordebug' to 'failboot'
author    Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 17 Aug 2009 22:03:47 +0000 (22:03 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 17 Aug 2009 22:03:47 +0000 (22:03 +0000)
added run_level to nodebad record
added site_id to database
added DRAC6 support
added a BayTechGeorgeTown class to BayTech to catch Georgetown's wacky PCU
add extra sequences to bootman to catch fsck errors
remove setting nodes to 'disabled'; leave them in failboot (see the sketch after this list)
send exception and Unknown Error messages to config.exception_email
disable checksync in the automate-default.sh script
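For reference, a minimal sketch of how the renamed 'failboot' transitions behave; this is not the actual nodebad.py code — update_status() and its arguments are illustrative stand-ins for check_node_state() and the monitor record fields, and changed_greaterthan() stands in for the helper in monitor/common.py:

    from datetime import datetime, timedelta

    def changed_greaterthan(last_changed, days):
        # stand-in for monitor.common.changed_greaterthan()
        return datetime.now() - last_changed > timedelta(days=days)

    def update_status(node_state, boot_state, status, last_changed):
        # Nodes stuck in DEBUG become 'failboot' (formerly 'monitordebug')
        # unless an admin already put them in 'disabled' or 'safeboot'.
        if node_state == 'DEBUG' and status not in ['failboot', 'disabled', 'safeboot']:
            if boot_state not in ('disabled', 'safeboot'):
                status, last_changed = 'failboot', datetime.now()
            else:
                status, last_changed = boot_state, datetime.now()
        # Hysteresis: failboot -> down after 30 days; safeboot -> failboot after 60 days.
        if status == 'failboot' and changed_greaterthan(last_changed, 30):
            status = 'down'  # keep last_changed so the length of the outage is preserved
        if status == 'safeboot' and changed_greaterthan(last_changed, 60):
            status, last_changed = 'failboot', datetime.now()
        return status, last_changed

For example, update_status('DEBUG', 'boot', 'online', datetime.now()) returns ('failboot', <now>); the 30- and 60-day windows match the values in the nodebad.py hunk below.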

14 files changed:
automate-default.sh
monitor/bootman.py
monitor/common.py
monitor/database/info/history.py
monitor/database/info/interface.py
monitor/scanapi.py
monitor/wrapper/emailTxt.py
monitor/wrapper/plc.py
monitor/wrapper/plccache.py
nodebad.py
pcucontrol/models/BayTech.py
pcucontrol/models/DRAC.py
policy.py
upgrade/monitor-server-3.0-19.sql

index 8b18032..8d300a7 100755 (executable)
--- a/automate-default.sh
+++ b/automate-default.sh
@@ -62,7 +62,7 @@ fi
 #TODO: should add a call to ssh-add -l to check if the keys are loaded or not.
 source ${MONITOR_SCRIPT_ROOT}/agent.sh
 
-${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
+#${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
 ${MONITOR_SCRIPT_ROOT}/syncwithplc.py $DATE || :
 service plc restart monitor
 
@@ -75,7 +75,7 @@ ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || :
 ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || :
 
 ${MONITOR_SCRIPT_ROOT}/policy.py $DATE
-${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
+#${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
 service plc restart monitor
 curl -s 'http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview&formatcsv' > /var/lib/monitor/comon/$DATE.comon.csv
 
index 7bd0cb3..09be54f 100755 (executable)
--- a/monitor/bootman.py
+++ b/monitor/bootman.py
@@ -136,7 +136,7 @@ class NodeConnection:
                        print "Running MANUAL fsck on %s" % self.node
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                                  "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
-                                 "  fsck -v -f -y /dev/planetlab/vserver >> out.fsck 2>&1 ; " + \
+                                 "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
                                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                                  "  rm -f /tmp/BM_RUNNING " + \
                                  ") &" 
@@ -300,7 +300,7 @@ class PlanetLabSession:
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
-                               email_exception("%s rsync failed twice" % self.node)
+                               #email_exception("%s rsync failed twice" % self.node)
                                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 
                t1 = time.time()
@@ -435,6 +435,7 @@ class DebugInterface:
 
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
                                "bminit-cfg-auth-getplc-update-debug-done",
                                "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
@@ -471,6 +472,7 @@ class DebugInterface:
                                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                                "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_rins"})
 
@@ -512,7 +514,15 @@ class DebugInterface:
                # fsck_repair
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
                                  "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
-                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done"
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
                                ]:
                        sequences.update({n : "fsck_repair"})
 
@@ -529,6 +539,7 @@ class DebugInterface:
                for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
                                   "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
                                   "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+                                  "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
                                ]:
                        sequences.update({n : "nodenetwork_email"})
 
@@ -641,6 +652,7 @@ class DebugInterface:
                        ('implementerror', 'Implementation Error'),
                        ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
                        ('fsckfail'             , 'Running e2fsck -v -p /dev/planetlab/root failed'),
+                       ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
                        ('readonlyfs'   , '\[Errno 30\] Read-only file system'),
                        ('baddisk'      , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
                        ('noinstall'    , 'notinstalled'),
@@ -744,7 +756,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
 
                                log=conn.get_dmesg().read()
                                sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
-                               conn.set_nodestate('disabled')
+                               #conn.set_nodestate('disabled')
 
                        return False
 
@@ -869,7 +881,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                args['log'] = conn.get_dmesg().read()
 
                                sitehist.sendMessage('baddisk_notice', **args)
-                               conn.set_nodestate('disabled')
+                               #conn.set_nodestate('disabled')
 
                elif sequences[s] == "update_hardware_email":
                        if not found_within(recent_actions, 'minimalhardware_notice', 7):
index 6fca571..d3dc895 100644 (file)
--- a/monitor/common.py
+++ b/monitor/common.py
@@ -228,7 +228,7 @@ def email_exception(content=None, title=None):
         full_title = "exception running monitor %s" % title
 
     m=Message(full_title, msg, False)
-    m.send([config.cc_email])
+    m.send([config.exception_email])
     return
 
 def changed_lessthan(last_changed, days):
index 7190248..0abdebc 100644 (file)
--- a/monitor/database/info/history.py
+++ b/monitor/database/info/history.py
@@ -18,6 +18,7 @@ class HistoryNodeRecord(Entity):
        haspcu = Field(Boolean,default=False)
        firewall = Field(Boolean,default=False)
        plc_nodeid = Field(Int,default=1)
+       plc_siteid = Field(Int,default=1)
 
        acts_as_versioned(ignore=['last_changed', 'last_checked'])
 
index 0a1437b..d37ab2e 100644 (file)
--- a/monitor/database/info/interface.py
+++ b/monitor/database/info/interface.py
@@ -148,7 +148,7 @@ class SiteInterface(HistorySiteRecord):
                        m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)
 
                        if ccemail:
-                               contacts = [config.cc_email]
+                               contacts = [config.exception_email]
                        else:
                                contacts = self.getContacts()
 
index 22e3e74..d0ed72b 100644 (file)
--- a/monitor/scanapi.py
+++ b/monitor/scanapi.py
@@ -384,6 +384,11 @@ EOF                        """)
                                values['observed_status'] = 'DOWN'
                                values['firewall'] = True
 
+                       #if   values['port_status']['22']  == "open" and \
+                       #        values['port_status']['80']  == "closed" and \
+                       #        values['port_status']['806'] == "open" :
+                       #       email_exception("%s port 80 blocked" % nodename, "possible VSERVER ref blocked")
+
                #if not values['external_dns_status']:
                #       email_exception("%s DNS down" % nodename)
 
index 3afbe7b..8cd5bdc 100644 (file)
--- a/monitor/wrapper/emailTxt.py
+++ b/monitor/wrapper/emailTxt.py
@@ -71,7 +71,7 @@ We need your help resolving this issue in a few ways:
     that there is a problem with the PCU configuration, we can help you
     resolve that independently.
 
- 2. If it is possible, please correcct the above PCU problem, or let us know
+ 2. If it is possible, please correct the above PCU problem, or let us know
     what steps you are taking.  By enabling us to take administrative actions
     automatically without your intervention, you will save time in the future 
     the next time we need to reboot this machine, because we will be able to 
@@ -125,15 +125,15 @@ This notice is simply to let you know that:
     %(hostname)s
 
 has some ports that appear to be blocked, making the node unusable.  While
-some ports are open, to be a fully functional node, all ports need to be
-accessible at all times.  Please see the following for the list of
-requirements for hosting a node:
+some ports are open, a fully functional node needs all ports accessible at all
+times.  Please see the following for the list of requirements for hosting a
+node:
 
     http://www.planet-lab.org/hosting
 
-The node will be considered 'DOWN' until the ports are unblocked.
+We will consider the node 'DOWN' until the ports are unblocked.
 
-Please investigate, and let us know if there's anything we can do to help get
+Please investigate and let us know if there's anything we can do to help get
 it back on-line.  You can see more information about the current status of
 this host here:
 
index 1515396..62de999 100644 (file)
--- a/monitor/wrapper/plc.py
+++ b/monitor/wrapper/plc.py
@@ -142,8 +142,11 @@ def getNodeAPI(session):
        nodeauth = Auth(session=session)
        return PLC(nodeauth.auth, auth.server)
 
-def getAuthAPI():
-       return PLC(auth.auth, auth.server)
+def getAuthAPI(url=None):
+       if url:
+               return PLC(auth.auth, url)
+       else:
+               return PLC(auth.auth, auth.server)
 
 def getCachedAuthAPI():
        return CachedPLC(auth.auth, auth.server)
index ac23f1b..78e0500 100755 (executable)
--- a/monitor/wrapper/plccache.py
+++ b/monitor/wrapper/plccache.py
@@ -148,7 +148,7 @@ def sync():
                                                'longitude', 'max_slices', 'slice_ids', 'node_ids', 
                                                'enabled', 'date_created' ])
        l_nodes = plc.api.GetNodes({'peer_id':None}, 
-                                               ['hostname', 'node_id', 'ports', 'site_id', 'boot_state',
+                                               ['hostname', 'node_id', 'ports', 'site_id', 'boot_state', 'run_level',
                                                 'version', 'last_updated', 'date_created', 'key',
                                                 'last_contact', 'pcu_ids', 'interface_ids'])
        l_pcus = plc.api.GetPCUs()
index 9ba6a32..acd5007 100755 (executable)
--- a/nodebad.py
+++ b/nodebad.py
@@ -40,6 +40,7 @@ def check_node_state(rec, node):
        if rec.plc_node_stats:
                print rec.plc_node_stats
                boot_state = rec.plc_node_stats['boot_state']
+               run_level = rec.plc_node_stats['run_level']
                last_contact = rec.plc_node_stats['last_contact']
                node.plc_nodeid = rec.plc_node_stats['node_id']
        else:
@@ -55,13 +56,12 @@ def check_node_state(rec, node):
                node.haspcu = False
 
        node.firewall = rec.firewall
-
+       node.plc_siteid = rec.plc_node_stats['site_id']
 
        # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
        #                       'translations' into the node.status state
        #               'BOOT' is a permanent state, but we want it to have a bit of
        #                       hysteresis (less than 0.5 days)
-
        #################################################################
        # "Initialize" the findbad states into nodebad status if they are not already set
 
@@ -80,24 +80,10 @@ def check_node_state(rec, node):
                        node.status = 'offline'
                        node.last_changed = datetime.now()
                        
-
-       #if node_state == 'DOWN' and node.status not in ['offline', 'down', 'disabled']:
-       #       if boot_state != 'disabled':
-       #               print "changed status from %s to offline" % node.status
-       #               node.status = 'offline'
-       #               node.last_changed = datetime.now()
-       #       else:
-       #               print "changed status from %s to %s" % (node.status, boot_state)
-       #               node.status = boot_state
-       #               node.last_changed = datetime.now()
-
-       if node_state == 'DEBUG' and node.status != 'monitordebug' and \
-                                                                node.status != 'disabled' and \
-                                                                node.status != 'safeboot':
+       if node_state == 'DEBUG' and node.status not in ['failboot', 'disabled', 'safeboot']:
                if boot_state != 'disabled' and boot_state != 'safeboot':
-
-                       print "changed status from %s to monitordebug" % (node.status)
-                       node.status = "monitordebug"
+                       print "changed status from %s to failboot" % (node.status)
+                       node.status = "failboot"
                        node.last_changed = datetime.now()
                else:
                        print "changed status from %s to %s" % (node.status, boot_state)
@@ -113,8 +99,8 @@ def check_node_state(rec, node):
        # Switch temporary hystersis states into their 'firm' states.
        #         online -> good                after half a day
        #         offline -> down               after two days
-       #         monitordebug -> down  after 30 days
-       #         safeboot -> monitordebug after 60 days
+       #         failboot -> down  after 30 days
+       #         safeboot -> failboot after 60 days
        #         disabled -> down              after 60 days
 
        if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
@@ -127,15 +113,15 @@ def check_node_state(rec, node):
                node.status = 'down'
                # NOTE: do not reset last_changed, or you lose how long it's been down.
 
-       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+       if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
                print "changed status from %s to down" % node.status
                node.status = 'down'
                # NOTE: do not reset last_changed, or you lose how long it's been down.
 
        if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
                print "changed status from %s to down" % node.status
-               # NOTE: change an admin mode back into monitordebug after two months.
-               node.status = 'monitordebug'
+               # NOTE: change an admin mode back into failboot after two months.
+               node.status = 'failboot'
                node.last_changed = datetime.now()
 
        # extreme cases of offline nodes
index 1b55228..b4bf71d 100644 (file)
--- a/pcucontrol/models/BayTech.py
+++ b/pcucontrol/models/BayTech.py
@@ -22,6 +22,31 @@ class BayTechRPC3NC(PCUControl):
                self.transport.close()
                return 0
 
+class BayTechGeorgeTown(PCUControl):
+       supported_ports = [22,23]
+       def run_telnet(self, node_port, dryrun):
+               return self.run_ssh(node_port, dryrun)
+       def run_ssh(self, node_port, dryrun):
+               # NOTE: The georgetown pcu always drops the first connection, 
+               self.transport.open(self.host, self.username, None, "Enter user name:")
+               self.transport.close()
+               time.sleep(1)
+               self.transport.open(self.host, self.username, None, "Enter user name:")
+               self.transport.sendPassword(self.password, "Enter Password:")
+
+               self.transport.ifThenSend("RPC-16>", "Reboot %d" % node_port)
+
+               # Reboot Outlet  N        (Y/N)?
+               if dryrun:
+                       self.transport.ifThenSend("(Y/N)?", "N")
+               else:
+                       self.transport.ifThenSend("(Y/N)?", "Y")
+               self.transport.ifThenSend("RPC-16>", "")
+
+               self.transport.close()
+               return 0
+
+
 class BayTechRPC16(PCUControl):
        supported_ports = [22,23]
        def run_telnet(self, node_port, dryrun):
index 712cf19..898b5e5 100644 (file)
--- a/pcucontrol/models/DRAC.py
+++ b/pcucontrol/models/DRAC.py
@@ -25,7 +25,7 @@ class DRAC(PCUControl):
                        # Testing Reboot ?
                        #index = s.expect(["DRAC 5", "[%s]#" % self.username ])
                        # NOTE: be careful to escape any characters used by 're.compile'
-                       index = s.expect(["\$", "\[%s\]#" % self.username ])
+                       index = s.expect(["\$", "\[%s\]#" % self.username, "/.*>" ])
                        print "INDEX:", index
                        print s
                        if dryrun:
@@ -33,12 +33,16 @@ class DRAC(PCUControl):
                                        s.sendline("racadm getsysinfo")
                                elif index == 1:
                                        s.sendline("getsysinfo")
+                               elif index == 2:
+                                       s.sendline("racadm getsysinfo")
                        else:
                                print "serveraction powercycle"
                                if index == 0:
                                        s.sendline("racadm serveraction powercycle")
                                elif index == 1:
                                        s.sendline("serveraction powercycle")
+                               elif index == 2:
+                                       s.sendline("racadm serveraction powercycle")
                                
                        # TODO:  this is really lousy.  Without the sleep, the sendlines
                        # don't completely get through.  Even the added, expect line
@@ -47,7 +51,7 @@ class DRAC(PCUControl):
                        # other context...
                        s.send("\r\n\r\n")
                        time.sleep(20)
-                       index = s.expect(["\$", "\[%s\]#" % self.username ])
+                       index = s.expect(["\$", "\[%s\]#" % self.username, "/.*>" ])
                        print s
                        print "INDEX 2:", index
                        s.sendline("exit")
index fe54863..77cf76e 100755 (executable)
--- a/policy.py
+++ b/policy.py
@@ -91,6 +91,7 @@ def main(hostnames, sitenames):
                # if it is offline and HAS a PCU, then try to use it.
                if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
                        changed_greaterthan(nodehist.last_changed,1.0) and \
+                       not nodehist.firewall and \
                        not found_between(recent_actions, 'try_reboot', 3.5, 1):
 
                                sitehist.attemptReboot(host)
@@ -100,6 +101,7 @@ def main(hostnames, sitenames):
                #               will be false for a day after the above condition is satisfied
                if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
                        changed_greaterthan(nodehist.last_changed,1.5) and \
+                       not nodehist.firewall and \
                        found_between(recent_actions, 'try_reboot', 3.5, 1) and \
                        not found_within(recent_actions, 'pcufailed_notice', 3.5):
                                
@@ -108,7 +110,7 @@ def main(hostnames, sitenames):
                                sitehist.sendMessage('pcufailed_notice', hostname=host)
                                print "send message for host %s PCU Failure" % host
 
-               if nodehist.status == 'monitordebug' and \
+               if nodehist.status == 'failboot' and \
                        changed_greaterthan(nodehist.last_changed, 1) and \
                        not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
                                # send down node notice
@@ -127,7 +129,7 @@ def main(hostnames, sitenames):
 
                                if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
                                        # send down node notice
-                                       email_exception(host, "firewall_notice")
+                                       #email_exception(host, "firewall_notice")
                                        sitehist.sendMessage('firewall_notice', hostname=host)
                                        print "send message for host %s down" % host
 
index 77e304a..e18934b 100644 (file)
--- a/upgrade/monitor-server-3.0-19.sql
+++ b/upgrade/monitor-server-3.0-19.sql
@@ -19,3 +19,5 @@ ALTER TABLE findbadnoderecord_history ADD COLUMN traceroute varchar DEFAULT NULL
 ALTER TABLE historynoderecord ADD COLUMN firewall boolean DEFAULT false;
 ALTER TABLE historynoderecord_history ADD COLUMN firewall boolean DEFAULT false;
 
+ALTER TABLE historynoderecord ADD COLUMN plc_siteid integer DEFAULT 1;
+ALTER TABLE historynoderecord_history ADD COLUMN plc_siteid integer DEFAULT 1;