changed 'monitordebug' to 'failboot'
author    Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 17 Aug 2009 22:03:47 +0000 (22:03 +0000)
committer Stephen Soltesz <soltesz@cs.princeton.edu>
Mon, 17 Aug 2009 22:03:47 +0000 (22:03 +0000)
added run_level to nodebad record
added site_id to database
added DRAC6 support
added a BayTechGeorgeTown class to BayTech to catch Georgetown's wacky PCU
add extra sequences to bootman to catch fsck errors
remove setting nodes to 'disabled'; leave them in failboot (see the sketch after this list)
send exception and Unknown Error messages to config.exception_email
disable checksync in the automate-default.sh script
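For reference, a minimal sketch of how the renamed 'failboot' transitions behave; this is not the actual nodebad.py code — update_status() and its arguments are illustrative stand-ins for check_node_state() and the monitor record fields, and changed_greaterthan() stands in for the helper in monitor/common.py:

    from datetime import datetime, timedelta

    def changed_greaterthan(last_changed, days):
        # stand-in for monitor.common.changed_greaterthan()
        return datetime.now() - last_changed > timedelta(days=days)

    def update_status(node_state, boot_state, status, last_changed):
        # Nodes stuck in DEBUG become 'failboot' (formerly 'monitordebug')
        # unless an admin already put them in 'disabled' or 'safeboot'.
        if node_state == 'DEBUG' and status not in ['failboot', 'disabled', 'safeboot']:
            if boot_state not in ('disabled', 'safeboot'):
                status, last_changed = 'failboot', datetime.now()
            else:
                status, last_changed = boot_state, datetime.now()
        # Hysteresis: failboot -> down after 30 days; safeboot -> failboot after 60 days.
        if status == 'failboot' and changed_greaterthan(last_changed, 30):
            status = 'down'  # keep last_changed so the length of the outage is preserved
        if status == 'safeboot' and changed_greaterthan(last_changed, 60):
            status, last_changed = 'failboot', datetime.now()
        return status, last_changed

For example, update_status('DEBUG', 'boot', 'online', datetime.now()) returns ('failboot', <now>); the 30- and 60-day windows match the values in the nodebad.py hunk below.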

14 files changed:
automate-default.sh
monitor/bootman.py
monitor/common.py
monitor/database/info/history.py
monitor/database/info/interface.py
monitor/scanapi.py
monitor/wrapper/emailTxt.py
monitor/wrapper/plc.py
monitor/wrapper/plccache.py
nodebad.py
pcucontrol/models/BayTech.py
pcucontrol/models/DRAC.py
policy.py
upgrade/monitor-server-3.0-19.sql

index 8b18032..8d300a7 100755 (executable)
--- a/automate-default.sh
+++ b/automate-default.sh
@@ -62,7 +62,7 @@ fi
 #TODO: should add a call to ssh-add -l to check if the keys are loaded or not.
 source ${MONITOR_SCRIPT_ROOT}/agent.sh
 
-${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
+#${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
 ${MONITOR_SCRIPT_ROOT}/syncwithplc.py $DATE || :
 service plc restart monitor
 
@@ -75,7 +75,7 @@ ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || :
 ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || :
 
 ${MONITOR_SCRIPT_ROOT}/policy.py $DATE
-${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
+#${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
 service plc restart monitor
 curl -s 'http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview&formatcsv' > /var/lib/monitor/comon/$DATE.comon.csv
 
index 7bd0cb3..09be54f 100755 (executable)
--- a/monitor/bootman.py
+++ b/monitor/bootman.py
@@ -136,7 +136,7 @@ class NodeConnection:
                        print "Running MANUAL fsck on %s" % self.node
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                                  "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
-                                 "  fsck -v -f -y /dev/planetlab/vserver >> out.fsck 2>&1 ; " + \
+                                 "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
                                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                                  "  rm -f /tmp/BM_RUNNING " + \
                                  ") &" 
@@ -300,7 +300,7 @@ class PlanetLabSession:
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
-                               email_exception("%s rsync failed twice" % self.node)
+                               #email_exception("%s rsync failed twice" % self.node)
                                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 
                t1 = time.time()
@@ -435,6 +435,7 @@ class DebugInterface:
 
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+                               "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
                                "bminit-cfg-auth-getplc-update-debug-done",
                                "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                                "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
@@ -471,6 +472,7 @@ class DebugInterface:
                                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
                                "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
                                "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+                               "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
                                ]:
                        sequences.update({n : "restart_bootmanager_rins"})
 
@@ -512,7 +514,15 @@ class DebugInterface:
                # fsck_repair
                for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
                                  "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
-                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done"
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
+                                 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
+                                 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
                                ]:
                        sequences.update({n : "fsck_repair"})
 
@@ -529,6 +539,7 @@ class DebugInterface:
                for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
                                   "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
                                   "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+                                  "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
                                ]:
                        sequences.update({n : "nodenetwork_email"})
 
@@ -641,6 +652,7 @@ class DebugInterface:
                        ('implementerror', 'Implementation Error'),
                        ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
                        ('fsckfail'             , 'Running e2fsck -v -p /dev/planetlab/root failed'),
+                       ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
                        ('readonlyfs'   , '\[Errno 30\] Read-only file system'),
                        ('baddisk'      , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
                        ('noinstall'    , 'notinstalled'),
@@ -744,7 +756,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
 
                                log=conn.get_dmesg().read()
                                sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
-                               conn.set_nodestate('disabled')
+                               #conn.set_nodestate('disabled')
 
                        return False
 
@@ -869,7 +881,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None):
                                args['log'] = conn.get_dmesg().read()
 
                                sitehist.sendMessage('baddisk_notice', **args)
-                               conn.set_nodestate('disabled')
+                               #conn.set_nodestate('disabled')
 
                elif sequences[s] == "update_hardware_email":
                        if not found_within(recent_actions, 'minimalhardware_notice', 7):
index 6fca571..d3dc895 100644 (file)
--- a/monitor/common.py
+++ b/monitor/common.py
@@ -228,7 +228,7 @@ def email_exception(content=None, title=None):
         full_title = "exception running monitor %s" % title
 
     m=Message(full_title, msg, False)
-    m.send([config.cc_email])
+    m.send([config.exception_email])
     return
 
 def changed_lessthan(last_changed, days):
index 7190248..0abdebc 100644 (file)
--- a/monitor/database/info/history.py
+++ b/monitor/database/info/history.py
@@ -18,6 +18,7 @@ class HistoryNodeRecord(Entity):
        haspcu = Field(Boolean,default=False)
        firewall = Field(Boolean,default=False)
        plc_nodeid = Field(Int,default=1)
+       plc_siteid = Field(Int,default=1)
 
        acts_as_versioned(ignore=['last_changed', 'last_checked'])
 
index 0a1437b..d37ab2e 100644 (file)
--- a/monitor/database/info/interface.py
+++ b/monitor/database/info/interface.py
@@ -148,7 +148,7 @@ class SiteInterface(HistorySiteRecord):
                        m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)
 
                        if ccemail:
-                               contacts = [config.cc_email]
+                               contacts = [config.exception_email]
                        else:
                                contacts = self.getContacts()
 
index 22e3e74..d0ed72b 100644 (file)
--- a/monitor/scanapi.py
+++ b/monitor/scanapi.py
@@ -384,6 +384,11 @@ EOF                        """)
                                values['observed_status'] = 'DOWN'
                                values['firewall'] = True
 
+                       #if   values['port_status']['22']  == "open" and \
+                       #        values['port_status']['80']  == "closed" and \
+                       #        values['port_status']['806'] == "open" :
+                       #       email_exception("%s port 80 blocked" % nodename, "possible VSERVER ref blocked")
+
                #if not values['external_dns_status']:
                #       email_exception("%s DNS down" % nodename)
 
index 3afbe7b..8cd5bdc 100644 (file)
--- a/monitor/wrapper/emailTxt.py
+++ b/monitor/wrapper/emailTxt.py
@@ -71,7 +71,7 @@ We need your help resolving this issue in a few ways:
     that there is a problem with the PCU configuration, we can help you
     resolve that independently.
 
- 2. If it is possible, please correcct the above PCU problem, or let us know
+ 2. If it is possible, please correct the above PCU problem, or let us know
     what steps you are taking.  By enabling us to take administrative actions
     automatically without your intervention, you will save time in the future 
     the next time we need to reboot this machine, because we will be able to 
@@ -125,15 +125,15 @@ This notice is simply to let you know that:
     %(hostname)s
 
 has some ports that appear to be blocked, making the node unusable.  While
-some ports are open, to be a fully functional node, all ports need to be
-accessible at all times.  Please see the following for the list of
-requirements for hosting a node:
+some ports are open, a fully functional node needs all ports accessible at all
+times.  Please see the following for the list of requirements for hosting a
+node:
 
     http://www.planet-lab.org/hosting
 
-The node will be considered 'DOWN' until the ports are unblocked.
+We will consider the node 'DOWN' until the ports are unblocked.
 
-Please investigate, and let us know if there's anything we can do to help get
+Please investigate and let us know if there's anything we can do to help get
 it back on-line.  You can see more information about the current status of
 this host here:
 
index 1515396..62de999 100644 (file)
--- a/monitor/wrapper/plc.py
+++ b/monitor/wrapper/plc.py
@@ -142,8 +142,11 @@ def getNodeAPI(session):
        nodeauth = Auth(session=session)
        return PLC(nodeauth.auth, auth.server)
 
-def getAuthAPI():
-       return PLC(auth.auth, auth.server)
+def getAuthAPI(url=None):
+       if url:
+               return PLC(auth.auth, url)
+       else:
+               return PLC(auth.auth, auth.server)
 
 def getCachedAuthAPI():
        return CachedPLC(auth.auth, auth.server)
index ac23f1b..78e0500 100755 (executable)
--- a/monitor/wrapper/plccache.py
+++ b/monitor/wrapper/plccache.py
@@ -148,7 +148,7 @@ def sync():
                                                'longitude', 'max_slices', 'slice_ids', 'node_ids', 
                                                'enabled', 'date_created' ])
        l_nodes = plc.api.GetNodes({'peer_id':None}, 
-                                               ['hostname', 'node_id', 'ports', 'site_id', 'boot_state',
+                                               ['hostname', 'node_id', 'ports', 'site_id', 'boot_state', 'run_level',
                                                 'version', 'last_updated', 'date_created', 'key',
                                                 'last_contact', 'pcu_ids', 'interface_ids'])
        l_pcus = plc.api.GetPCUs()
index 9ba6a32..acd5007 100755 (executable)
--- a/nodebad.py
+++ b/nodebad.py
@@ -40,6 +40,7 @@ def check_node_state(rec, node):
        if rec.plc_node_stats:
                print rec.plc_node_stats
                boot_state = rec.plc_node_stats['boot_state']
+               run_level = rec.plc_node_stats['run_level']
                last_contact = rec.plc_node_stats['last_contact']
                node.plc_nodeid = rec.plc_node_stats['node_id']
        else:
@@ -55,13 +56,12 @@ def check_node_state(rec, node):
                node.haspcu = False
 
        node.firewall = rec.firewall
-
+       node.plc_siteid = rec.plc_node_stats['site_id']
 
        # NOTE: 'DOWN' and 'DEBUG'  are temporary states, so only need
        #                       'translations' into the node.status state
        #               'BOOT' is a permanent state, but we want it to have a bit of
        #                       hysteresis (less than 0.5 days)
-
        #################################################################
        # "Initialize" the findbad states into nodebad status if they are not already set
 
@@ -80,24 +80,10 @@ def check_node_state(rec, node):
                        node.status = 'offline'
                        node.last_changed = datetime.now()
                        
-
-       #if node_state == 'DOWN' and node.status not in ['offline', 'down', 'disabled']:
-       #       if boot_state != 'disabled':
-       #               print "changed status from %s to offline" % node.status
-       #               node.status = 'offline'
-       #               node.last_changed = datetime.now()
-       #       else:
-       #               print "changed status from %s to %s" % (node.status, boot_state)
-       #               node.status = boot_state
-       #               node.last_changed = datetime.now()
-
-       if node_state == 'DEBUG' and node.status != 'monitordebug' and \
-                                                                node.status != 'disabled' and \
-                                                                node.status != 'safeboot':
+       if node_state == 'DEBUG' and node.status not in ['failboot', 'disabled', 'safeboot']:
                if boot_state != 'disabled' and boot_state != 'safeboot':
-
-                       print "changed status from %s to monitordebug" % (node.status)
-                       node.status = "monitordebug"
+                       print "changed status from %s to failboot" % (node.status)
+                       node.status = "failboot"
                        node.last_changed = datetime.now()
                else:
                        print "changed status from %s to %s" % (node.status, boot_state)
@@ -113,8 +99,8 @@ def check_node_state(rec, node):
        # Switch temporary hystersis states into their 'firm' states.
        #         online -> good                after half a day
        #         offline -> down               after two days
-       #         monitordebug -> down  after 30 days
-       #         safeboot -> monitordebug after 60 days
+       #         failboot -> down  after 30 days
+       #         safeboot -> failboot after 60 days
        #         disabled -> down              after 60 days
 
        if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
@@ -127,15 +113,15 @@ def check_node_state(rec, node):
                node.status = 'down'
                # NOTE: do not reset last_changed, or you lose how long it's been down.
 
-       if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+       if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
                print "changed status from %s to down" % node.status
                node.status = 'down'
                # NOTE: do not reset last_changed, or you lose how long it's been down.
 
        if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
                print "changed status from %s to down" % node.status
-               # NOTE: change an admin mode back into monitordebug after two months.
-               node.status = 'monitordebug'
+               # NOTE: change an admin mode back into failboot after two months.
+               node.status = 'failboot'
                node.last_changed = datetime.now()
 
        # extreme cases of offline nodes
index 1b55228..b4bf71d 100644 (file)
--- a/pcucontrol/models/BayTech.py
+++ b/pcucontrol/models/BayTech.py
@@ -22,6 +22,31 @@ class BayTechRPC3NC(PCUControl):
                self.transport.close()
                return 0
 
+class BayTechGeorgeTown(PCUControl):
+       supported_ports = [22,23]
+       def run_telnet(self, node_port, dryrun):
+               return self.run_ssh(node_port, dryrun)
+       def run_ssh(self, node_port, dryrun):
+               # NOTE: The georgetown pcu always drops the first connection, 
+               self.transport.open(self.host, self.username, None, "Enter user name:")
+               self.transport.close()
+               time.sleep(1)
+               self.transport.open(self.host, self.username, None, "Enter user name:")
+               self.transport.sendPassword(self.password, "Enter Password:")
+
+               self.transport.ifThenSend("RPC-16>", "Reboot %d" % node_port)
+
+               # Reboot Outlet  N        (Y/N)?
+               if dryrun:
+                       self.transport.ifThenSend("(Y/N)?", "N")
+               else:
+                       self.transport.ifThenSend("(Y/N)?", "Y")
+               self.transport.ifThenSend("RPC-16>", "")
+
+               self.transport.close()
+               return 0
+
+
 class BayTechRPC16(PCUControl):
        supported_ports = [22,23]
        def run_telnet(self, node_port, dryrun):
index 712cf19..898b5e5 100644 (file)
--- a/pcucontrol/models/DRAC.py
+++ b/pcucontrol/models/DRAC.py
@@ -25,7 +25,7 @@ class DRAC(PCUControl):
                        # Testing Reboot ?
                        #index = s.expect(["DRAC 5", "[%s]#" % self.username ])
                        # NOTE: be careful to escape any characters used by 're.compile'
-                       index = s.expect(["\$", "\[%s\]#" % self.username ])
+                       index = s.expect(["\$", "\[%s\]#" % self.username, "/.*>" ])
                        print "INDEX:", index
                        print s
                        if dryrun:
@@ -33,12 +33,16 @@ class DRAC(PCUControl):
                                        s.sendline("racadm getsysinfo")
                                elif index == 1:
                                        s.sendline("getsysinfo")
+                               elif index == 2:
+                                       s.sendline("racadm getsysinfo")
                        else:
                                print "serveraction powercycle"
                                if index == 0:
                                        s.sendline("racadm serveraction powercycle")
                                elif index == 1:
                                        s.sendline("serveraction powercycle")
+                               elif index == 2:
+                                       s.sendline("racadm serveraction powercycle")
                                
                        # TODO:  this is really lousy.  Without the sleep, the sendlines
                        # don't completely get through.  Even the added, expect line
@@ -47,7 +51,7 @@ class DRAC(PCUControl):
                        # other context...
                        s.send("\r\n\r\n")
                        time.sleep(20)
-                       index = s.expect(["\$", "\[%s\]#" % self.username ])
+                       index = s.expect(["\$", "\[%s\]#" % self.username, "/.*>" ])
                        print s
                        print "INDEX 2:", index
                        s.sendline("exit")
index fe54863..77cf76e 100755 (executable)
--- a/policy.py
+++ b/policy.py
@@ -91,6 +91,7 @@ def main(hostnames, sitenames):
                # if it is offline and HAS a PCU, then try to use it.
                if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
                        changed_greaterthan(nodehist.last_changed,1.0) and \
+                       not nodehist.firewall and \
                        not found_between(recent_actions, 'try_reboot', 3.5, 1):
 
                                sitehist.attemptReboot(host)
@@ -100,6 +101,7 @@ def main(hostnames, sitenames):
                #               will be false for a day after the above condition is satisfied
                if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
                        changed_greaterthan(nodehist.last_changed,1.5) and \
+                       not nodehist.firewall and \
                        found_between(recent_actions, 'try_reboot', 3.5, 1) and \
                        not found_within(recent_actions, 'pcufailed_notice', 3.5):
                                
@@ -108,7 +110,7 @@ def main(hostnames, sitenames):
                                sitehist.sendMessage('pcufailed_notice', hostname=host)
                                print "send message for host %s PCU Failure" % host
 
-               if nodehist.status == 'monitordebug' and \
+               if nodehist.status == 'failboot' and \
                        changed_greaterthan(nodehist.last_changed, 1) and \
                        not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
                                # send down node notice
@@ -127,7 +129,7 @@ def main(hostnames, sitenames):
 
                                if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
                                        # send down node notice
-                                       email_exception(host, "firewall_notice")
+                                       #email_exception(host, "firewall_notice")
                                        sitehist.sendMessage('firewall_notice', hostname=host)
                                        print "send message for host %s down" % host
 
index 77e304a..e18934b 100644 (file)
--- a/upgrade/monitor-server-3.0-19.sql
+++ b/upgrade/monitor-server-3.0-19.sql
@@ -19,3 +19,5 @@ ALTER TABLE findbadnoderecord_history ADD COLUMN traceroute varchar DEFAULT NULL
 ALTER TABLE historynoderecord ADD COLUMN firewall boolean DEFAULT false;
 ALTER TABLE historynoderecord_history ADD COLUMN firewall boolean DEFAULT false;
 
+ALTER TABLE historynoderecord ADD COLUMN plc_siteid integer DEFAULT 1;
+ALTER TABLE historynoderecord_history ADD COLUMN plc_siteid integer DEFAULT 1;