From 40884a302bf204a7f42044b72d87f9431ad6dd35 Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Mon, 17 Aug 2009 22:03:47 +0000 Subject: [PATCH] changed 'monitordebug' to failboot added run_level to nodebad record added site_id to database added DRAC6 support added GeorgeTown PCU to BayTech to catch their wacky PCU. add extra sequences to bootman to catch fsck error remove setting nodes to 'disabled', leave them in failboot send exception and Unknown Error messages to config.exception_email disable synccheck in automate-default.sh script --- automate-default.sh | 4 ++-- monitor/bootman.py | 22 ++++++++++++++----- monitor/common.py | 2 +- monitor/database/info/history.py | 1 + monitor/database/info/interface.py | 2 +- monitor/scanapi.py | 5 +++++ monitor/wrapper/emailTxt.py | 12 +++++------ monitor/wrapper/plc.py | 7 ++++-- monitor/wrapper/plccache.py | 2 +- nodebad.py | 34 +++++++++--------------------- pcucontrol/models/BayTech.py | 25 ++++++++++++++++++++++ pcucontrol/models/DRAC.py | 8 +++++-- policy.py | 6 ++++-- upgrade/monitor-server-3.0-19.sql | 2 ++ 14 files changed, 86 insertions(+), 46 deletions(-) diff --git a/automate-default.sh b/automate-default.sh index 8b18032..8d300a7 100755 --- a/automate-default.sh +++ b/automate-default.sh @@ -62,7 +62,7 @@ fi #TODO: should add a call to ssh-add -l to check if the keys are loaded or not. 
source ${MONITOR_SCRIPT_ROOT}/agent.sh -${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || : +#${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || : ${MONITOR_SCRIPT_ROOT}/syncwithplc.py $DATE || : service plc restart monitor @@ -75,7 +75,7 @@ ps ax | grep BatchMode | grep -v grep | awk '{print $1}' | xargs -r kill || : ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || : ${MONITOR_SCRIPT_ROOT}/policy.py $DATE -${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || : +#${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || : service plc restart monitor curl -s 'http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview&formatcsv' > /var/lib/monitor/comon/$DATE.comon.csv diff --git a/monitor/bootman.py b/monitor/bootman.py index 7bd0cb3..09be54f 100755 --- a/monitor/bootman.py +++ b/monitor/bootman.py @@ -136,7 +136,7 @@ class NodeConnection: print "Running MANUAL fsck on %s" % self.node cmd = "( touch /tmp/BM_RUNNING ; " + \ " fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \ - " fsck -v -f -y /dev/planetlab/vserver >> out.fsck 2>&1 ; " + \ + " fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \ " python ./BootManager.py %s &> server.log < /dev/null ; " + \ " rm -f /tmp/BM_RUNNING " + \ ") &" @@ -300,7 +300,7 @@ class PlanetLabSession: print ret if ret != 0: print "\tFAILED TWICE" - email_exception("%s rsync failed twice" % self.node) + #email_exception("%s rsync failed twice" % self.node) raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key") t1 = time.time() @@ -435,6 +435,7 @@ class DebugInterface: "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done", "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done", + "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done", "bminit-cfg-auth-getplc-update-debug-done", 
"bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done", "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done", @@ -471,6 +472,7 @@ class DebugInterface: "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done", "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done", "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done", + "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done", ]: sequences.update({n : "restart_bootmanager_rins"}) @@ -512,7 +514,15 @@ class DebugInterface: # fsck_repair for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done", "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done", - "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done" + "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done", + 
"bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done", + "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done", + "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done", ]: sequences.update({n : "fsck_repair"}) @@ -529,6 +539,7 @@ class DebugInterface: for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done", "bminit-cfg-update-exception-nodehostname-update-debug-done", + "bminit-cfg-exception-nodehostname-debug-validate-exception-done", ]: sequences.update({n : "nodenetwork_email"}) @@ -641,6 +652,7 @@ class DebugInterface: ('implementerror', 'Implementation Error'), ('fsckabort' , 'is mounted. e2fsck: Cannot continue, aborting'), ('fsckfail' , 'Running e2fsck -v -p /dev/planetlab/root failed'), + ('fsckfail2' , 'Running e2fsck -v -p /dev/planetlab/vservers failed'), ('readonlyfs' , '\[Errno 30\] Read-only file system'), ('baddisk' , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"), ('noinstall' , 'notinstalled'), @@ -744,7 +756,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None): log=conn.get_dmesg().read() sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log) - conn.set_nodestate('disabled') + #conn.set_nodestate('disabled') return False @@ -869,7 +881,7 @@ def restore_basic(sitehist, hostname, config=None, forced_action=None): args['log'] = conn.get_dmesg().read() sitehist.sendMessage('baddisk_notice', **args) - conn.set_nodestate('disabled') + #conn.set_nodestate('disabled') elif sequences[s] == "update_hardware_email": if not found_within(recent_actions, 'minimalhardware_notice', 7): diff --git a/monitor/common.py b/monitor/common.py index 6fca571..d3dc895 100644 --- a/monitor/common.py +++ b/monitor/common.py @@ -228,7 +228,7 @@ def 
email_exception(content=None, title=None): full_title = "exception running monitor %s" % title m=Message(full_title, msg, False) - m.send([config.cc_email]) + m.send([config.exception_email]) return def changed_lessthan(last_changed, days): diff --git a/monitor/database/info/history.py b/monitor/database/info/history.py index 7190248..0abdebc 100644 --- a/monitor/database/info/history.py +++ b/monitor/database/info/history.py @@ -18,6 +18,7 @@ class HistoryNodeRecord(Entity): haspcu = Field(Boolean,default=False) firewall = Field(Boolean,default=False) plc_nodeid = Field(Int,default=1) + plc_siteid = Field(Int,default=1) acts_as_versioned(ignore=['last_changed', 'last_checked']) diff --git a/monitor/database/info/interface.py b/monitor/database/info/interface.py index 0a1437b..d37ab2e 100644 --- a/monitor/database/info/interface.py +++ b/monitor/database/info/interface.py @@ -148,7 +148,7 @@ class SiteInterface(HistorySiteRecord): m = Message(message[0] % args, message[1] % args, viart, self.db.message_id) if ccemail: - contacts = [config.cc_email] + contacts = [config.exception_email] else: contacts = self.getContacts() diff --git a/monitor/scanapi.py b/monitor/scanapi.py index 22e3e74..d0ed72b 100644 --- a/monitor/scanapi.py +++ b/monitor/scanapi.py @@ -384,6 +384,11 @@ EOF """) values['observed_status'] = 'DOWN' values['firewall'] = True + #if values['port_status']['22'] == "open" and \ + # values['port_status']['80'] == "closed" and \ + # values['port_status']['806'] == "open" : + # email_exception("%s port 80 blocked" % nodename, "possible VSERVER ref blocked") + #if not values['external_dns_status']: # email_exception("%s DNS down" % nodename) diff --git a/monitor/wrapper/emailTxt.py b/monitor/wrapper/emailTxt.py index 3afbe7b..8cd5bdc 100644 --- a/monitor/wrapper/emailTxt.py +++ b/monitor/wrapper/emailTxt.py @@ -71,7 +71,7 @@ We need your help resolving this issue in a few ways: that there is a problem with the PCU configuration, we can help you resolve that 
independently. - 2. If it is possible, please correcct the above PCU problem, or let us know + 2. If it is possible, please correct the above PCU problem, or let us know what steps you are taking. By enabling us to take administrative actions automatically without your intervention, you will save time in the future the next time we need to reboot this machine, because we will be able to @@ -125,15 +125,15 @@ This notice is simply to let you know that: %(hostname)s has some ports that appear to be blocked, making the node unusable. While -some ports are open, to be a fully functional node, all ports need to be -accessible at all times. Please see the following for the list of -requirements for hosting a node: +some ports are open, a fully functional node needs all ports accessible at all +times. Please see the following for the list of requirements for hosting a +node: http://www.planet-lab.org/hosting -The node will be considered 'DOWN' until the ports are unblocked. +We will consider the node 'DOWN' until the ports are unblocked. -Please investigate, and let us know if there's anything we can do to help get +Please investigate and let us know if there's anything we can do to help get it back on-line. 
You can see more information about the current status of this host here: diff --git a/monitor/wrapper/plc.py b/monitor/wrapper/plc.py index 1515396..62de999 100644 --- a/monitor/wrapper/plc.py +++ b/monitor/wrapper/plc.py @@ -142,8 +142,11 @@ def getNodeAPI(session): nodeauth = Auth(session=session) return PLC(nodeauth.auth, auth.server) -def getAuthAPI(): - return PLC(auth.auth, auth.server) +def getAuthAPI(url=None): + if url: + return PLC(auth.auth, url) + else: + return PLC(auth.auth, auth.server) def getCachedAuthAPI(): return CachedPLC(auth.auth, auth.server) diff --git a/monitor/wrapper/plccache.py b/monitor/wrapper/plccache.py index ac23f1b..78e0500 100755 --- a/monitor/wrapper/plccache.py +++ b/monitor/wrapper/plccache.py @@ -148,7 +148,7 @@ def sync(): 'longitude', 'max_slices', 'slice_ids', 'node_ids', 'enabled', 'date_created' ]) l_nodes = plc.api.GetNodes({'peer_id':None}, - ['hostname', 'node_id', 'ports', 'site_id', 'boot_state', + ['hostname', 'node_id', 'ports', 'site_id', 'boot_state', 'run_level', 'version', 'last_updated', 'date_created', 'key', 'last_contact', 'pcu_ids', 'interface_ids']) l_pcus = plc.api.GetPCUs() diff --git a/nodebad.py b/nodebad.py index 9ba6a32..acd5007 100755 --- a/nodebad.py +++ b/nodebad.py @@ -40,6 +40,7 @@ def check_node_state(rec, node): if rec.plc_node_stats: print rec.plc_node_stats boot_state = rec.plc_node_stats['boot_state'] + run_level = rec.plc_node_stats['run_level'] last_contact = rec.plc_node_stats['last_contact'] node.plc_nodeid = rec.plc_node_stats['node_id'] else: @@ -55,13 +56,12 @@ def check_node_state(rec, node): node.haspcu = False node.firewall = rec.firewall - + node.plc_siteid = rec.plc_node_stats['site_id'] # NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need # 'translations' into the node.status state # 'BOOT' is a permanent state, but we want it to have a bit of # hysteresis (less than 0.5 days) - ################################################################# # "Initialize" the 
findbad states into nodebad status if they are not already set @@ -80,24 +80,10 @@ def check_node_state(rec, node): node.status = 'offline' node.last_changed = datetime.now() - - #if node_state == 'DOWN' and node.status not in ['offline', 'down', 'disabled']: - # if boot_state != 'disabled': - # print "changed status from %s to offline" % node.status - # node.status = 'offline' - # node.last_changed = datetime.now() - # else: - # print "changed status from %s to %s" % (node.status, boot_state) - # node.status = boot_state - # node.last_changed = datetime.now() - - if node_state == 'DEBUG' and node.status != 'monitordebug' and \ - node.status != 'disabled' and \ - node.status != 'safeboot': + if node_state == 'DEBUG' and node.status not in ['failboot', 'disabled', 'safeboot']: if boot_state != 'disabled' and boot_state != 'safeboot': - - print "changed status from %s to monitordebug" % (node.status) - node.status = "monitordebug" + print "changed status from %s to failboot" % (node.status) + node.status = "failboot" node.last_changed = datetime.now() else: print "changed status from %s to %s" % (node.status, boot_state) @@ -113,8 +99,8 @@ def check_node_state(rec, node): # Switch temporary hystersis states into their 'firm' states. # online -> good after half a day # offline -> down after two days - # monitordebug -> down after 30 days - # safeboot -> monitordebug after 60 days + # failboot -> down after 30 days + # safeboot -> failboot after 60 days # disabled -> down after 60 days if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5): @@ -127,15 +113,15 @@ def check_node_state(rec, node): node.status = 'down' # NOTE: do not reset last_changed, or you lose how long it's been down. 
- if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30): + if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30): print "changed status from %s to down" % node.status node.status = 'down' # NOTE: do not reset last_changed, or you lose how long it's been down. if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60): print "changed status from %s to down" % node.status - # NOTE: change an admin mode back into monitordebug after two months. - node.status = 'monitordebug' + # NOTE: change an admin mode back into failboot after two months. + node.status = 'failboot' node.last_changed = datetime.now() # extreme cases of offline nodes diff --git a/pcucontrol/models/BayTech.py b/pcucontrol/models/BayTech.py index 1b55228..b4bf71d 100644 --- a/pcucontrol/models/BayTech.py +++ b/pcucontrol/models/BayTech.py @@ -22,6 +22,31 @@ class BayTechRPC3NC(PCUControl): self.transport.close() return 0 +class BayTechGeorgeTown(PCUControl): + supported_ports = [22,23] + def run_telnet(self, node_port, dryrun): + return self.run_ssh(node_port, dryrun) + def run_ssh(self, node_port, dryrun): + # NOTE: The georgetown pcu always drops the first connection, + self.transport.open(self.host, self.username, None, "Enter user name:") + self.transport.close() + time.sleep(1) + self.transport.open(self.host, self.username, None, "Enter user name:") + self.transport.sendPassword(self.password, "Enter Password:") + + self.transport.ifThenSend("RPC-16>", "Reboot %d" % node_port) + + # Reboot Outlet N (Y/N)? 
+ if dryrun: + self.transport.ifThenSend("(Y/N)?", "N") + else: + self.transport.ifThenSend("(Y/N)?", "Y") + self.transport.ifThenSend("RPC-16>", "") + + self.transport.close() + return 0 + + class BayTechRPC16(PCUControl): supported_ports = [22,23] def run_telnet(self, node_port, dryrun): diff --git a/pcucontrol/models/DRAC.py b/pcucontrol/models/DRAC.py index 712cf19..898b5e5 100644 --- a/pcucontrol/models/DRAC.py +++ b/pcucontrol/models/DRAC.py @@ -25,7 +25,7 @@ class DRAC(PCUControl): # Testing Reboot ? #index = s.expect(["DRAC 5", "[%s]#" % self.username ]) # NOTE: be careful to escape any characters used by 're.compile' - index = s.expect(["\$", "\[%s\]#" % self.username ]) + index = s.expect(["\$", "\[%s\]#" % self.username, "/.*>" ]) print "INDEX:", index print s if dryrun: @@ -33,12 +33,16 @@ class DRAC(PCUControl): s.sendline("racadm getsysinfo") elif index == 1: s.sendline("getsysinfo") + elif index == 2: + s.sendline("racadm getsysinfo") else: print "serveraction powercycle" if index == 0: s.sendline("racadm serveraction powercycle") elif index == 1: s.sendline("serveraction powercycle") + elif index == 2: + s.sendline("racadm serveraction powercycle") # TODO: this is really lousy. Without the sleep, the sendlines # don't completely get through. Even the added, expect line @@ -47,7 +51,7 @@ class DRAC(PCUControl): # other context... s.send("\r\n\r\n") time.sleep(20) - index = s.expect(["\$", "\[%s\]#" % self.username ]) + index = s.expect(["\$", "\[%s\]#" % self.username, "/.*>" ]) print s print "INDEX 2:", index s.sendline("exit") diff --git a/policy.py b/policy.py index fe54863..77cf76e 100755 --- a/policy.py +++ b/policy.py @@ -91,6 +91,7 @@ def main(hostnames, sitenames): # if it is offline and HAS a PCU, then try to use it. 
if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.0) and \ + not nodehist.firewall and \ not found_between(recent_actions, 'try_reboot', 3.5, 1): sitehist.attemptReboot(host) @@ -100,6 +101,7 @@ def main(hostnames, sitenames): # will be false for a day after the above condition is satisfied if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \ changed_greaterthan(nodehist.last_changed,1.5) and \ + not nodehist.firewall and \ found_between(recent_actions, 'try_reboot', 3.5, 1) and \ not found_within(recent_actions, 'pcufailed_notice', 3.5): @@ -108,7 +110,7 @@ def main(hostnames, sitenames): sitehist.sendMessage('pcufailed_notice', hostname=host) print "send message for host %s PCU Failure" % host - if nodehist.status == 'monitordebug' and \ + if nodehist.status == 'failboot' and \ changed_greaterthan(nodehist.last_changed, 1) and \ not found_between(recent_actions, 'bootmanager_restore', 0.5, 0): # send down node notice @@ -127,7 +129,7 @@ def main(hostnames, sitenames): if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5): # send down node notice - email_exception(host, "firewall_notice") + #email_exception(host, "firewall_notice") sitehist.sendMessage('firewall_notice', hostname=host) print "send message for host %s down" % host diff --git a/upgrade/monitor-server-3.0-19.sql b/upgrade/monitor-server-3.0-19.sql index 77e304a..e18934b 100644 --- a/upgrade/monitor-server-3.0-19.sql +++ b/upgrade/monitor-server-3.0-19.sql @@ -19,3 +19,5 @@ ALTER TABLE findbadnoderecord_history ADD COLUMN traceroute varchar DEFAULT NULL ALTER TABLE historynoderecord ADD COLUMN firewall boolean DEFAULT false; ALTER TABLE historynoderecord_history ADD COLUMN firewall boolean DEFAULT false; +ALTER TABLE historynoderecord ADD COLUMN plc_siteid integer DEFAULT 1; +ALTER TABLE historynoderecord_history ADD COLUMN plc_siteid integer DEFAULT 1; -- 2.43.0