#TODO: should add a call to ssh-add -l to check if the keys are loaded or not.
source ${MONITOR_SCRIPT_ROOT}/agent.sh
-${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
+#${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
${MONITOR_SCRIPT_ROOT}/syncwithplc.py $DATE || :
service plc restart monitor
ps ax | grep locfg | grep -v grep | awk '{print $1}' | xargs -r kill || :
${MONITOR_SCRIPT_ROOT}/policy.py $DATE
-${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
+#${MONITOR_SCRIPT_ROOT}/checksync.py $DATE || :
service plc restart monitor
curl -s 'http://summer.cs.princeton.edu/status/tabulator.cgi?table=table_nodeview&formatcsv' > /var/lib/monitor/comon/$DATE.comon.csv
print "Running MANUAL fsck on %s" % self.node
cmd = "( touch /tmp/BM_RUNNING ; " + \
" fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
- " fsck -v -f -y /dev/planetlab/vserver >> out.fsck 2>&1 ; " + \
+ " fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
" python ./BootManager.py %s &> server.log < /dev/null ; " + \
" rm -f /tmp/BM_RUNNING " + \
") &"
print ret
if ret != 0:
print "\tFAILED TWICE"
- email_exception("%s rsync failed twice" % self.node)
+ #email_exception("%s rsync failed twice" % self.node)
raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
t1 = time.time()
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
"bminit-cfg-auth-getplc-update-debug-done",
"bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
"bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
"bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
"bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
"bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
]:
sequences.update({n : "restart_bootmanager_rins"})
# fsck_repair
for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
- "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done"
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
]:
sequences.update({n : "fsck_repair"})
for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
"bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
"bminit-cfg-update-exception-nodehostname-update-debug-done",
+ "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
]:
sequences.update({n : "nodenetwork_email"})
('implementerror', 'Implementation Error'),
('fsckabort' , 'is mounted. e2fsck: Cannot continue, aborting'),
('fsckfail' , 'Running e2fsck -v -p /dev/planetlab/root failed'),
+ ('fsckfail2' , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
('readonlyfs' , '\[Errno 30\] Read-only file system'),
('baddisk' , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
('noinstall' , 'notinstalled'),
log=conn.get_dmesg().read()
sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
- conn.set_nodestate('disabled')
+ #conn.set_nodestate('disabled')
return False
args['log'] = conn.get_dmesg().read()
sitehist.sendMessage('baddisk_notice', **args)
- conn.set_nodestate('disabled')
+ #conn.set_nodestate('disabled')
elif sequences[s] == "update_hardware_email":
if not found_within(recent_actions, 'minimalhardware_notice', 7):
full_title = "exception running monitor %s" % title
m=Message(full_title, msg, False)
- m.send([config.cc_email])
+ m.send([config.exception_email])
return
def changed_lessthan(last_changed, days):
haspcu = Field(Boolean,default=False)
firewall = Field(Boolean,default=False)
plc_nodeid = Field(Int,default=1)
+ plc_siteid = Field(Int,default=1)
acts_as_versioned(ignore=['last_changed', 'last_checked'])
m = Message(message[0] % args, message[1] % args, viart, self.db.message_id)
if ccemail:
- contacts = [config.cc_email]
+ contacts = [config.exception_email]
else:
contacts = self.getContacts()
values['observed_status'] = 'DOWN'
values['firewall'] = True
+ #if values['port_status']['22'] == "open" and \
+ # values['port_status']['80'] == "closed" and \
+ # values['port_status']['806'] == "open" :
+ # email_exception("%s port 80 blocked" % nodename, "possible VSERVER ref blocked")
+
#if not values['external_dns_status']:
# email_exception("%s DNS down" % nodename)
that there is a problem with the PCU configuration, we can help you
resolve that independently.
- 2. If it is possible, please correcct the above PCU problem, or let us know
+ 2. If it is possible, please correct the above PCU problem, or let us know
what steps you are taking. By enabling us to take administrative actions
automatically without your intervention, you will save time in the future
the next time we need to reboot this machine, because we will be able to
%(hostname)s
has some ports that appear to be blocked, making the node unusable. While
-some ports are open, to be a fully functional node, all ports need to be
-accessible at all times. Please see the following for the list of
-requirements for hosting a node:
+some ports are open, a fully functional node needs all ports accessible at all
+times. Please see the following for the list of requirements for hosting a
+node:
http://www.planet-lab.org/hosting
-The node will be considered 'DOWN' until the ports are unblocked.
+We will consider the node 'DOWN' until the ports are unblocked.
-Please investigate, and let us know if there's anything we can do to help get
+Please investigate and let us know if there's anything we can do to help get
it back on-line. You can see more information about the current status of
this host here:
nodeauth = Auth(session=session)
return PLC(nodeauth.auth, auth.server)
-def getAuthAPI():
- return PLC(auth.auth, auth.server)
+def getAuthAPI(url=None):
+ if url:
+ return PLC(auth.auth, url)
+ else:
+ return PLC(auth.auth, auth.server)
def getCachedAuthAPI():
return CachedPLC(auth.auth, auth.server)
'longitude', 'max_slices', 'slice_ids', 'node_ids',
'enabled', 'date_created' ])
l_nodes = plc.api.GetNodes({'peer_id':None},
- ['hostname', 'node_id', 'ports', 'site_id', 'boot_state',
+ ['hostname', 'node_id', 'ports', 'site_id', 'boot_state', 'run_level',
'version', 'last_updated', 'date_created', 'key',
'last_contact', 'pcu_ids', 'interface_ids'])
l_pcus = plc.api.GetPCUs()
if rec.plc_node_stats:
print rec.plc_node_stats
boot_state = rec.plc_node_stats['boot_state']
+ run_level = rec.plc_node_stats['run_level']
last_contact = rec.plc_node_stats['last_contact']
node.plc_nodeid = rec.plc_node_stats['node_id']
else:
node.haspcu = False
node.firewall = rec.firewall
-
+ node.plc_siteid = rec.plc_node_stats['site_id']
# NOTE: 'DOWN' and 'DEBUG' are temporary states, so only need
# 'translations' into the node.status state
# 'BOOT' is a permanent state, but we want it to have a bit of
# hysteresis (less than 0.5 days)
-
#################################################################
# "Initialize" the findbad states into nodebad status if they are not already set
node.status = 'offline'
node.last_changed = datetime.now()
-
- #if node_state == 'DOWN' and node.status not in ['offline', 'down', 'disabled']:
- # if boot_state != 'disabled':
- # print "changed status from %s to offline" % node.status
- # node.status = 'offline'
- # node.last_changed = datetime.now()
- # else:
- # print "changed status from %s to %s" % (node.status, boot_state)
- # node.status = boot_state
- # node.last_changed = datetime.now()
-
- if node_state == 'DEBUG' and node.status != 'monitordebug' and \
- node.status != 'disabled' and \
- node.status != 'safeboot':
+ if node_state == 'DEBUG' and node.status not in ['failboot', 'disabled', 'safeboot']:
if boot_state != 'disabled' and boot_state != 'safeboot':
-
- print "changed status from %s to monitordebug" % (node.status)
- node.status = "monitordebug"
+ print "changed status from %s to failboot" % (node.status)
+ node.status = "failboot"
node.last_changed = datetime.now()
else:
print "changed status from %s to %s" % (node.status, boot_state)
# Switch temporary hystersis states into their 'firm' states.
# online -> good after half a day
# offline -> down after two days
- # monitordebug -> down after 30 days
- # safeboot -> monitordebug after 60 days
+ # failboot -> down after 30 days
+ # safeboot -> failboot after 60 days
# disabled -> down after 60 days
if node.status == 'online' and changed_greaterthan(node.last_changed, 0.5):
node.status = 'down'
# NOTE: do not reset last_changed, or you lose how long it's been down.
- if node.status == 'monitordebug' and changed_greaterthan(node.last_changed, 30):
+ if node.status == 'failboot' and changed_greaterthan(node.last_changed, 30):
print "changed status from %s to down" % node.status
node.status = 'down'
# NOTE: do not reset last_changed, or you lose how long it's been down.
if node.status == 'safeboot' and changed_greaterthan(node.last_changed, 60):
print "changed status from %s to down" % node.status
- # NOTE: change an admin mode back into monitordebug after two months.
- node.status = 'monitordebug'
+ # NOTE: change an admin mode back into failboot after two months.
+ node.status = 'failboot'
node.last_changed = datetime.now()
# extreme cases of offline nodes
self.transport.close()
return 0
+class BayTechGeorgeTown(PCUControl):
+ supported_ports = [22,23]
+ def run_telnet(self, node_port, dryrun):
+ return self.run_ssh(node_port, dryrun)
+ def run_ssh(self, node_port, dryrun):
+        # NOTE: The GeorgeTown PCU always drops the first connection, so
+        # open and close a throwaway session, then reconnect to authenticate.
+ self.transport.open(self.host, self.username, None, "Enter user name:")
+ self.transport.close()
+ time.sleep(1)
+ self.transport.open(self.host, self.username, None, "Enter user name:")
+ self.transport.sendPassword(self.password, "Enter Password:")
+
+ self.transport.ifThenSend("RPC-16>", "Reboot %d" % node_port)
+
+ # Reboot Outlet N (Y/N)?
+ if dryrun:
+ self.transport.ifThenSend("(Y/N)?", "N")
+ else:
+ self.transport.ifThenSend("(Y/N)?", "Y")
+ self.transport.ifThenSend("RPC-16>", "")
+
+ self.transport.close()
+ return 0
+
+
class BayTechRPC16(PCUControl):
supported_ports = [22,23]
def run_telnet(self, node_port, dryrun):
# Testing Reboot ?
#index = s.expect(["DRAC 5", "[%s]#" % self.username ])
# NOTE: be careful to escape any characters used by 're.compile'
- index = s.expect(["\$", "\[%s\]#" % self.username ])
+ index = s.expect(["\$", "\[%s\]#" % self.username, "/.*>" ])
print "INDEX:", index
print s
if dryrun:
s.sendline("racadm getsysinfo")
elif index == 1:
s.sendline("getsysinfo")
+ elif index == 2:
+ s.sendline("racadm getsysinfo")
else:
print "serveraction powercycle"
if index == 0:
s.sendline("racadm serveraction powercycle")
elif index == 1:
s.sendline("serveraction powercycle")
+ elif index == 2:
+ s.sendline("racadm serveraction powercycle")
# TODO: this is really lousy. Without the sleep, the sendlines
# don't completely get through. Even the added, expect line
# other context...
s.send("\r\n\r\n")
time.sleep(20)
- index = s.expect(["\$", "\[%s\]#" % self.username ])
+ index = s.expect(["\$", "\[%s\]#" % self.username, "/.*>" ])
print s
print "INDEX 2:", index
s.sendline("exit")
# if it is offline and HAS a PCU, then try to use it.
if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
changed_greaterthan(nodehist.last_changed,1.0) and \
+ not nodehist.firewall and \
not found_between(recent_actions, 'try_reboot', 3.5, 1):
sitehist.attemptReboot(host)
# will be false for a day after the above condition is satisfied
if nodehist.haspcu and nodehist.status in ['offline', 'down'] and \
changed_greaterthan(nodehist.last_changed,1.5) and \
+ not nodehist.firewall and \
found_between(recent_actions, 'try_reboot', 3.5, 1) and \
not found_within(recent_actions, 'pcufailed_notice', 3.5):
sitehist.sendMessage('pcufailed_notice', hostname=host)
print "send message for host %s PCU Failure" % host
- if nodehist.status == 'monitordebug' and \
+ if nodehist.status == 'failboot' and \
changed_greaterthan(nodehist.last_changed, 1) and \
not found_between(recent_actions, 'bootmanager_restore', 0.5, 0):
# send down node notice
if nodehist.firewall and not found_within(recent_actions, 'firewall_notice', 3.5):
# send down node notice
- email_exception(host, "firewall_notice")
+ #email_exception(host, "firewall_notice")
sitehist.sendMessage('firewall_notice', hostname=host)
print "send message for host %s down" % host
ALTER TABLE historynoderecord ADD COLUMN firewall boolean DEFAULT false;
ALTER TABLE historynoderecord_history ADD COLUMN firewall boolean DEFAULT false;
+ALTER TABLE historynoderecord ADD COLUMN plc_siteid integer DEFAULT 1;
+ALTER TABLE historynoderecord_history ADD COLUMN plc_siteid integer DEFAULT 1;