import traceback
import config
# Dedicated exception type for the "Failed twice trying to login with updated
# ssh host key" case. Having its own type lets the session-setup handler catch
# it ahead of the generic `except Exception` branch and return False without
# going through the traceback/email reporting done there.
+class ExceptionDoubleSSHError(Exception): pass
+
import signal
class Sopen(subprocess.Popen):
def kill(self, signal = signal.SIGTERM):
return "unknown"
# Capture the node's dmesg output and return it as an open local file.
#
# NOTE: this is a patch fragment -- '-' lines are the code being removed,
# '+' lines the code being added, unmarked lines are unchanged context.
#
# Returns a file object opened for reading on log/dmesg.<node>.log. It is not
# closed here, so the caller is responsible for closing it.
def get_dmesg(self):
# Minute-resolution stamp: two calls within the same minute produce the same
# history file name.
+ t_stamp = time.strftime("%Y-%m-%d-%H:%M")
# Dump dmesg into a file via self.c (presumably the connection to the node,
# so the command runs on the node -- TODO confirm).
self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
# Old behaviour: download straight to the unstamped per-node path.
- download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
# New behaviour: download to a timestamped file under log/history/ ...
+ download(self.c, "/var/log/dmesg.bm.log", "log/history/%s-dmesg.%s.log" % (t_stamp, self.node))
# ... then copy it to the unstamped path that is opened below.
# NOTE(review): log/history/ is not created here (assumed to exist), and
# self.node is interpolated unquoted into a shell command -- fine only if it is
# always a plain hostname; verify against callers.
+ os.system("cp log/history/%s-dmesg.%s.log log/dmesg.%s.log" % (t_stamp, self.node, self.node))
log = open("log/dmesg.%s.log" % self.node, 'r')
return log
# Fetch /tmp/bm.log through self.c and return it as an open local file.
#
# NOTE: this is a patch fragment -- '-' lines are the code being removed,
# '+' lines the code being added, unmarked lines are unchanged context.
#
# Returns a file object opened for reading on log/bm.<node>.log. It is not
# closed here, so the caller is responsible for closing it.
def get_bootmanager_log(self):
# Old behaviour: store the download as log/bm.<node>.log.gz and decompress it
# with zcat into log/bm.<node>.log.
- download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
- os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
# Minute-resolution stamp: two calls within the same minute produce the same
# history file name.
+ t_stamp = time.strftime("%Y-%m-%d-%H:%M")
# New behaviour: keep a timestamped copy under log/history/ (directory is not
# created here -- assumed to exist).
+ download(self.c, "/tmp/bm.log", "log/history/%s-bm.%s.log" % (t_stamp, self.node))
# The decompression step is disabled; the file is now copied as-is.
# NOTE(review): this presumes /tmp/bm.log is no longer gzip-compressed -- confirm.
+ #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
# NOTE(review): self.node is interpolated unquoted into a shell command --
# fine only if it is always a plain hostname; verify against callers.
+ os.system("cp log/history/%s-bm.%s.log log/bm.%s.log" % (t_stamp, self.node, self.node))
log = open("log/bm.%s.log" % self.node, 'r')
return log
if ret != 0:
print "\tFAILED TWICE"
#sys.exit(1)
- raise Exception("Failed twice trying to login with updated ssh host key")
+ raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
t1 = time.time()
# KILL any already running servers.
try:
k = SSHKnownHosts(); k.update(node); k.write(); del k
except:
+ from nodecommon import email_exception
+ email_exception()
print traceback.print_exc()
return False
session = PlanetLabSession(node, False, True)
else:
session = PlanetLabSession(node, config.nosetup, config.verbose)
+ except ExceptionDoubleSSHError, e:
+ msg = "ERROR setting up session for %s" % hostname
+ print msg
+ return False
except Exception, e:
- print "ERROR setting up session for %s" % hostname
+ msg = "ERROR setting up session for %s" % hostname
+ print msg
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception(msg)
print e
return False
try:
time.sleep(session.timeout*4)
conn = session.get_connection(config)
+ except EOFError:
+ # EOFError here means the connection attempt failed twice; the node is
+ # just in a weird state, so return without emailing a report.
+ return False
except:
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception(node)
return False
if forced_action == "reboot":
- conn.restart_node('rins')
+ conn.restart_node('reinstall')
return True
boot_state = conn.get_boot_state()
]:
sequences.update({n : "restart_bootmanager_boot"})
- # conn.restart_bootmanager('rins')
+ # conn.restart_bootmanager('reinstall')
for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
"bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
# actual solution appears to involve removing the bad files, and
# continually trying to boot the node.
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
]:
sequences.update({n : "restart_bootmanager_rins"})
# repair_node_keys
sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
- # conn.restart_node('rins')
+ # conn.restart_node('reinstall')
for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
"bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
]:
sequences.update({n: "restart_node_boot"})
conn.restart_bootmanager('boot')
elif sequences[s] == "restart_bootmanager_rins":
if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
- conn.restart_bootmanager('rins')
+ conn.restart_bootmanager('reinstall')
elif sequences[s] == "restart_node_rins":
- conn.restart_node('rins')
+ conn.restart_node('reinstall')
elif sequences[s] == "restart_node_boot":
conn.restart_node('boot')
elif sequences[s] == "repair_node_keys":
if conn.compare_and_repair_nodekeys():
# the keys either are in sync or were forced in sync.
# so try to reboot the node again.
- conn.restart_bootmanager('rins')
+ conn.restart_bootmanager('reinstall')
pass
else:
# there was some failure to synchronize the keys.
args = {}
args['hostname'] = hostname
args['bmlog'] = conn.get_bootmanager_log().read()
- m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
+ m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
True, db='nodenet_persistmessages')
loginbase = plc.siteId(hostname)
emails = plc.getTechEmails(loginbase)
args = {}
try:
node = api.GetNodes(hostname)[0]
- net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+ net = api.GetInterfaces(node['interface_ids'])[0]
except:
+ from nodecommon import email_exception
+ email_exception()
print traceback.print_exc()
# TODO: api error. skip email, b/c all info is not available,
# flag_set will not be recorded.
args['hostname'] = hostname
args['network_config'] = nodenet_str
- args['nodenetwork_id'] = net['nodenetwork_id']
+ args['interface_id'] = net['interface_id']
m = PersistMessage(hostname, mailtxt.baddns[0] % args,
mailtxt.baddns[1] % args, True, db='baddns_persistmessages')