import sys
import os
-import policy
+import const
from getsshkeys import SSHKnownHosts
from emailTxt import mailtxt
from nodeconfig import network_config_to_str
import traceback
-import monitorconfig
+import config
+
+class ExceptionDoubleSSHError(Exception): pass
import signal
class Sopen(subprocess.Popen):
#from Rpyc import SocketConnection, Async
from Rpyc import SocketConnection, Async
from Rpyc.Utils import *
+fb = None
def get_fbnode(node):
- fb = database.dbLoad("findbad")
+ global fb
+ if fb is None:
+ fb = database.dbLoad("findbad")
fbnode = fb['nodes'][node]['values']
return fbnode
def get_bootmanager_log(self):
download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
- os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+ #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+ os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
log = open("log/bm.%s.log" % self.node, 'r')
return log
args['port'] = self.port
args['user'] = 'root'
args['hostname'] = self.node
- args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
+ args['monitordir'] = config.MONITOR_SCRIPT_ROOT
ssh_port = 22
if self.nosetup:
if ret != 0:
print "\tFAILED TWICE"
#sys.exit(1)
- raise Exception("Failed twice trying to login with updated ssh host key")
+ raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
t1 = time.time()
# KILL any already running servers.
mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
print "\tDisabling %s due to out-of-date BOOTCD" % hostname
api.UpdateNode(hostname, {'boot_state' : 'disable'})
try:
k = SSHKnownHosts(); k.update(node); k.write(); del k
except:
+ from nodecommon import email_exception
+ email_exception()
print traceback.print_exc()
return False
session = PlanetLabSession(node, False, True)
else:
session = PlanetLabSession(node, config.nosetup, config.verbose)
+ except ExceptionDoubleSSHError, e:
+ msg = "ERROR setting up session for %s" % hostname
+ print msg
+ return False
except Exception, e:
- print "ERROR setting up session for %s" % hostname
+ msg = "ERROR setting up session for %s" % hostname
+ print msg
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception(msg)
print e
return False
try:
time.sleep(session.timeout*4)
conn = session.get_connection(config)
+ except EOFError:
+ # The attempt failed twice; there is no real need to report this,
+ # since the node is just in a weird state.
+ return False
except:
print traceback.print_exc()
+ from nodecommon import email_exception
+ email_exception(node)
return False
-
if forced_action == "reboot":
conn.restart_node('rins')
mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
conn.set_nodestate('disable')
return False
('hardwarerequirefail' , 'Hardware requirements not met'),
('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+ ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
('modulefail' , 'Unable to get list of system modules'),
('writeerror' , 'write error: No space left on device'),
# By using the sequence identifier, we guarantee that there will be no
# frequent loops. I'm guessing there is a better way to track loops,
# though.
- if not config.force and pflags.getRecentFlag(s):
- pflags.setRecentFlag(s)
- pflags.save()
- print "... flag is set or it has already run recently. Skipping %s" % node
- return True
+ #if not config.force and pflags.getRecentFlag(s):
+ # pflags.setRecentFlag(s)
+ # pflags.save()
+ # print "... flag is set or it has already run recently. Skipping %s" % node
+ # return True
sequences = {}
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+ "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
"bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+ # The actual solution appears to involve removing the bad files and
+ # repeatedly trying to boot the node.
+ "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+ "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
]:
sequences.update({n : "restart_bootmanager_rins"})
"bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+ "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
]:
sequences.update({n: "restart_node_boot"})
# update_node_config_email
for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
- "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+ "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
]:
sequences.update({n : "update_node_config_email"})
- for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+ for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
+ "bminit-cfg-update-exception-nodehostname-update-debug-done",
+ ]:
sequences.update({n : "nodenetwork_email"})
# update_bootcd_email
sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# bad_dns_email
- sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"})
+ for n in [
+ "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+ ]:
+ sequences.update( { n : "bad_dns_email"})
flag_set = True
m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
m.reset()
- m.send(['monitor-list@lists.planet-lab.org'])
+ m.send([config.cc_email])
conn.restart_bootmanager('boot')
m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
m.reset()
- m.send(['monitor-list@lists.planet-lab.org'])
+ m.send([config.cc_email])
conn.restart_bootmanager('boot')
m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
True, db='nodeid_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
conn.dump_plconf_file()
conn.set_nodestate('disable')
args = {}
args['hostname'] = hostname
args['bmlog'] = conn.get_bootmanager_log().read()
- m = PersistMessage(hostname, mailtxt.plnode_network[0] % args, mailtxt.plnode_cfg[1] % args,
+ m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
True, db='nodenet_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
conn.dump_plconf_file()
conn.set_nodestate('disable')
mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
print "\tDisabling %s due to out-of-date BOOTCD" % hostname
conn.set_nodestate('disable')
mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
conn.set_nodestate('disable')
elif sequences[s] == "update_hardware_email":
mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
conn.set_nodestate('disable')
elif sequences[s] == "bad_dns_email":
node = api.GetNodes(hostname)[0]
net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
except:
+ from nodecommon import email_exception
+ email_exception()
print traceback.print_exc()
# TODO: api error. skip email, b/c all info is not available,
# flag_set will not be recorded.
mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
loginbase = plc.siteId(hostname)
- m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+ emails = plc.getTechEmails(loginbase)
+ m.send(emails)
conn.set_nodestate('disable')
if flag_set: