X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=bootman.py;h=981a9118bf4485037e745e595a88ec8623ca709c;hb=da913fbd1629fc4669b186915df8ff3a340482d3;hp=d34e6ef0d40b778445e1d06f7ff3f281ef8825f9;hpb=46b482b101e87c6e01c954ae8085ad9785648ef0;p=monitor.git

diff --git a/bootman.py b/bootman.py
index d34e6ef..981a911 100755
--- a/bootman.py
+++ b/bootman.py
@@ -7,7 +7,7 @@ api = plc.getAuthAPI()
 
 import sys
 import os
-import policy
+import const
 
 from getsshkeys import SSHKnownHosts
 
@@ -20,11 +20,11 @@ from sets import Set
 import ssh.pxssh as pxssh
 import ssh.fdpexpect as fdpexpect
 import ssh.pexpect as pexpect
-from unified_model import *
+from monitor.model import *
 from emailTxt import mailtxt
 from nodeconfig import network_config_to_str
 import traceback
-import monitorconfig
+import config
 
 import signal
 class Sopen(subprocess.Popen):
@@ -34,11 +34,7 @@ class Sopen(subprocess.Popen):
 #from Rpyc import SocketConnection, Async
 from Rpyc import SocketConnection, Async
 from Rpyc.Utils import *
-
-def get_fbnode(node):
-	fb = database.dbLoad("findbad")
-	fbnode = fb['nodes'][node]['values']
-	return fbnode
+fb = None
 
 class NodeConnection:
 	def __init__(self, connection, node, config):
@@ -204,7 +200,7 @@ class PlanetLabSession:
 		args['port'] = self.port
 		args['user'] = 'root'
 		args['hostname'] = self.node
-		args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
+		args['monitordir'] = config.MONITOR_SCRIPT_ROOT
 		ssh_port = 22
 
 		if self.nosetup:
@@ -311,7 +307,7 @@ def reboot(hostname, config=None, forced_action=None):
 
 	# NOTE: Nothing works if the bootcd is REALLY old.
 	#       So, this is the first step.
-	fbnode = get_fbnode(hostname)
+	fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
 	if fbnode['category'] == "OLDBOOTCD":
 		print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
 		args = {}
@@ -321,7 +317,7 @@ def reboot(hostname, config=None, forced_action=None):
 							mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
 
 		loginbase = plc.siteId(hostname)
-		m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+		m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
 
 		print "\tDisabling %s due to out-of-date BOOTCD" % hostname
 		api.UpdateNode(hostname, {'boot_state' : 'disable'})
@@ -359,7 +355,6 @@ def reboot(hostname, config=None, forced_action=None):
 		except:
 			print traceback.print_exc()
 			return False
-			
 
 	if forced_action == "reboot":
 		conn.restart_node('rins')
@@ -453,7 +448,7 @@ def reboot(hostname, config=None, forced_action=None):
 										 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
 			conn.set_nodestate('disable')
 			return False
 
@@ -503,6 +498,7 @@ def reboot(hostname, config=None, forced_action=None):
 			('nodehostname' , 'Configured node hostname does not resolve'),
 			('implementerror', 'Implementation Error'),
 			('readonlyfs'   , '[Errno 30] Read-only file system'),
+			('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
 			('noinstall'    , 'notinstalled'),
 			('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
 			('noblockdev'   , "No block devices detected."),
@@ -512,6 +508,7 @@ def reboot(hostname, config=None, forced_action=None):
 			('hardwarerequirefail' , 'Hardware requirements not met'),
 			('mkfsfail'	    , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
 			('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+			('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
 			('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
 			('modulefail'   , 'Unable to get list of system modules'),
 			('writeerror'   , 'write error: No space left on device'),
@@ -539,11 +536,11 @@ def reboot(hostname, config=None, forced_action=None):
 	#  By using the sequence identifier, we guarantee that there will be no
 	#  frequent loops.  I'm guessing there is a better way to track loops,
 	#  though.
-	if not config.force and pflags.getRecentFlag(s):
-		pflags.setRecentFlag(s)
-		pflags.save() 
-		print "... flag is set or it has already run recently. Skipping %s" % node
-		return True
+	#if not config.force and pflags.getRecentFlag(s):
+	#	pflags.setRecentFlag(s)
+	#	pflags.save() 
+	#	print "... flag is set or it has already run recently. Skipping %s" % node
+	#	return True
 
 	sequences = {}
 
@@ -581,7 +578,13 @@ def reboot(hostname, config=None, forced_action=None):
 			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
 			"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
+			"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+			"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
 			"bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+			# NOTE: the actual fix appears to involve removing the bad files and
+			# then repeatedly retrying to boot the node.
+			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
 			]:
 		sequences.update({n : "restart_bootmanager_rins"})
 
@@ -615,11 +618,14 @@ def reboot(hostname, config=None, forced_action=None):
 
 	# update_node_config_email
 	for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
-			"bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+			  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+			  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
 			]:
 		sequences.update({n : "update_node_config_email"})
 
-	for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+	for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
+			   "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+			]:
 		sequences.update({n : "nodenetwork_email"})
 
 	# update_bootcd_email
@@ -643,7 +649,11 @@ def reboot(hostname, config=None, forced_action=None):
 	sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
 
 	# bad_dns_email
-	sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"})
+	for n in [ 
+	 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+		"bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+		]:
+		sequences.update( { n : "bad_dns_email"})
 
 	flag_set = True
 
@@ -708,7 +718,7 @@ def reboot(hostname, config=None, forced_action=None):
 			m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
 								True, db='nodeid_persistmessages')
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
 			conn.dump_plconf_file()
 			conn.set_nodestate('disable')
 
@@ -720,7 +730,7 @@ def reboot(hostname, config=None, forced_action=None):
 			m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
 								True, db='nodenet_persistmessages')
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
 			conn.dump_plconf_file()
 			conn.set_nodestate('disable')
 
@@ -735,7 +745,7 @@ def reboot(hostname, config=None, forced_action=None):
 								mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
 
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
 
 			print "\tDisabling %s due to out-of-date BOOTCD" % hostname
 			conn.set_nodestate('disable')
@@ -753,7 +763,7 @@ def reboot(hostname, config=None, forced_action=None):
 										 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
 			conn.set_nodestate('disable')
 
 		elif sequences[s] == "update_hardware_email":
@@ -765,7 +775,7 @@ def reboot(hostname, config=None, forced_action=None):
 										 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
 
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
 			conn.set_nodestate('disable')
 
 		elif sequences[s] == "bad_dns_email":
@@ -788,7 +798,7 @@ def reboot(hostname, config=None, forced_action=None):
 										 mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
 
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
 			conn.set_nodestate('disable')
 
 	if flag_set: