X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=bootman.py;h=5e8b908359f025fe9f0cb1bd4e1fbbee5df5b365;hb=refs%2Fheads%2F1.0;hp=b9a161f5aeacf484590b2319d676d9b25d684f54;hpb=c3f2afdc81c6711c3825c82e2cd4970671575438;p=monitor.git

diff --git a/bootman.py b/bootman.py
index b9a161f..5e8b908 100755
--- a/bootman.py
+++ b/bootman.py
@@ -7,7 +7,7 @@ api = plc.getAuthAPI()
 
 import sys
 import os
-import policy
+import const
 
 from getsshkeys import SSHKnownHosts
 
@@ -24,7 +24,9 @@ from unified_model import *
 from emailTxt import mailtxt
 from nodeconfig import network_config_to_str
 import traceback
-import monitorconfig
+import config
+
+class ExceptionDoubleSSHError(Exception): pass
 
 import signal
 class Sopen(subprocess.Popen):
@@ -34,9 +36,12 @@ class Sopen(subprocess.Popen):
 #from Rpyc import SocketConnection, Async
 from Rpyc import SocketConnection, Async
 from Rpyc.Utils import *
+fb = None
 
 def get_fbnode(node):
-	fb = database.dbLoad("findbad")
+	global fb
+	if fb is None:
+		fb = database.dbLoad("findbad")
 	fbnode = fb['nodes'][node]['values']
 	return fbnode
 
@@ -55,14 +60,18 @@ class NodeConnection:
 			return "unknown"
 
 	def get_dmesg(self):
+		t_stamp = time.strftime("%Y-%m-%d-%H:%M")
 		self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
-		download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
+		download(self.c, "/var/log/dmesg.bm.log", "log/history/%s-dmesg.%s.log" % (t_stamp, self.node))
+		os.system("cp log/history/%s-dmesg.%s.log log/dmesg.%s.log" % (t_stamp, self.node, self.node))
 		log = open("log/dmesg.%s.log" % self.node, 'r')
 		return log
 
 	def get_bootmanager_log(self):
-		download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
-		os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+		t_stamp = time.strftime("%Y-%m-%d-%H:%M")
+		download(self.c, "/tmp/bm.log", "log/history/%s-bm.%s.log" % (t_stamp, self.node))
+		#os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
+		os.system("cp log/history/%s-bm.%s.log log/bm.%s.log" % (t_stamp, self.node, self.node))
 		log = open("log/bm.%s.log" % self.node, 'r')
 		return log
 
@@ -204,7 +213,7 @@ class PlanetLabSession:
 		args['port'] = self.port
 		args['user'] = 'root'
 		args['hostname'] = self.node
-		args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
+		args['monitordir'] = config.MONITOR_SCRIPT_ROOT
 		ssh_port = 22
 
 		if self.nosetup:
@@ -229,7 +238,7 @@ class PlanetLabSession:
 			if ret != 0:
 				print "\tFAILED TWICE"
 				#sys.exit(1)
-				raise Exception("Failed twice trying to login with updated ssh host key")
+				raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 
 		t1 = time.time()
 		# KILL any already running servers.
@@ -321,7 +330,8 @@ def reboot(hostname, config=None, forced_action=None):
 							mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
 
 		loginbase = plc.siteId(hostname)
-		m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+		emails = plc.getTechEmails(loginbase)
+		m.send(emails) 
 
 		print "\tDisabling %s due to out-of-date BOOTCD" % hostname
 		api.UpdateNode(hostname, {'boot_state' : 'disable'})
@@ -334,6 +344,8 @@ def reboot(hostname, config=None, forced_action=None):
 	try:
 		k = SSHKnownHosts(); k.update(node); k.write(); del k
 	except:
+		from nodecommon import email_exception
+		email_exception()
 		print traceback.print_exc()
 		return False
 
@@ -342,9 +354,16 @@ def reboot(hostname, config=None, forced_action=None):
 			session = PlanetLabSession(node, False, True)
 		else:
 			session = PlanetLabSession(node, config.nosetup, config.verbose)
+	except ExceptionDoubleSSHError, e:
+		msg = "ERROR setting up session for %s" % hostname
+		print msg
+		return False
 	except Exception, e:
-		print "ERROR setting up session for %s" % hostname
+		msg = "ERROR setting up session for %s" % hostname
+		print msg
 		print traceback.print_exc()
+		from nodecommon import email_exception
+		email_exception(msg)
 		print e
 		return False
 
@@ -356,13 +375,18 @@ def reboot(hostname, config=None, forced_action=None):
 		try:
 			time.sleep(session.timeout*4)
 			conn = session.get_connection(config)
+		except EOFError:
+			# The connection attempt hit EOF (it has failed twice). No need to
+			# report this; the node is just in a weird state.
+			return False
 		except:
 			print traceback.print_exc()
+			from nodecommon import email_exception
+			email_exception(node)
 			return False
-			
 
 	if forced_action == "reboot":
-		conn.restart_node('rins')
+		conn.restart_node('reinstall')
 		return True
 
 	boot_state = conn.get_boot_state()
@@ -400,25 +424,34 @@ def reboot(hostname, config=None, forced_action=None):
 			('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
 
 			('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
+
+			('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
+			('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
+
 			('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
 			('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
+
 			('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
 			('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
+
 			('floppytimeout','floppy0: floppy timeout called'),
 			('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
 
+			# hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
+			# hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
+
 			# floppy0: floppy timeout called
 			# end_request: I/O error, dev fd0, sector 0
 
-			#Buffer I/O error on device dm-2, logical block 8888896
-			#ata1: status=0x51 { DriveReady SeekComplete Error }
-			#ata1: error=0x40 { UncorrectableError }
-			#SCSI error : <0 0 0 0> return code = 0x8000002
-			#sda: Current: sense key: Medium Error
+			# Buffer I/O error on device dm-2, logical block 8888896
+			# ata1: status=0x51 { DriveReady SeekComplete Error }
+			# ata1: error=0x40 { UncorrectableError }
+			# SCSI error : <0 0 0 0> return code = 0x8000002
+			# sda: Current: sense key: Medium Error
 			#	Additional sense: Unrecovered read error - auto reallocate failed
 
-			#SCSI error : <0 2 0 0> return code = 0x40001
-			#end_request: I/O error, dev sda, sector 572489600
+			# SCSI error : <0 2 0 0> return code = 0x40001
+			# end_request: I/O error, dev sda, sector 572489600
 		]
 		id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
 		sequence.append(id)
@@ -444,7 +477,8 @@ def reboot(hostname, config=None, forced_action=None):
 										 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			emails = plc.getTechEmails(loginbase)
+			m.send(emails) 
 			conn.set_nodestate('disable')
 			return False
 
@@ -503,6 +537,7 @@ def reboot(hostname, config=None, forced_action=None):
 			('hardwarerequirefail' , 'Hardware requirements not met'),
 			('mkfsfail'	    , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
 			('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
+			('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
 			('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
 			('modulefail'   , 'Unable to get list of system modules'),
 			('writeerror'   , 'write error: No space left on device'),
@@ -530,11 +565,11 @@ def reboot(hostname, config=None, forced_action=None):
 	#  By using the sequence identifier, we guarantee that there will be no
 	#  frequent loops.  I'm guessing there is a better way to track loops,
 	#  though.
-	if not config.force and pflags.getRecentFlag(s):
-		pflags.setRecentFlag(s)
-		pflags.save() 
-		print "... flag is set or it has already run recently. Skipping %s" % node
-		return True
+	#if not config.force and pflags.getRecentFlag(s):
+	#	pflags.setRecentFlag(s)
+	#	pflags.save() 
+	#	print "... flag is set or it has already run recently. Skipping %s" % node
+	#	return True
 
 	sequences = {}
 
@@ -558,7 +593,7 @@ def reboot(hostname, config=None, forced_action=None):
 			]:
 		sequences.update({n : "restart_bootmanager_boot"})
 
-	#	conn.restart_bootmanager('rins')
+	#	conn.restart_bootmanager('reinstall')
 	for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
 			"bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
 			"bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
@@ -572,14 +607,23 @@ def reboot(hostname, config=None, forced_action=None):
 			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
 			"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
 			"bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
+			"bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
+			"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
 			"bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
+			# The actual fix appears to involve removing the bad files and
+			# repeatedly trying to boot the node.
+			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
+			"bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+			"bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
+			"bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
+			"bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
 			]:
 		sequences.update({n : "restart_bootmanager_rins"})
 
 	# repair_node_keys
 	sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
 
-	#   conn.restart_node('rins')
+	#   conn.restart_node('reinstall')
 	for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
 			"bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
 			"bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
@@ -601,16 +645,20 @@ def reboot(hostname, config=None, forced_action=None):
 			 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
 			 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
 			 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
+			 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
 			 ]:
 		sequences.update({n: "restart_node_boot"})
 
 	# update_node_config_email
 	for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
-			"bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+			  "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
+			  "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
 			]:
 		sequences.update({n : "update_node_config_email"})
 
-	for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
+	for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
+			   "bminit-cfg-update-exception-nodehostname-update-debug-done", 
+			]:
 		sequences.update({n : "nodenetwork_email"})
 
 	# update_bootcd_email
@@ -634,7 +682,11 @@ def reboot(hostname, config=None, forced_action=None):
 	sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
 
 	# bad_dns_email
-	sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"})
+	for n in [ 
+	 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+		"bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
+		]:
+		sequences.update( { n : "bad_dns_email"})
 
 	flag_set = True
 
@@ -650,7 +702,7 @@ def reboot(hostname, config=None, forced_action=None):
 		m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
 									 mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
 		m.reset()
-		m.send(['monitor-list@lists.planet-lab.org'])
+		m.send([config.cc_email]) 
 
 		conn.restart_bootmanager('boot')
 
@@ -665,16 +717,16 @@ def reboot(hostname, config=None, forced_action=None):
 			conn.restart_bootmanager('boot')
 		elif sequences[s] == "restart_bootmanager_rins":
 			if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
-			conn.restart_bootmanager('rins')
+			conn.restart_bootmanager('reinstall')
 		elif sequences[s] == "restart_node_rins":
-			conn.restart_node('rins')
+			conn.restart_node('reinstall')
 		elif sequences[s] == "restart_node_boot":
 			conn.restart_node('boot')
 		elif sequences[s] == "repair_node_keys":
 			if conn.compare_and_repair_nodekeys():
 				# the keys either are in sync or were forced in sync.
 				# so try to reboot the node again.
-				conn.restart_bootmanager('rins')
+				conn.restart_bootmanager('reinstall')
 				pass
 			else:
 				# there was some failure to synchronize the keys.
@@ -688,7 +740,7 @@ def reboot(hostname, config=None, forced_action=None):
 			m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
 										 mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
 			m.reset()
-			m.send(['monitor-list@lists.planet-lab.org'])
+			m.send([config.cc_email]) 
 
 			conn.restart_bootmanager('boot')
 
@@ -699,7 +751,8 @@ def reboot(hostname, config=None, forced_action=None):
 			m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
 								True, db='nodeid_persistmessages')
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			emails = plc.getTechEmails(loginbase)
+			m.send(emails) 
 			conn.dump_plconf_file()
 			conn.set_nodestate('disable')
 
@@ -708,10 +761,11 @@ def reboot(hostname, config=None, forced_action=None):
 			args = {}
 			args['hostname'] = hostname
 			args['bmlog'] = conn.get_bootmanager_log().read()
-			m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
+			m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
 								True, db='nodenet_persistmessages')
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			emails = plc.getTechEmails(loginbase)
+			m.send(emails) 
 			conn.dump_plconf_file()
 			conn.set_nodestate('disable')
 
@@ -726,7 +780,8 @@ def reboot(hostname, config=None, forced_action=None):
 								mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
 
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			emails = plc.getTechEmails(loginbase)
+			m.send(emails) 
 
 			print "\tDisabling %s due to out-of-date BOOTCD" % hostname
 			conn.set_nodestate('disable')
@@ -744,7 +799,8 @@ def reboot(hostname, config=None, forced_action=None):
 										 mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			emails = plc.getTechEmails(loginbase)
+			m.send(emails) 
 			conn.set_nodestate('disable')
 
 		elif sequences[s] == "update_hardware_email":
@@ -756,7 +812,8 @@ def reboot(hostname, config=None, forced_action=None):
 										 mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
 
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			emails = plc.getTechEmails(loginbase)
+			m.send(emails) 
 			conn.set_nodestate('disable')
 
 		elif sequences[s] == "bad_dns_email":
@@ -764,8 +821,10 @@ def reboot(hostname, config=None, forced_action=None):
 			args = {}
 			try:
 				node = api.GetNodes(hostname)[0]
-				net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
+				net = api.GetInterfaces(node['interface_ids'])[0]
 			except:
+				from nodecommon import email_exception
+				email_exception()
 				print traceback.print_exc()
 				# TODO: api error. skip email, b/c all info is not available,
 				# flag_set will not be recorded.
@@ -774,12 +833,13 @@ def reboot(hostname, config=None, forced_action=None):
 
 			args['hostname'] = hostname
 			args['network_config'] = nodenet_str
-			args['nodenetwork_id'] = net['nodenetwork_id']
+			args['interface_id'] = net['interface_id']
 			m = PersistMessage(hostname, mailtxt.baddns[0] % args,
 										 mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
 
 			loginbase = plc.siteId(hostname)
-			m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
+			emails = plc.getTechEmails(loginbase)
+			m.send(emails) 
 			conn.set_nodestate('disable')
 
 	if flag_set: