3 # Attempt to reboot a node in debug state.
16 from monitor.getsshkeys import SSHKnownHosts
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
41 api = plc.getAuthAPI()
# Raised when ssh login to a node fails twice: once with the cached host key
# and again after refreshing the node's known_hosts entry.
45 class ExceptionDoubleSSHError(Exception): pass
# NodeConnection method: bind an established Rpyc connection, the target
# hostname, and the monitor config object onto the instance.
# NOTE(review): the assignment statements of this __init__ are elided in this
# listing; presumably they store the three arguments as self.c / self.node /
# self.config (self.c and self.node are read by the other methods below).
48 def __init__(self, connection, node, config):
# Infer the node's current runtime state from filesystem markers on the
# remote side: /tmp/source exists while the BootManager environment is
# unpacked (debug), /vservers exists on a fully-booted production node.
53 def get_boot_state(self):
55 if self.c.modules.os.path.exists('/tmp/source'):
57 elif self.c.modules.os.path.exists('/vservers'):
# NOTE(review): the return statements for each branch are elided in this
# listing; callers compare the result against strings such as "debug" and
# "boot" (see restore_basic), so each branch presumably returns a state name.
63 print self.c.modules.sys.path
# Body of get_dmesg() (the def line is elided in this listing): capture the
# node's dmesg on the remote side, download a timestamped copy into the
# history/ archive, refresh the "latest" copy, and reopen it locally.
71 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
# Dump the remote kernel ring buffer to a file we can fetch.
72 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
73 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
# Keep an un-timestamped "most recent" copy next to the history archive.
74 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
# Returned (presumably) as an open read-only file object for fdpexpect.
75 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# Download the node's BootManager log (/tmp/bm.log), archiving a timestamped
# copy under history/ and refreshing the "latest" copy, then reopen locally.
# Mirrors the structure of get_dmesg() above.
78 def get_bootmanager_log(self):
79 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
80 download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
81 os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
# NOTE(review): the return statement is elided; callers use the result as a
# readable file object (e.g. conn.get_bootmanager_log().read()).
82 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
86 # def get_dmesg(self):
87 # self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
88 # download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
89 # log = open("log/dmesg.%s.log" % self.node, 'r')
92 # def get_bootmanager_log(self):
93 # download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
94 # #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
95 # os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
96 # log = open("log/bm.%s.log" % self.node, 'r')
# Drive the remote BootManager far enough to read and print the node's
# configuration variables (bm.VARS) — used to debug bad plnode.txt files.
99 def dump_plconf_file(self):
# Make the unpacked BootManager source importable on the remote side.
101 self.c.modules.sys.path.append("/tmp/source/")
102 self.c.modules.os.chdir('/tmp/source')
# NOTE(review): these lines use a bare `c`, not `self.c`; a `c = self.c`
# alias is presumably assigned on one of the elided lines — confirm.
104 log = c.modules.BootManager.log('/tmp/new.log')
105 bm = c.modules.BootManager.BootManager(log,'boot')
107 BootManagerException = c.modules.Exceptions.BootManagerException
108 InitializeBootManager = c.modules.BootManager.InitializeBootManager
109 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# Run only the BootManager steps needed to populate bm.VARS from the node's
# configuration file.
112 InitializeBootManager.Run(bm.VARS, bm.LOG)
113 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
# (except-clause lines elided) — a failure here usually means no valid
# configuration file was found on the node.
117 print " Possibly, unable to find valid configuration file"
# On success, dump every configuration variable for inspection.
120 for key in bm.VARS.keys():
121 print key, " == ", bm.VARS[key]
123 print " Unable to read Node Configuration"
# Compare the NODE_KEY stored on the node (read via BootManager) against the
# key PLC has on record; if they differ, push the node's key up to PLC.
# Returns (presumably) truthy when the keys are, or were forced, in sync —
# see the repair_node_keys branch of restore_basic().
126 def compare_and_repair_nodekeys(self):
128 self.c.modules.sys.path.append("/tmp/source/")
129 self.c.modules.os.chdir('/tmp/source')
# NOTE(review): bare `c` vs `self.c` — a `c = self.c` alias is presumably
# assigned on an elided line, as in dump_plconf_file() above.
131 log = c.modules.BootManager.log('/tmp/new.log')
132 bm = c.modules.BootManager.BootManager(log,'boot')
134 BootManagerException = c.modules.Exceptions.BootManagerException
135 InitializeBootManager = c.modules.BootManager.InitializeBootManager
136 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's view of this node, from the local cache.
139 plcnode = plccache.GetNodeByName(self.node)
141 InitializeBootManager.Run(bm.VARS, bm.LOG)
142 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
147 print " Possibly, unable to find valid configuration file"
# Show both keys before deciding whether a repair is needed.
150 print " NODE: %s" % bm.VARS['NODE_KEY']
151 print " PLC : %s" % plcnode['key']
153 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Keys differ: overwrite PLC's record with the key the node actually holds.
156 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
157 print " Successfully updated NODE_KEY with PLC"
162 #for key in bm.VARS.keys():
163 # print key, " == ", bm.VARS[key]
165 print " Unable to retrieve NODE_KEY"
# True iff the BootManager's lock file exists on the node, i.e. a
# BootManager run is already in progress (see restart_bootmanager, which
# creates/removes /tmp/BM_RUNNING).  Return statements are elided here.
167 def bootmanager_running(self):
168 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
173 def set_nodestate(self, state='boot'):
174 return api.UpdateNode(self.node, {'boot_state' : state})
# Reboot the node into `state`, escalating if a recent gentle attempt was
# already made.  First attempt (per 24h window, tracked by the 'gentlekill'
# persist flag): kill all slice processes and do a clean `shutdown -r`.
# Subsequent attempts: force a reboot via the sysrq trigger (sync, unmount,
# boot).
176 def restart_node(self, state='boot'):
177 api.UpdateNode(self.node, {'boot_state' : state})
# One flag database per node; flags expire after 1 day (1*60*60*24 s).
179 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
180 if not pflags.getRecentFlag('gentlekill'):
181 print " Killing all slice processes... : %s" % self.node
# vkill -9 every process in every vserver context before rebooting.
182 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
183 self.c.modules.os.system(cmd_slicekill)
# Delayed, backgrounded shutdown so the Rpyc call can return first.
184 cmd = """ shutdown -r +1 & """
185 print " Restarting %s : %s" % ( self.node, cmd)
186 self.c.modules.os.system(cmd)
# Remember that the gentle path was tried, so the next call escalates.
188 pflags.setRecentFlag('gentlekill')
191 print " Restarting with sysrq 'sub' %s" % self.node
# 's' = sync disks, 'u' = remount read-only, 'b' = immediate reboot.
192 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
193 self.c.modules.os.system(cmd)
# Re-run BootManager.py on the node with the given forced state (e.g.
# 'boot' or 'reinstall'), unless a run is already in progress.  The
# /tmp/BM_RUNNING file acts as the lock observed by bootmanager_running().
197 def restart_bootmanager(self, forceState):
199 self.c.modules.os.chdir('/tmp/source')
200 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
201 print " BootManager is already running: try again soon..."
203 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
# Create the lock, run BootManager detached from our tty, drop the lock.
# NOTE(review): the closing of this shell command (the trailing ")" and
# backgrounding) is on an elided line (207).
204 cmd = "( touch /tmp/BM_RUNNING ; " + \
205 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
206 " rm -f /tmp/BM_RUNNING " + \
208 cmd = cmd % forceState
209 self.c.modules.os.system(cmd)
# Manages an ssh tunnel + remote Rpyc server used to run code on a node.
214 class PlanetLabSession:
# Class-wide counter for local tunnel ports; randomized base in
# 22000-22999 so concurrent monitor processes rarely collide.
215 globalport = 22000 + int(random.random()*1000)
# node: hostname; nosetup: skip host setup (rsync/server start);
# verbose: extra progress output.  Some assignments (e.g. self.node) are on
# elided lines.
217 def __init__(self, node, nosetup, verbose):
218 self.verbose = verbose
221 self.nosetup = nosetup
# Build a NodeConnection over the local end of the ssh tunnel (the Rpyc
# server on the node is reached through localhost:self.port).
225 def get_connection(self, config):
226 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
229 # print i, conn.c.modules.sys.path
230 # print conn.c.modules.os.path.exists('/tmp/source')
235 def setup_host(self):
236 self.port = PlanetLabSession.globalport
237 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
240 args['port'] = self.port
241 args['user'] = 'root'
242 args['hostname'] = self.node
243 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
247 print "Skipping setup"
250 # COPY Rpyc files to host
251 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
252 if self.verbose: print cmd
256 localos = moncommands.CMD()
258 ret = localos.system(cmd, timeout)
261 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
262 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
263 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
264 ret = localos.system(cmd, timeout)
267 print "\tFAILED TWICE"
269 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
272 # KILL any already running servers.
273 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
274 (ov,ev) = ssh.run_noexcept2("""<<\EOF
276 echo "kill server" >> out.log
277 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
278 echo "export" >> out.log
279 export PYTHONPATH=$HOME ;
280 echo "start server" >> out.log
281 python Rpyc/Servers/forking_server.py &> server.log &
282 echo "done" >> out.log
284 #cmd = """ssh %(user)s@%(hostname)s """ + \
285 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
287 #if self.verbose: print cmd
289 #print localos.system(cmd,timeout)
291 ## START a new rpyc server.
292 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
293 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
295 #if self.verbose: print cmd
296 #print localos.system(cmd,timeout)
297 print "setup rpyc server over ssh"
301 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
302 # and the following options seems to work well.
303 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
304 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
305 """-o ConnectTimeout=120 """ + \
306 """-n -N -L %(port)s:localhost:18812 """ + \
307 """%(user)s@%(hostname)s"""
309 if self.verbose: print cmd
311 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
312 # TODO: the read() here may block indefinitely. Need a better
313 # approach therefore, that includes a timeout.
314 #ret = self.command.stdout.read(5)
315 ret = moncommands.read_t(self.command.stdout, 5)
319 # NOTE: There is still a slight race for machines that are slow...
320 self.timeout = 2*(t2-t1)
321 print "Sleeping for %s sec" % self.timeout
322 time.sleep(self.timeout)
325 if self.command.returncode is not None:
326 print "Failed to establish tunnel!"
327 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
329 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Teardown fragment (the enclosing def — presumably __del__ — is elided):
# announce that the tunnel for this session's port is being killed.
333 if self.verbose: print "Killing SSH session %s" % self.port
334 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
	"""Project one column out of a list of step tuples.

	steps -- sequence of tuples, e.g. [('bminit', 'Initializing...'), ...]
	index -- which element of each tuple to take (default 1: the pattern).

	Returns a list, as callers rely on list concatenation:
	    child.expect( steps_to_list(steps) + [ pexpect.EOF ] )
	"""
	# A list comprehension is behavior-identical to map() under Python 2
	# (both yield a real list) and, unlike map(), still yields a list under
	# Python 3 — keeping the callers' "+ [pexpect.EOF]" idiom valid.
	return [step[index] for step in steps]
# Map an index returned by pexpect's expect() back to the step identifier
# (first tuple element).  Indexes past the end of `steps` correspond to the
# extra [pexpect.EOF] entry appended by callers; that branch (presumably
# returning a terminal id such as "done") is on an elided line.
341 def index_to_id(steps,index):
342 if index < len(steps):
343 return steps[index][0]
# High-level debugging interface for one node: opens a PlanetLabSession,
# knows the BootManager/dmesg log patterns, and maps observed bm.log step
# sequences to repair actions (see getSequences).
347 class DebugInterface:
348 def __init__(self, hostname):
349 self.hostname = hostname
# Establish a NodeConnection to self.hostname via a PlanetLabSession,
# retrying the connection once after an extended wait.  On unrecoverable
# failure, returns False (see the `type(conn) == type(False)` check in
# restore_basic).
352 def getConnection(self):
353 print "Creating session for %s" % self.hostname
354 # update known_hosts file (in case the node has rebooted since last run)
356 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
# NOTE(review): traceback.print_exc() returns None, so this prints "None"
# after the traceback; `traceback.print_exc()` alone would suffice.
359 print traceback.print_exc()
# First form forces setup+verbose; second honors the parsed config flags.
# Which one runs depends on a condition on an elided line.
364 self.session = PlanetLabSession(self.hostname, False, True)
366 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
367 except ExceptionDoubleSSHError, e:
368 msg = "ERROR setting up session for %s" % self.hostname
372 traceback.print_exc()
377 conn = self.session.get_connection(config)
379 # NOTE: sometimes the wait in setup_host() is not long enough.
380 # So, here we try to wait a little longer before giving up entirely.
382 time.sleep(self.session.timeout*5)
383 conn = self.session.get_connection(config)
385 # failed twice... no need to report this really, it's just in a
389 traceback.print_exc()
390 email_exception(self.hostname)
392 #print "trying to use conn before returning it."
393 #print conn.c.modules.sys.path
394 #print conn.c.modules.os.path.exists('/tmp/source')
397 #print "conn: %s" % conn
# Build the mapping from an observed BootManager step sequence (the step
# ids joined with '-', e.g. "bminit-cfg-auth-...-done") to the repair
# action restore_basic() should take.  The `sequences = {}` initialization
# and the final `return sequences` are on elided lines.
400 def getSequences(self):
402 # TODO: This can be replaced with a DB definition at a future time.
403 # This would make it possible for an admin to introduce new
404 # patterns without touching code.
407 # restart_bootmanager_boot
408 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
409 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
410 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
412 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
414 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
415 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
416 "bminit-cfg-auth-getplc-update-debug-done",
417 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
418 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
419 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
420 "bminit-cfg-auth-protoerror-exception-update-debug-done",
421 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
422 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
424 sequences.update({n : "restart_bootmanager_boot"})
426 # conn.restart_bootmanager('reinstall')
427 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
428 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
429 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
430 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
431 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
432 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
433 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
434 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
435 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
436 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
437 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
438 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
439 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
440 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
441 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
442 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
443 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
444 # actual solution appears to involve removing the bad files, and
445 # continually trying to boot the node.
446 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
447 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
448 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
449 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
450 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
452 sequences.update({n : "restart_bootmanager_rins"})
# repair_node_keys: authentication failed -> resync NODE_KEY with PLC.
455 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
456 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
458 sequences.update({n: "repair_node_keys"})
460 # conn.restart_node('reinstall')
461 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
462 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
463 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
464 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
465 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
466 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
467 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
468 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
469 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
470 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
471 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
472 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
474 sequences.update({n : "restart_node_rins"})
# restart_node_boot: transient failures -> full machine reboot into 'boot'.
477 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
478 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
479 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
480 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
481 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
482 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
483 "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
485 sequences.update({n: "restart_node_boot"})
487 # update_node_config_email
488 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
489 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
490 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
491 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
493 sequences.update({n : "update_node_config_email"})
# nodenetwork_email: hostname in node config does not resolve.
495 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
496 "bminit-cfg-update-exception-nodehostname-update-debug-done",
498 sequences.update({n : "nodenetwork_email"})
500 # update_bootcd_email
501 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
502 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
503 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
504 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
505 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
507 sequences.update({n : "update_bootcd_email"})
# suspect_error_email: seen once; notify rather than act automatically.
509 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
511 sequences.update({n: "suspect_error_email"})
513 # update_hardware_email
514 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
515 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
517 # broken_hardware_email
518 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# bad_dns_email (the `for n in [` opener is on an elided line).
522 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
523 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
525 sequences.update( { n : "bad_dns_email"})
# Return the (id, regex) pairs used to scan dmesg output for disk/hardware
# failure signatures.  The list literal's opening/closing brackets and the
# return statement are on elided lines; sample matching log lines are kept
# below as reference.
529 def getDiskSteps(self):
531 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
532 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
533 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
535 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
537 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
538 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
540 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
541 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
543 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
544 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
546 ('floppytimeout','floppy0: floppy timeout called'),
547 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
549 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
550 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
552 # floppy0: floppy timeout called
553 # end_request: I/O error, dev fd0, sector 0
555 # Buffer I/O error on device dm-2, logical block 8888896
556 # ata1: status=0x51 { DriveReady SeekComplete Error }
557 # ata1: error=0x40 { UncorrectableError }
558 # SCSI error : <0 0 0 0> return code = 0x8000002
559 # sda: Current: sense key: Medium Error
560 # Additional sense: Unrecovered read error - auto reallocate failed
562 # SCSI error : <0 2 0 0> return code = 0x40001
563 # end_request: I/O error, dev sda, sector 572489600
# Scan `child` (an fdpexpect spawn over a dmesg capture) for the disk-error
# patterns in `steps`, collecting the matched ids until EOF.  The
# surrounding loop/accumulator and return are on elided lines; callers
# receive a sequence of ids (e.g. tested with `'floppyerror' in s`).
567 def getDiskSequence(self, steps, child):
570 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
# Return the ordered (id, pattern) pairs used to tokenize a bm.log into the
# step-id sequences consumed by getSequences().  The list's opening/closing
# brackets and the return statement are on elided lines.
577 def getBootManagerStepPatterns(self):
579 ('bminit' , 'Initializing the BootManager.'),
580 ('cfg' , 'Reading node configuration file.'),
581 ('auth' , 'Authenticating node with PLC.'),
582 ('getplc' , 'Retrieving details of node from PLC.'),
583 ('update' , 'Updating node boot state at PLC.'),
584 ('hardware' , 'Checking if hardware requirements met.'),
585 ('installinit' , 'Install: Initializing.'),
586 ('installdisk' , 'Install: partitioning disks.'),
587 ('installbootfs', 'Install: bootstrapfs tarball.'),
588 ('installcfg' , 'Install: Writing configuration files.'),
589 ('installstop' , 'Install: Shutting down installer.'),
590 ('update2' , 'Updating node boot state at PLC.'),
591 ('installinit2' , 'Install: Initializing.'),
592 ('validate' , 'Validating node installation.'),
593 ('rebuildinitrd', 'Rebuilding initrd'),
594 ('netcfg' , 'Install: Writing Network Configuration files.'),
595 ('update3' , 'Updating node configuration.'),
596 ('disk' , 'Checking for unused disks to add to LVM.'),
597 ('update4' , 'Sending hardware configuration to PLC.'),
598 ('debug' , 'Starting debug mode'),
599 ('bmexceptmount', 'BootManagerException during mount'),
600 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
601 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
602 ('exception' , 'Exception'),
603 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
604 ('protoerror' , 'XML RPC protocol error'),
605 ('nodehostname' , 'Configured node hostname does not resolve'),
606 ('implementerror', 'Implementation Error'),
607 ('readonlyfs' , '[Errno 30] Read-only file system'),
608 ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
609 ('noinstall' , 'notinstalled'),
610 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
611 ('noblockdev' , "No block devices detected."),
612 ('dnserror' , 'Name or service not known'),
613 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
614 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
615 ('hardwarerequirefail' , 'Hardware requirements not met'),
616 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
617 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
618 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
619 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
620 ('modulefail' , 'Unable to get list of system modules'),
621 ('writeerror' , 'write error: No space left on device'),
622 ('nospace' , "No space left on device"),
623 ('nonode' , 'Failed to authenticate call: No such node'),
624 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
625 ('bootcheckfail' , 'BootCheckAuthentication'),
626 ('bootupdatefail' , 'BootUpdateNode'),
# Tokenize a bm.log (via the fdpexpect `child`) into the ordered list of
# step ids that restore_basic() later joins with '-' and looks up in
# getSequences().  The scan loop, accumulator, and return are partly on
# elided lines.
630 def getBootManagerSequenceFromLog(self, steps, child):
634 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
635 id = index_to_id(steps,index)
638 if id == "exception":
639 print "...Found An Exception!!!"
# "done" (or EOF) terminates the scan.
640 elif id == "done": #index == len(steps_to_list(steps)):
# Thin wrapper around restore_basic(); post-processing of `ret` (session
# teardown and the final return) is on elided lines.
646 def restore(sitehist, hostname, config=None, forced_action=None):
647 ret = restore_basic(sitehist, hostname, config, forced_action)
# Core repair driver for one node in debug state.  Phases: (1) refuse
# nodes with out-of-date boot CDs; (2) open a debug connection; (3) skip
# nodes not in debug or with BootManager already running; (4) scan dmesg
# for disk errors and notify/disable if found; (5) scan bm.log into a step
# sequence and dispatch the mapped repair action (restart BootManager,
# reboot/reinstall node, repair keys, or send an operator notice).
# Many conditional/return lines are elided in this listing.
651 def restore_basic(sitehist, hostname, config=None, forced_action=None):
653 # NOTE: Nothing works if the bootcd is REALLY old.
654 # So, this is the first step.
656 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
657 recent_actions = sitehist.getRecentActions(hostname=hostname)
659 if fbnode['observed_category'] == "OLDBOOTCD":
660 print "\t...Notify owner to update BootImage!!!"
# Rate-limit the notice: only if none was sent in the last 3.5 days.
662 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
663 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
665 print "\tDisabling %s due to out-of-date BootImage" % hostname
666 api.UpdateNode(hostname, {'boot_state' : 'disabled'})
668 # NOTE: nothing else is possible.
671 debugnode = DebugInterface(hostname)
672 conn = debugnode.getConnection()
# getConnection() returns False on failure (bool check via type compare).
673 if type(conn) == type(False): return False
675 boot_state = conn.get_boot_state()
676 if boot_state != "debug":
677 print "... %s in %s state: skipping..." % (hostname , boot_state)
678 return boot_state == "boot"
680 if conn.bootmanager_running():
681 print "...BootManager is currently running. Skipping host %s" %hostname
684 # Read persistent flags, tagged on one week intervals.
686 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
687 dmesg = conn.get_dmesg()
688 child = fdpexpect.fdspawn(dmesg)
# Disk-error scan: `s` (built on an elided line) is the set/sequence of
# matched disk-error ids.
690 steps = debugnode.getDiskSteps()
691 sequence = debugnode.getDiskSequence(steps, child)
694 if config and not config.quiet: print "\tSET: ", s
697 print "...Potential drive errors on %s" % hostname
# A lone floppy error is tolerated; anything else aborts the repair.
698 if len(s) == 2 and 'floppyerror' in s:
699 print "...Should investigate. Continuing with node."
701 print "...Should investigate. Skipping node."
702 # TODO: send message related to these errors.
704 if not found_within(recent_actions, 'baddisk_notice', 7):
705 print "baddisk_notice not found recently"
707 log=conn.get_dmesg().read()
708 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
709 conn.set_nodestate('disabled')
# bm.log scan phase.
713 print "...Downloading bm.log from %s" %hostname
714 log = conn.get_bootmanager_log()
715 child = fdpexpect.fdspawn(log)
# --collect mode: logs have been fetched; take no repair action.
717 if hasattr(config, 'collect') and config.collect: return True
719 if config and not config.quiet: print "...Scanning bm.log for errors"
723 steps = debugnode.getBootManagerStepPatterns()
724 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
# The canonical sequence identifier looked up in getSequences().
726 s = "-".join(sequence)
727 print " FOUND SEQUENCE: ", s
729 # NOTE: We get or set the flag based on the current sequence identifier.
730 # By using the sequence identifier, we guarantee that there will be no
731 # frequent loops. I'm guessing there is a better way to track loops,
734 sequences = debugnode.getSequences()
# Unknown sequence: notify operators with the full bm.log, then fall back
# to simply restarting BootManager in 'boot' mode.
737 if s not in sequences:
738 print " HOST %s" % hostname
739 print " UNKNOWN SEQUENCE: %s" % s
742 args['hostname'] = hostname
744 args['bmlog'] = conn.get_bootmanager_log().read()
745 args['viart'] = False
746 args['saveact'] = True
748 sitehist.sendMessage('unknownsequence_notice', **args)
750 conn.restart_bootmanager('boot')
752 # NOTE: Do not set the pflags value for this sequence if it's unknown.
753 # This way, we can check it again after we've fixed it.
# Known sequences: dispatch on the mapped repair action.
758 if sequences[s] == "restart_bootmanager_boot":
759 print "...Restarting BootManager.py on %s "%hostname
760 conn.restart_bootmanager('boot')
761 elif sequences[s] == "restart_bootmanager_rins":
762 print "...Restarting BootManager.py on %s "%hostname
763 conn.restart_bootmanager('reinstall')
764 elif sequences[s] == "restart_node_rins":
765 conn.restart_node('reinstall')
766 elif sequences[s] == "restart_node_boot":
767 conn.restart_node('boot')
768 elif sequences[s] == "repair_node_keys":
769 if conn.compare_and_repair_nodekeys():
770 # the keys either are in sync or were forced in sync.
771 # so try to reboot the node again.
772 # TODO: why was this originally 'reinstall' instead of 'boot'??
773 conn.restart_bootmanager('boot')
776 # there was some failure to synchronize the keys.
777 print "...Unable to repair node keys on %s" %hostname
779 elif sequences[s] == "suspect_error_email":
781 args['hostname'] = hostname
783 args['bmlog'] = conn.get_bootmanager_log().read()
784 args['viart'] = False
785 args['saveact'] = True
787 sitehist.sendMessage('unknownsequence_notice', **args)
788 conn.restart_bootmanager('boot')
790 # TODO: differentiate this and the 'nodenetwork_email' actions.
791 elif sequences[s] == "update_node_config_email":
793 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
795 args['hostname'] = hostname
796 sitehist.sendMessage('nodeconfig_notice', **args)
797 conn.dump_plconf_file()
799 elif sequences[s] == "nodenetwork_email":
801 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
803 args['hostname'] = hostname
804 args['bmlog'] = conn.get_bootmanager_log().read()
805 sitehist.sendMessage('nodeconfig_notice', **args)
806 conn.dump_plconf_file()
808 elif sequences[s] == "update_bootcd_email":
810 if not found_within(recent_actions, 'newalphacd_notice', 3.5):
# getconf generates fresh boot images for the owner to download.
812 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
813 args['hostname'] = hostname
815 sitehist.sendMessage('newalphacd_notice', **args)
817 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
819 elif sequences[s] == "broken_hardware_email":
820 # MAKE An ACTION record that this host has failed hardware. May
821 # require either an exception "/minhw" or other manual intervention.
822 # Definitely need to send out some more EMAIL.
823 # TODO: email notice of broken hardware
824 if not found_within(recent_actions, 'baddisk_notice', 7):
825 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
827 args['hostname'] = hostname
828 args['log'] = conn.get_dmesg().read()
830 sitehist.sendMessage('baddisk_notice', **args)
831 conn.set_nodestate('disabled')
833 elif sequences[s] == "update_hardware_email":
834 if not found_within(recent_actions, 'minimalhardware_notice', 7):
835 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
837 args['hostname'] = hostname
838 args['bmlog'] = conn.get_bootmanager_log().read()
839 sitehist.sendMessage('minimalhardware_notice', **args)
841 elif sequences[s] == "bad_dns_email":
842 if not found_within(recent_actions, 'baddns_notice', 1):
843 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# Look up the node's primary interface to include its network config in
# the notice; API failures here skip the email entirely (see below).
846 node = plccache.GetNodeByName(hostname)
847 net = api.GetInterfaces(node['interface_ids'])[0]
# NOTE(review): traceback.print_exc() returns None, so this prints "None".
850 print traceback.print_exc()
851 # TODO: api error. skip email, b/c all info is not available,
852 # flag_set will not be recorded.
854 nodenet_str = network_config_to_str(net)
856 args['hostname'] = hostname
857 args['network_config'] = nodenet_str
858 args['interface_id'] = net['interface_id']
860 sitehist.sendMessage('baddns_notice', **args)
865 # MAIN -------------------------------------------------------------------
868 from monitor import parser as parsermodule
869 parser = parsermodule.getParser()
871 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
872 force=None, quiet=False)
873 parser.add_option("", "--child", dest="child", action="store_true",
874 help="This is the child mode of this process.")
875 parser.add_option("", "--force", dest="force", metavar="boot_state",
876 help="Force a boot state passed to BootManager.py.")
877 parser.add_option("", "--quiet", dest="quiet", action="store_true",
878 help="Extra quiet output messages.")
879 parser.add_option("", "--verbose", dest="verbose", action="store_true",
880 help="Extra debug output messages.")
881 parser.add_option("", "--nonet", dest="nonet", action="store_true",
882 help="Do not setup the network, use existing log files to re-run a test pass.")
883 parser.add_option("", "--collect", dest="collect", action="store_true",
884 help="No action, just collect dmesg, and bm.log")
885 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
886 help="Do not perform the orginary setup phase.")
888 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
889 config = parsermodule.parse_args(parser)
# Build the node list (from --nodelist file or a single --node argument;
# the selecting conditionals are on elided lines) and run restore() on each
# node, resolving the owning site first.
892 nodes = config.getListFromFile(config.nodelist)
894 nodes = [ config.node ]
# Per-node loop body (the `for node in nodes:` line is elided):
901 lb = plccache.plcdb_hn2lb[node]
902 sitehist = SiteInterface.get_or_make(loginbase=lb)
903 #reboot(node, config)
# NOTE(review): config=None is passed here even though a parsed `config`
# exists above — restore() then falls back to module-level state; confirm
# this is intentional.
904 restore(sitehist, node, config=None, forced_action=None)
906 if __name__ == "__main__":