3 # Attempt to reboot a node in debug state.
16 from monitor.getsshkeys import SSHKnownHosts
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
# Module-level authenticated PLC XML-RPC handle, shared by every routine below
# (node key repair, boot_state updates, interface queries).
41 api = plc.getAuthAPI()
class ExceptionDoubleSSHError(Exception):
    """Raised when ssh login to a node fails twice in a row, even after the
    host key in known_hosts has been refreshed (see PlanetLabSession.setup_host)."""
# NodeConnection constructor. Body (original lines 49-52) is elided from this
# excerpt; presumably stores the Rpyc connection as self.c plus node/config on
# self — TODO confirm against the full source (methods below read self.c and
# self.node).
48 def __init__(self, connection, node, config):
# Classify the node's current state by probing the remote filesystem through
# the Rpyc connection: /tmp/source present => BootManager environment (debug),
# /vservers present => installed node. The actual return statements (original
# lines 54, 56, 58-62) are elided from this excerpt — TODO confirm the exact
# state strings; restore() compares the result against "debug"/"boot".
53 def get_boot_state(self):
55 if self.c.modules.os.path.exists('/tmp/source'):
57 elif self.c.modules.os.path.exists('/vservers'):
# NOTE(review): this print (original line 63) sits after the elided branches;
# its enclosing method is not certain from this excerpt.
63 print self.c.modules.sys.path
# Interior of get_dmesg() (the def line, original ~line 70, is elided).
# Dumps the remote kernel ring buffer to a file, copies it to the local
# log/ directory via the Rpyc download helper, and opens the local copy —
# presumably returned to the caller (return line elided); restore() calls
# .read() on the result.
71 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
72 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
73 log = open("log/dmesg.%s.log" % self.node, 'r')
# Fetch the BootManager log from the node and open a local copy for reading.
# The file is copied with a .gz name but NOT decompressed (the zcat line is
# commented out and replaced by a plain cp) — so bm.log is apparently plain
# text on the node despite the .gz suffix used locally; TODO confirm.
# Return statement elided from this excerpt; callers use the file object.
76 def get_bootmanager_log(self):
77 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
78 #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
79 os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
80 log = open("log/bm.%s.log" % self.node, 'r')
# Drive the node's own BootManager code (under /tmp/source) through its
# InitializeBootManager + ReadNodeConfiguration steps, then print the parsed
# node-configuration variables (bm.VARS). Used to debug bad plnode.txt files.
# NOTE(review): lines below use bare `c` rather than `self.c` — likely a
# leftover from a refactor; works only if a global/outer `c` exists. Confirm
# against the full source (compare_and_repair_nodekeys has the same pattern).
83 def dump_plconf_file(self):
85 self.c.modules.sys.path.append("/tmp/source/")
86 self.c.modules.os.chdir('/tmp/source')
88 log = c.modules.BootManager.log('/tmp/new.log')
89 bm = c.modules.BootManager.BootManager(log,'boot')
91 BootManagerException = c.modules.Exceptions.BootManagerException
92 InitializeBootManager = c.modules.BootManager.InitializeBootManager
93 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# try/except structure around these steps is partially elided in this excerpt.
96 InitializeBootManager.Run(bm.VARS, bm.LOG)
97 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
101 print " Possibly, unable to find valid configuration file"
104 for key in bm.VARS.keys():
105 print key, " == ", bm.VARS[key]
107 print " Unable to read Node Configuration"
# Compare the NODE_KEY the node reads from its own configuration file against
# the key PLC has on record (via the plccache lookup). If they differ, push
# the node's key to PLC with api.UpdateNode so authentication can succeed.
# Return statements are elided from this excerpt; restore() treats a truthy
# result as "keys are now in sync" and reinstalls — TODO confirm.
# NOTE(review): bare `c` instead of `self.c` below, same as dump_plconf_file.
110 def compare_and_repair_nodekeys(self):
112 self.c.modules.sys.path.append("/tmp/source/")
113 self.c.modules.os.chdir('/tmp/source')
115 log = c.modules.BootManager.log('/tmp/new.log')
116 bm = c.modules.BootManager.BootManager(log,'boot')
118 BootManagerException = c.modules.Exceptions.BootManagerException
119 InitializeBootManager = c.modules.BootManager.InitializeBootManager
120 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
123 plcnode = plccache.GetNodeByName(self.node)
125 InitializeBootManager.Run(bm.VARS, bm.LOG)
126 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
131 print " Possibly, unable to find valid configuration file"
134 print " NODE: %s" % bm.VARS['NODE_KEY']
135 print " PLC : %s" % plcnode['key']
137 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Keys differ: record the node's key at PLC (truthy UpdateNode == success).
140 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
141 print " Successfully updated NODE_KEY with PLC"
146 #for key in bm.VARS.keys():
147 # print key, " == ", bm.VARS[key]
149 print " Unable to retrieve NODE_KEY"
# True iff the BootManager lock file exists on the node (restart_bootmanager
# touches /tmp/BM_RUNNING while BootManager.py runs). The True/False return
# lines (original 153-156) are elided from this excerpt.
151 def bootmanager_running(self):
152 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Record *state* (default 'boot') as this node's boot_state at PLC.

    Returns the api.UpdateNode() result (truthy on success, per its use in
    compare_and_repair_nodekeys).
    """
    update_fields = {'boot_state': state}
    return api.UpdateNode(self.node, update_fields)
# Set the node's next boot_state at PLC, then reboot it. Two-phase strategy
# tracked with a 24h persistent flag: first attempt is "gentle" (kill all
# vserver slice processes, then `shutdown -r +1`); if the gentle attempt was
# already made within the flag window, force a reboot via the sysrq-trigger
# 's' (sync), 'u' (remount ro), 'b' (reboot) sequence. The if/else lines
# between the two phases are partially elided from this excerpt.
160 def restart_node(self, state='boot'):
161 api.UpdateNode(self.node, {'boot_state' : state})
163 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
164 if not pflags.getRecentFlag('gentlekill'):
165 print " Killing all slice processes... : %s" % self.node
166 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
167 self.c.modules.os.system(cmd_slicekill)
168 cmd = """ shutdown -r +1 & """
169 print " Restarting %s : %s" % ( self.node, cmd)
170 self.c.modules.os.system(cmd)
# Remember that the gentle path was tried, so the next call escalates.
172 pflags.setRecentFlag('gentlekill')
175 print " Restarting with sysrq 'sub' %s" % self.node
176 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
177 self.c.modules.os.system(cmd)
# Re-run BootManager.py on the node with the given forced boot state
# ('boot' or 'reinstall'), guarded by the /tmp/BM_RUNNING lock file so two
# BootManager instances never run concurrently.
181 def restart_bootmanager(self, forceState):
183 self.c.modules.os.chdir('/tmp/source')
184 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
185 print " BootManager is already running: try again soon..."
187 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
# Shell command: take the lock, run BootManager, drop the lock. The final
# concatenated fragment (original line 191, presumably closing ") &") is
# elided from this excerpt — TODO confirm.
188 cmd = "( touch /tmp/BM_RUNNING ; " + \
189 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
190 " rm -f /tmp/BM_RUNNING " + \
192 cmd = cmd % forceState
193 self.c.modules.os.system(cmd)
# Manage one ssh session to a node: rsync the Rpyc library over, start a
# forking Rpyc server on the node, and open a local port-forward tunnel to
# it so NodeConnection can drive the node's Python remotely.
198 class PlanetLabSession:
# Next local tunnel port; randomized base so concurrent runs rarely collide
# (not collision-proof — two processes can still race for the same port).
199 globalport = 22000 + int(random.random()*1000)
201 def __init__(self, node, nosetup, verbose):
202 self.verbose = verbose
# When nosetup is true, setup_host() skips the rsync/server bootstrap and
# assumes a server is already running on the node.
205 self.nosetup = nosetup
# Build a NodeConnection over the local end of the tunnel. setup_host()
# must have run first to populate self.port.
209 def get_connection(self, config):
210 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
213 # print i, conn.c.modules.sys.path
214 # print conn.c.modules.os.path.exists('/tmp/source')
# Bootstrap the node and open the tunnel. Raises ExceptionDoubleSSHError if
# ssh fails twice even after refreshing the host key.
219 def setup_host(self):
220 self.port = PlanetLabSession.globalport
221 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
224 args['port'] = self.port
225 args['user'] = 'root'
226 args['hostname'] = self.node
227 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
231 print "Skipping setup"
234 # COPY Rpyc files to host
235 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
236 if self.verbose: print cmd
240 localos = moncommands.CMD()
242 ret = localos.system(cmd, timeout)
# First failure: assume a stale known_hosts entry, refresh it and retry once.
245 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
246 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
247 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
248 ret = localos.system(cmd, timeout)
251 print "\tFAILED TWICE"
253 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
256 # KILL any already running servers.
257 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
# Heredoc run on the node: kill stale Rpyc servers, then start a fresh
# forking server in the background (listens on 18812, per the tunnel below).
258 (ov,ev) = ssh.run_noexcept2("""<<\EOF
260 echo "kill server" >> out.log
261 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
262 echo "export" >> out.log
263 export PYTHONPATH=$HOME ;
264 echo "start server" >> out.log
265 python Rpyc/Servers/forking_server.py &> server.log &
266 echo "done" >> out.log
268 #cmd = """ssh %(user)s@%(hostname)s """ + \
269 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
271 #if self.verbose: print cmd
273 #print localos.system(cmd,timeout)
275 ## START a new rpyc server.
276 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
277 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
279 #if self.verbose: print cmd
280 #print localos.system(cmd,timeout)
281 print "setup rpyc server over ssh"
285 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
286 # and the following options seems to work well.
# Local forward: localhost:<port> -> node's 18812 (the Rpyc server). The
# LocalCommand 'echo READY' is how we detect the forward is established.
287 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
288 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
289 """-o ConnectTimeout=120 """ + \
290 """-n -N -L %(port)s:localhost:18812 """ + \
291 """%(user)s@%(hostname)s"""
293 if self.verbose: print cmd
295 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
296 # TODO: the read() here may block indefinitely. Need a better
297 # approach therefore, that includes a timeout.
298 #ret = self.command.stdout.read(5)
299 ret = moncommands.read_t(self.command.stdout, 5)
303 # NOTE: There is still a slight race for machines that are slow...
# Heuristic settle time: twice the observed setup duration (t1/t2 timing
# lines are elided from this excerpt).
304 self.timeout = 2*(t2-t1)
305 print "Sleeping for %s sec" % self.timeout
306 time.sleep(self.timeout)
309 if self.command.returncode is not None:
310 print "Failed to establish tunnel!"
311 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
313 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Teardown (presumably __del__; the def line is elided from this excerpt).
317 if self.verbose: print "Killing SSH session %s" % self.port
318 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Project one column out of a list of (id, pattern) step tuples.

    By default (index=1) returns the regex patterns, which callers pass to
    pexpect: ``child.expect(steps_to_list(steps) + [pexpect.EOF])``.  Uses a
    list comprehension instead of map() so the result is a real list that can
    be concatenated with ``+`` under Python 3 as well as Python 2 (Py2's
    map() returned a list; Py3's returns an iterator and the ``+`` breaks).
    """
    return [step[index] for step in steps]
# Map a pexpect match index back to the step identifier (column 0 of the
# step tuple). The fall-through branch (original lines 328-330) is elided
# from this excerpt; since callers expect() with steps + [pexpect.EOF] and
# later compare the result to "done" (see getBootManagerSequenceFromLog),
# the elided branch presumably returns "done" for index == len(steps) —
# TODO confirm against the full source.
325 def index_to_id(steps,index):
326 if index < len(steps):
327 return steps[index][0]
# High-level driver for diagnosing a node stuck in debug state: opens a
# PlanetLabSession/NodeConnection, and supplies the pattern tables used to
# classify dmesg and BootManager logs.
331 class DebugInterface:
332 def __init__(self, hostname):
333 self.hostname = hostname
# (self.session is assigned later in getConnection; other init lines are
# elided from this excerpt.)
# Establish a session to the node and return a NodeConnection (the several
# try/except frames are partially elided from this excerpt; the visible
# error paths print a traceback, email the exception, and apparently return
# False — restore() checks `type(conn) == type(False)`).
336 def getConnection(self):
337 print "Creating session for %s" % self.hostname
338 # update known_hosts file (in case the node has rebooted since last run)
340 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
343 print traceback.print_exc()
# Two construction paths: a hard-coded (nosetup=False, verbose=True) variant
# and one honoring the global config — the surrounding condition is elided.
348 self.session = PlanetLabSession(self.hostname, False, True)
350 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
351 except ExceptionDoubleSSHError, e:
352 msg = "ERROR setting up session for %s" % self.hostname
356 traceback.print_exc()
361 conn = self.session.get_connection(config)
363 # NOTE: sometimes the wait in setup_host() is not long enough.
364 # So, here we try to wait a little longer before giving up entirely.
366 time.sleep(self.session.timeout*5)
367 conn = self.session.get_connection(config)
369 # failed twice... no need to report this really, it's just in a
373 traceback.print_exc()
374 email_exception(self.hostname)
376 #print "trying to use conn before returning it."
377 #print conn.c.modules.sys.path
378 #print conn.c.modules.os.path.exists('/tmp/source')
381 #print "conn: %s" % conn
# Build the table mapping a BootManager step-sequence signature (the ids
# from getBootManagerStepPatterns joined with '-') to the repair action
# restore() should take. The `sequences = {}` initializer and the returns
# are elided from this excerpt.
384 def getSequences(self):
386 # TODO: This can be replaced with a DB definition at a future time.
387 # This would make it possible for an admin to introduce new
388 # patterns without touching code.
391 # restart_bootmanager_boot
392 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
393 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
394 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
396 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
398 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
399 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
400 "bminit-cfg-auth-getplc-update-debug-done",
401 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
402 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
403 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
404 "bminit-cfg-auth-protoerror-exception-update-debug-done",
405 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
406 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
408 sequences.update({n : "restart_bootmanager_boot"})
# Sequences that warrant a full reinstall via BootManager.
410 # conn.restart_bootmanager('reinstall')
411 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
412 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
413 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
414 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
415 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
416 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
417 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
418 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
419 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
420 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
421 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
422 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
423 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
424 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
425 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
426 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
427 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
428 # actual solution appears to involve removing the bad files, and
429 # continually trying to boot the node.
430 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
431 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
432 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
434 sequences.update({n : "restart_bootmanager_rins"})
# Authentication failure signature => attempt node-key repair.
437 sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
439 # conn.restart_node('reinstall')
440 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
441 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
442 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
443 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
444 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
445 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
446 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
447 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
448 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
449 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
450 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
451 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
453 sequences.update({n : "restart_node_rins"})
# Sequences handled by a plain reboot into 'boot'.
456 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
457 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
458 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
459 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
460 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
461 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
463 sequences.update({n: "restart_node_boot"})
465 # update_node_config_email
466 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
467 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
468 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
470 sequences.update({n : "update_node_config_email"})
472 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
473 "bminit-cfg-update-exception-nodehostname-update-debug-done",
475 sequences.update({n : "nodenetwork_email"})
477 # update_bootcd_email
478 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
479 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
480 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
481 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
482 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
484 sequences.update({n : "update_bootcd_email"})
486 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
488 sequences.update({n: "suspect_error_email"})
490 # update_hardware_email
491 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
492 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
494 # broken_hardware_email
495 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# bad_dns_email (the `for n in [` opener is elided from this excerpt).
499 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
500 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
502 sequences.update( { n : "bad_dns_email"})
# Return the (id, regex) pairs used to scan a node's dmesg output for disk
# failure signatures (SCSI/IDE/cciss/ATA/ext3/floppy errors). The `steps = [`
# opener and the return are elided from this excerpt; sample matching log
# lines are preserved in the trailing comments.
506 def getDiskSteps(self):
508 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
509 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
510 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
512 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
514 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
515 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
517 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
518 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
520 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
521 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
523 ('floppytimeout','floppy0: floppy timeout called'),
524 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
526 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
527 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
529 # floppy0: floppy timeout called
530 # end_request: I/O error, dev fd0, sector 0
532 # Buffer I/O error on device dm-2, logical block 8888896
533 # ata1: status=0x51 { DriveReady SeekComplete Error }
534 # ata1: error=0x40 { UncorrectableError }
535 # SCSI error : <0 0 0 0> return code = 0x8000002
536 # sda: Current: sense key: Medium Error
537 # Additional sense: Unrecovered read error - auto reallocate failed
539 # SCSI error : <0 2 0 0> return code = 0x40001
540 # end_request: I/O error, dev sda, sector 572489600
# Repeatedly expect() the disk-error patterns against the spawned dmesg
# reader until EOF, collecting the matched step ids. The surrounding loop,
# accumulator, and return are elided from this excerpt; restore() treats
# the result as a collection supporting len() and `in`.
544 def getDiskSequence(self, steps, child):
547 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
# Return the ordered (id, literal-message) pairs that identify each
# BootManager step or failure mode in bm.log. The ids become the tokens of
# the '-'-joined sequence signature matched against getSequences(). The
# `steps = [` opener and the return are elided from this excerpt.
554 def getBootManagerStepPatterns(self):
556 ('bminit' , 'Initializing the BootManager.'),
557 ('cfg' , 'Reading node configuration file.'),
558 ('auth' , 'Authenticating node with PLC.'),
559 ('getplc' , 'Retrieving details of node from PLC.'),
560 ('update' , 'Updating node boot state at PLC.'),
561 ('hardware' , 'Checking if hardware requirements met.'),
562 ('installinit' , 'Install: Initializing.'),
563 ('installdisk' , 'Install: partitioning disks.'),
564 ('installbootfs', 'Install: bootstrapfs tarball.'),
565 ('installcfg' , 'Install: Writing configuration files.'),
566 ('installstop' , 'Install: Shutting down installer.'),
# NOTE(review): 'update2' and 'installinit2' share message text with
# 'update' and 'installinit' above; with expect() the earlier entry always
# wins, so these two ids appear unreachable — confirm intent.
567 ('update2' , 'Updating node boot state at PLC.'),
568 ('installinit2' , 'Install: Initializing.'),
569 ('validate' , 'Validating node installation.'),
570 ('rebuildinitrd', 'Rebuilding initrd'),
571 ('netcfg' , 'Install: Writing Network Configuration files.'),
572 ('update3' , 'Updating node configuration.'),
573 ('disk' , 'Checking for unused disks to add to LVM.'),
574 ('update4' , 'Sending hardware configuration to PLC.'),
575 ('debug' , 'Starting debug mode'),
576 ('bmexceptmount', 'BootManagerException during mount'),
577 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
578 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
579 ('exception' , 'Exception'),
580 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
581 ('protoerror' , 'XML RPC protocol error'),
582 ('nodehostname' , 'Configured node hostname does not resolve'),
583 ('implementerror', 'Implementation Error'),
584 ('readonlyfs' , '[Errno 30] Read-only file system'),
585 ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
586 ('noinstall' , 'notinstalled'),
587 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
588 ('noblockdev' , "No block devices detected."),
589 ('dnserror' , 'Name or service not known'),
590 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
591 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
592 ('hardwarerequirefail' , 'Hardware requirements not met'),
593 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
594 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
595 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
596 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
597 ('modulefail' , 'Unable to get list of system modules'),
598 ('writeerror' , 'write error: No space left on device'),
599 ('nospace' , "No space left on device"),
600 ('nonode' , 'Failed to authenticate call: No such node'),
601 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
602 ('bootcheckfail' , 'BootCheckAuthentication'),
603 ('bootupdatefail' , 'BootUpdateNode'),
# Walk bm.log with pexpect, translating each matched step pattern into its
# id and accumulating the ordered sequence (loop/accumulator/return lines
# are elided from this excerpt). restore() joins the result with '-' to
# form the signature looked up in getSequences(). "done" (the EOF sentinel
# from index_to_id) presumably terminates the loop.
607 def getBootManagerSequenceFromLog(self, steps, child):
611 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
612 id = index_to_id(steps,index)
615 if id == "exception":
616 print "...Found An Exception!!!"
617 elif id == "done": #index == len(steps_to_list(steps)):
# Top-level repair routine for one node in debug state:
#   1. bail out (notify + disable) if the node's BootCD is too old;
#   2. open a NodeConnection; skip nodes not in 'debug' or already running
#      BootManager;
#   3. scan dmesg for disk errors (notify owner / disable on bad disks);
#   4. scan bm.log, reduce it to a step-sequence signature, and dispatch the
#      repair action mapped by DebugInterface.getSequences().
# Many control-flow lines (try/except frames, `args = {}` initializers,
# found_within guards, returns) are elided from this excerpt.
624 def restore(sitehist, hostname, config=None, forced_action=None):
626 # NOTE: Nothing works if the bootcd is REALLY old.
627 # So, this is the first step.
629 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
630 recent_actions = sitehist.getRecentActions(hostname=hostname)
632 if fbnode['observed_category'] == "OLDBOOTCD":
633 print "\t...Notify owner to update BootImage!!!"
# Rate-limited to once per 3 (days, presumably — found_within units not
# visible in this excerpt; TODO confirm).
635 if not found_within(recent_actions, 'newbootcd_notice', 3):
636 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
638 print "\tDisabling %s due to out-of-date BootImage" % hostname
639 api.UpdateNode(hostname, {'boot_state' : 'disable'})
641 # NOTE: nothing else is possible.
644 debugnode = DebugInterface(hostname)
645 conn = debugnode.getConnection()
646 #print "conn: %s" % conn
647 #print "trying to use conn after returning it."
648 #print conn.c.modules.sys.path
649 #print conn.c.modules.os.path.exists('/tmp/source')
# getConnection() returns False (not a NodeConnection) on failure.
650 if type(conn) == type(False): return False
652 #if forced_action == "reboot":
653 # conn.restart_node('reinstall')
656 boot_state = conn.get_boot_state()
657 if boot_state != "debug":
658 print "... %s in %s state: skipping..." % (hostname , boot_state)
659 return boot_state == "boot"
661 if conn.bootmanager_running():
662 print "...BootManager is currently running. Skipping host %s" %hostname
665 # Read persistent flags, tagged on one week intervals.
667 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
668 dmesg = conn.get_dmesg()
669 child = fdpexpect.fdspawn(dmesg)
671 steps = debugnode.getDiskSteps()
672 sequence = debugnode.getDiskSequence(steps, child)
# (`s` here is presumably derived from `sequence`; the assignment is elided.)
675 if config and not config.quiet: print "\tSET: ", s
678 print "...Potential drive errors on %s" % hostname
# A lone floppy error (plus one other entry) is tolerated; anything more
# is treated as real disk trouble.
679 if len(s) == 2 and 'floppyerror' in s:
680 print "...Should investigate. Continuing with node."
682 print "...Should investigate. Skipping node."
683 # TODO: send message related to these errors.
685 if not found_within(recent_actions, 'newbootcd_notice', 3):
687 log=conn.get_dmesg().read()
688 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
689 conn.set_nodestate('disable')
693 print "...Downloading bm.log from %s" %hostname
694 log = conn.get_bootmanager_log()
695 child = fdpexpect.fdspawn(log)
# --collect mode: stop after gathering dmesg and bm.log.
697 if hasattr(config, 'collect') and config.collect: return True
699 if config and not config.quiet: print "...Scanning bm.log for errors"
703 steps = debugnode.getBootManagerStepPatterns()
704 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
706 s = "-".join(sequence)
707 print " FOUND SEQUENCE: ", s
709 # NOTE: We get or set the flag based on the current sequence identifier.
710 # By using the sequence identifier, we guarantee that there will be no
711 # frequent loops. I'm guessing there is a better way to track loops,
714 sequences = debugnode.getSequences()
# Unknown signature: notify admins with the full bm.log, then try a plain
# BootManager restart.
717 if s not in sequences:
718 print " HOST %s" % hostname
719 print " UNKNOWN SEQUENCE: %s" % s
722 args['hostname'] = hostname
724 args['bmlog'] = conn.get_bootmanager_log().read()
725 args['viart'] = False
727 sitehist.sendMessage('unknownsequence_notice', **args)
729 conn.restart_bootmanager('boot')
731 # NOTE: Do not set the pflags value for this sequence if it's unknown.
732 # This way, we can check it again after we've fixed it.
# Known signature: dispatch the mapped repair action.
737 if sequences[s] == "restart_bootmanager_boot":
738 print "...Restarting BootManager.py on %s "%hostname
739 conn.restart_bootmanager('boot')
740 elif sequences[s] == "restart_bootmanager_rins":
741 print "...Restarting BootManager.py on %s "%hostname
742 conn.restart_bootmanager('reinstall')
743 elif sequences[s] == "restart_node_rins":
744 conn.restart_node('reinstall')
745 elif sequences[s] == "restart_node_boot":
746 conn.restart_node('boot')
747 elif sequences[s] == "repair_node_keys":
748 if conn.compare_and_repair_nodekeys():
749 # the keys either are in sync or were forced in sync.
750 # so try to reboot the node again.
751 conn.restart_bootmanager('reinstall')
754 # there was some failure to synchronize the keys.
755 print "...Unable to repair node keys on %s" %hostname
757 elif sequences[s] == "suspect_error_email":
759 args['hostname'] = hostname
761 args['bmlog'] = conn.get_bootmanager_log().read()
762 args['viart'] = False
764 sitehist.sendMessage('unknownsequence_notice', **args)
765 conn.restart_bootmanager('boot')
767 # TODO: differentiate this and the 'nodenetwork_email' actions.
768 elif sequences[s] == "update_node_config_email":
770 if not found_within(recent_actions, 'nodeconfig_notice', 3):
772 args['hostname'] = hostname
773 sitehist.sendMessage('nodeconfig_notice', **args)
774 conn.dump_plconf_file()
776 elif sequences[s] == "nodenetwork_email":
778 if not found_within(recent_actions, 'nodeconfig_notice', 3):
780 args['hostname'] = hostname
781 args['bmlog'] = conn.get_bootmanager_log().read()
782 sitehist.sendMessage('nodeconfig_notice', **args)
783 conn.dump_plconf_file()
785 elif sequences[s] == "update_bootcd_email":
787 if not found_within(recent_actions, 'newalphacd_notice', 3):
789 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
790 args['hostname'] = hostname
792 sitehist.sendMessage('newalphacd_notice', **args)
794 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
796 elif sequences[s] == "broken_hardware_email":
797 # MAKE An ACTION record that this host has failed hardware. May
798 # require either an exception "/minhw" or other manual intervention.
799 # Definitely need to send out some more EMAIL.
800 # TODO: email notice of broken hardware
801 if not found_within(recent_actions, 'baddisk_notice', 1):
802 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
804 args['hostname'] = hostname
805 args['log'] = conn.get_dmesg().read()
807 sitehist.sendMessage('baddisk_notice', **args)
808 conn.set_nodestate('disable')
810 elif sequences[s] == "update_hardware_email":
811 if not found_within(recent_actions, 'minimalhardware_notice', 1):
812 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
814 args['hostname'] = hostname
815 args['bmlog'] = conn.get_bootmanager_log().read()
816 sitehist.sendMessage('minimalhardware_notice', **args)
818 elif sequences[s] == "bad_dns_email":
819 if not found_within(recent_actions, 'baddns_notice', 1):
820 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
823 node = plccache.GetNodeByName(hostname)
824 net = api.GetInterfaces(node['interface_ids'])[0]
827 print traceback.print_exc()
828 # TODO: api error. skip email, b/c all info is not available,
829 # flag_set will not be recorded.
831 nodenet_str = network_config_to_str(net)
833 args['hostname'] = hostname
834 args['network_config'] = nodenet_str
835 args['interface_id'] = net['interface_id']
837 sitehist.sendMessage('baddns_notice', **args)
842 # MAIN -------------------------------------------------------------------
# Command-line entry point: build the option parser, resolve the node list,
# and run restore() against each node's site history record.
845 from monitor import parser as parsermodule
846 parser = parsermodule.getParser()
848 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
849 force=None, quiet=False)
850 parser.add_option("", "--child", dest="child", action="store_true",
851 help="This is the child mode of this process.")
852 parser.add_option("", "--force", dest="force", metavar="boot_state",
853 help="Force a boot state passed to BootManager.py.")
854 parser.add_option("", "--quiet", dest="quiet", action="store_true",
855 help="Extra quiet output messages.")
856 parser.add_option("", "--verbose", dest="verbose", action="store_true",
857 help="Extra debug output messages.")
858 parser.add_option("", "--nonet", dest="nonet", action="store_true",
859 help="Do not setup the network, use existing log files to re-run a test pass.")
860 parser.add_option("", "--collect", dest="collect", action="store_true",
861 help="No action, just collect dmesg, and bm.log")
# NOTE(review): typo "orginary" in the help string below (user-visible text;
# left untouched here since this edit changes comments only).
862 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
863 help="Do not perform the orginary setup phase.")
865 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
866 config = parsermodule.parse_args(parser)
# Node list comes from --nodelist file or a single --node argument (the
# surrounding if/else is elided from this excerpt).
869 nodes = config.getListFromFile(config.nodelist)
871 nodes = [ config.node ]
878 lb = plccache.plcdb_hn2lb[node]
879 sitehist = SiteInterface.get_or_make(loginbase=lb)
880 #reboot(node, config)
# NOTE(review): config=None discards the parsed options (quiet/collect/force
# are never seen by restore()) — looks unintentional; confirm.
881 restore(sitehist, node, config=None, forced_action=None)
883 if __name__ == "__main__":