3 # Attempt to reboot a node in debug state.
16 from getsshkeys import SSHKnownHosts
18 from Rpyc import SocketConnection, Async
19 from Rpyc.Utils import *
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.wrapper import plc
28 from monitor.wrapper.emailTxt import mailtxt
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
36 from nodeconfig import network_config_to_str
# Module-level authenticated PLC API handle; used throughout this file for
# GetNodes/UpdateNode/GetNodeNetworks calls.
39 api = plc.getAuthAPI()
# NOTE(review): method of a node-connection class whose `class` line is not
# visible in this fragment (referred to elsewhere as NodeConnection).  Stores
# the Rpyc socket connection, the node's hostname, and the config object --
# body lines are elided from this view; presumably assigns self.c/self.node.
44 def __init__(self, connection, node, config):
# Classify the remote node's state by probing its filesystem over Rpyc:
# /tmp/source present  -> bootmanager environment (debug boot);
# /vservers present    -> installed/booted node.
# The return statements for each branch are elided from this fragment --
# TODO confirm the exact state strings against the full source.
49 def get_boot_state(self):
51 if self.c.modules.os.path.exists('/tmp/source'):
53 elif self.c.modules.os.path.exists('/vservers'):
# Falls through to a diagnostic dump of the remote sys.path when neither
# marker path exists.
59 print self.c.modules.sys.path
# Body fragment -- the enclosing `def` line is elided; presumably get_dmesg()
# (callers invoke conn.get_dmesg().read(), see restore()).  Dumps the remote
# kernel ring buffer to a file, rsync/downloads it locally, and opens it; the
# `return log` line is elided from this view.
66 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
67 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
68 log = open("log/dmesg.%s.log" % self.node, 'r')
# Fetch the remote BootManager log: download the gzipped /tmp/bm.log,
# decompress it locally via zcat, and open the plain-text copy.  The trailing
# `return log` is elided from this fragment.  NOTE(review): shells out with
# os.system + string interpolation of the hostname -- fine for trusted node
# names, but worth confirming hostnames are sanitized upstream.
71 def get_bootmanager_log(self):
72 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
73 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
74 log = open("log/bm.%s.log" % self.node, 'r')
# Run the node-side BootManager's InitializeBootManager/ReadNodeConfiguration
# steps over Rpyc and print the resulting bm.VARS configuration dictionary.
# NOTE(review): lines below mix `self.c` and bare `c` -- unless `c` is bound
# in an elided line, the bare `c` references look like a bug; confirm against
# the full source before relying on this method.
77 def dump_plconf_file(self):
79 self.c.modules.sys.path.append("/tmp/source/")
80 self.c.modules.os.chdir('/tmp/source')
82 log = c.modules.BootManager.log('/tmp/new.log')
83 bm = c.modules.BootManager.BootManager(log,'boot')
85 BootManagerException = c.modules.Exceptions.BootManagerException
86 InitializeBootManager = c.modules.BootManager.InitializeBootManager
87 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
90 InitializeBootManager.Run(bm.VARS, bm.LOG)
# The except clause for this try is elided (presumably BootManagerException).
91 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
95 print " Possibly, unable to find valid configuration file"
# Success path: dump every VARS key/value for operator inspection.
98 for key in bm.VARS.keys():
99 print key, " == ", bm.VARS[key]
101 print " Unable to read Node Configuration"
# Compare the NODE_KEY stored on the node (read via the node-side BootManager)
# against the key PLC has on record; on mismatch, push the node's key up to
# PLC with api.UpdateNode.  Return statements are elided from this fragment.
# NOTE(review): same `self.c` vs bare `c` inconsistency as dump_plconf_file --
# confirm `c` is bound in an elided line.
104 def compare_and_repair_nodekeys(self):
106 self.c.modules.sys.path.append("/tmp/source/")
107 self.c.modules.os.chdir('/tmp/source')
109 log = c.modules.BootManager.log('/tmp/new.log')
110 bm = c.modules.BootManager.BootManager(log,'boot')
112 BootManagerException = c.modules.Exceptions.BootManagerException
113 InitializeBootManager = c.modules.BootManager.InitializeBootManager
114 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's view of this node (first match on hostname).
117 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
119 InitializeBootManager.Run(bm.VARS, bm.LOG)
120 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
125 print " Possibly, unable to find valid configuration file"
128 print " NODE: %s" % bm.VARS['NODE_KEY']
129 print " PLC : %s" % plcnode['key']
# Keys already in sync -> nothing to repair (return elided).
131 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Mismatch: make PLC's record match the key on the node.
134 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
135 print " Successfully updated NODE_KEY with PLC"
140 #for key in bm.VARS.keys():
141 # print key, " == ", bm.VARS[key]
143 print " Unable to retrieve NODE_KEY"
# True iff the BootManager lock/marker file exists on the node; the explicit
# return lines are elided from this fragment.  The marker is created/removed
# by the shell command in restart_bootmanager() below.
145 def bootmanager_running(self):
146 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Record *state* (default 'boot') as this node's boot_state at PLC.

    Returns the result of api.UpdateNode (truthy on success).
    """
    fields = {'boot_state': state}
    return api.UpdateNode(self.node, fields)
# Two-phase node restart.  First attempt within a 24h window ("gentlekill"
# persist flag not yet set): SIGKILL all slice (vserver) processes and
# schedule a clean `shutdown -r +1`.  If called again inside the window, fall
# through to a hard reboot via sysrq s/u/b (sync, remount-ro, reboot).
# Several lines (the else/flag-setting control flow) are elided here.
154 def restart_node(self, state='boot'):
# Make sure the node comes back in the requested boot_state.
155 api.UpdateNode(self.node, {'boot_state' : state})
# Flags persist for 1 day (1*60*60*24 seconds).
157 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
158 if not pflags.getRecentFlag('gentlekill'):
159 print " Killing all slice processes... : %s" % self.node
160 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
161 self.c.modules.os.system(cmd_slicekill)
162 cmd = """ shutdown -r +1 & """
163 print " Restarting %s : %s" % ( self.node, cmd)
164 self.c.modules.os.system(cmd)
# Remember that the gentle path was already tried.
166 pflags.setRecentFlag('gentlekill')
# Hard path: trigger an immediate kernel-level reboot.
169 print " Restarting with sysrq 'sub' %s" % self.node
170 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
171 self.c.modules.os.system(cmd)
# Re-run BootManager.py on the node with the given forced state ('boot' or
# 'rins').  Uses /tmp/BM_RUNNING as a crude lock so concurrent runs are
# skipped.  The line closing the parenthesized shell compound (original 185)
# is elided from this fragment.
175 def restart_bootmanager(self, forceState):
177 self.c.modules.os.chdir('/tmp/source')
178 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
179 print " BootManager is already running: try again soon..."
181 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
# touch lock; run BootManager detached from our tty; drop lock when done.
182 cmd = "( touch /tmp/BM_RUNNING ; " + \
183 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
184 " rm -f /tmp/BM_RUNNING " + \
186 cmd = cmd % forceState
187 self.c.modules.os.system(cmd)
# Manages an ssh tunnel + Rpyc server on a node so NodeConnection can drive
# the remote Python interpreter.  Many interior lines are elided from this
# fragment (timing code around t1/t2, returns, the kill method's def line).
192 class PlanetLabSession:
# Randomize the local forward port (22000-22999) so concurrent monitor runs
# are unlikely to collide; incremented per session in setup_host().
193 globalport = 22000 + int(random.random()*1000)
195 def __init__(self, node, nosetup, verbose):
196 self.verbose = verbose
199 self.nosetup = nosetup
# Build a NodeConnection over the locally-forwarded port.
203 def get_connection(self, config):
204 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
207 # print i, conn.c.modules.sys.path
208 # print conn.c.modules.os.path.exists('/tmp/source')
# Push the Rpyc sources to the node, (re)start its forking server, and open
# an ssh -L tunnel to it.  Raises on unrecoverable ssh failures.
213 def setup_host(self):
214 self.port = PlanetLabSession.globalport
215 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
218 args['port'] = self.port
219 args['user'] = 'root'
220 args['hostname'] = self.node
221 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
# nosetup short-circuit (surrounding if elided).
225 print "Skipping setup"
228 # COPY Rpyc files to host
229 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
230 if self.verbose: print cmd
234 localos = moncommands.CMD()
236 ret = localos.system(cmd, timeout)
# Host key changed (node reinstalled/rebooted): refresh known_hosts once and
# retry; two failures is fatal.
239 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
240 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
241 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
242 ret = localos.system(cmd, timeout)
245 print "\tFAILED TWICE"
247 raise Exception("Failed twice trying to login with updated ssh host key")
250 # KILL any already running servers.
251 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
# Heredoc: kill stale Rpyc servers, then start a fresh forking_server with
# PYTHONPATH pointed at $HOME (where Rpyc was rsynced).
252 (ov,ev) = ssh.run_noexcept2("""<<\EOF
254 echo "kill server" >> out.log
255 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
256 echo "export" >> out.log
257 export PYTHONPATH=$HOME ;
258 echo "start server" >> out.log
259 python Rpyc/Servers/forking_server.py &> server.log &
260 echo "done" >> out.log
262 #cmd = """ssh %(user)s@%(hostname)s """ + \
263 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
265 #if self.verbose: print cmd
267 #print localos.system(cmd,timeout)
269 ## START a new rpyc server.
270 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
271 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
273 #if self.verbose: print cmd
274 #print localos.system(cmd,timeout)
275 print "setup rpyc server over ssh"
279 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
280 # and the following options seems to work well.
# Forward local self.port to the node's Rpyc server on 18812; LocalCommand
# prints READY on our side once the connection is up (handshake read below).
281 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
282 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
283 """-o ConnectTimeout=120 """ + \
284 """-n -N -L %(port)s:localhost:18812 """ + \
285 """%(user)s@%(hostname)s"""
287 if self.verbose: print cmd
289 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
290 # TODO: the read() here may block indefinitely. Need a better
291 # approach therefore, that includes a timeout.
292 #ret = self.command.stdout.read(5)
# Wait (bounded) for the 5-byte "READY" handshake from LocalCommand.
293 ret = moncommands.read_t(self.command.stdout, 5)
297 # NOTE: There is still a slight race for machines that are slow...
# Heuristic settle time: twice the observed setup duration (t1/t2 are set in
# elided lines).
298 self.timeout = 2*(t2-t1)
299 print "Sleeping for %s sec" % self.timeout
300 time.sleep(self.timeout)
# Tunnel process exited -> the forward failed outright.
303 if self.command.returncode is not None:
304 print "Failed to establish tunnel!"
305 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
307 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Teardown fragment -- the enclosing def (presumably kill_session) is elided.
311 if self.verbose: print "Killing SSH session %s" % self.port
312 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Project one column out of a list of step tuples.

    *steps* is a sequence of (id, pattern) tuples; *index* selects which
    element to extract (default 1, the pattern).  Always returns a real
    list, because callers concatenate the result, e.g.
    ``child.expect(steps_to_list(steps) + [pexpect.EOF])``.
    """
    # List comprehension instead of map(lambda ...): identical result under
    # Python 2, and still a concatenable list (not a lazy iterator) if this
    # file is ever run under Python 3.
    return [step[index] for step in steps]
# Map a pexpect match index back to the step identifier (element 0 of the
# step tuple).  The out-of-range branch (index == len(steps), i.e. the
# appended pexpect.EOF matched) is elided from this fragment -- presumably it
# returns a sentinel such as "done"; TODO confirm against the full source.
319 def index_to_id(steps,index):
320 if index < len(steps):
321 return steps[index][0]
# High-level driver for diagnosing a node stuck in debug state: owns the
# PlanetLabSession, the known sequence->action table, and the dmesg/bm.log
# pattern sets used by restore() below.
325 class DebugInterface:
326 def __init__(self, hostname):
327 self.hostname = hostname
# Establish a NodeConnection to self.hostname, refreshing the ssh known_hosts
# entry first and retrying once with a longer wait if the tunnel is slow to
# come up.  Returns the connection, or (per the type check in restore())
# False on failure -- the explicit return lines are elided from this view.
330 def getConnection(self):
331 print "Creating session for %s" % self.hostname
332 # update known_hosts file (in case the node has rebooted since last run)
334 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
337 print traceback.print_exc()
# NOTE(review): two alternate constructions survive here -- hard-coded
# (False, True) vs config-driven; the surrounding if (probably on
# config.nosetup/verbose availability) is elided.
342 self.session = PlanetLabSession(self.hostname, False, True)
344 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
346 msg = "ERROR setting up session for %s" % self.hostname
348 traceback.print_exc()
353 conn = self.session.get_connection(config)
355 # NOTE: sometimes the wait in setup_host() is not long enough.
356 # So, here we try to wait a little longer before giving up entirely.
358 time.sleep(self.session.timeout*5)
359 conn = self.session.get_connection(config)
361 traceback.print_exc()
362 email_exception(self.hostname)
364 #print "trying to use conn before returning it."
365 #print conn.c.modules.sys.path
366 #print conn.c.modules.os.path.exists('/tmp/source')
369 #print "conn: %s" % conn
# Build the table mapping known BootManager step-id sequences (joined with
# "-", see restore()) to recovery actions: restart_bootmanager_boot/rins,
# repair_node_keys, restart_node_rins/boot, and the various *_email
# notifications.  The `sequences = {}` initialization and the final return
# are elided from this fragment.
372 def getSequences(self):
374 # TODO: This can be replaced with a DB definition at a future time.
375 # This would make it possible for an admin to introduce new
376 # patterns without touching code.
379 # restart_bootmanager_boot
380 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
381 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
382 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
384 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
386 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
387 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
388 "bminit-cfg-auth-getplc-update-debug-done",
389 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
390 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
391 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
392 "bminit-cfg-auth-protoerror-exception-update-debug-done",
393 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
394 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
396 sequences.update({n : "restart_bootmanager_boot"})
398 # conn.restart_bootmanager('rins')
399 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
400 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
401 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
402 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
403 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
404 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
405 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
406 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
407 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
408 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
409 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
410 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
411 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
412 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
413 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
414 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
415 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
416 # actual solution appears to involve removing the bad files, and
417 # continually trying to boot the node.
418 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
419 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
420 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
422 sequences.update({n : "restart_bootmanager_rins"})
# Auth failures -> try key repair (see compare_and_repair_nodekeys).
425 sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
427 # conn.restart_node('rins')
428 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
429 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
430 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
431 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
432 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
433 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
434 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
435 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
436 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
437 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
438 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
439 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
441 sequences.update({n : "restart_node_rins"})
444 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
445 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
446 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
447 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
448 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
449 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
451 sequences.update({n: "restart_node_boot"})
453 # update_node_config_email
454 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
455 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
456 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
458 sequences.update({n : "update_node_config_email"})
460 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
461 "bminit-cfg-update-exception-nodehostname-update-debug-done",
463 sequences.update({n : "nodenetwork_email"})
465 # update_bootcd_email
466 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
467 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
468 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
469 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
470 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
472 sequences.update({n : "update_bootcd_email"})
474 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
476 sequences.update({n: "suspect_error_email"})
478 # update_hardware_email
479 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
480 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
482 # broken_hardware_email
483 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# bad_dns_email (the opening `for n in [` line is elided here).
487 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
488 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
490 sequences.update( { n : "bad_dns_email"})
# (id, regex) patterns matched against dmesg output to detect failing disks;
# the list literal's open/close and the return are elided from this fragment.
# The trailing comments are sample log lines the regexes were written from.
494 def getDiskSteps(self):
496 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
497 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
498 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
500 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
502 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
503 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
505 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
506 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
508 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
509 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
511 ('floppytimeout','floppy0: floppy timeout called'),
512 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
514 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
515 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
517 # floppy0: floppy timeout called
518 # end_request: I/O error, dev fd0, sector 0
520 # Buffer I/O error on device dm-2, logical block 8888896
521 # ata1: status=0x51 { DriveReady SeekComplete Error }
522 # ata1: error=0x40 { UncorrectableError }
523 # SCSI error : <0 0 0 0> return code = 0x8000002
524 # sda: Current: sense key: Medium Error
525 # Additional sense: Unrecovered read error - auto reallocate failed
527 # SCSI error : <0 2 0 0> return code = 0x40001
528 # end_request: I/O error, dev sda, sector 572489600
# Scan a pexpect-spawned dmesg stream against the disk-error patterns,
# collecting matched step ids until EOF.  The surrounding loop/accumulator
# and return are elided from this fragment; restore() treats the result as a
# list of ids (it builds set(sequence)).
532 def getDiskSequence(self, steps, child):
535 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
# (id, pattern) pairs matched in order against bm.log to reconstruct which
# BootManager steps/failures occurred; the ids are what getSequences() joins
# into sequence strings.  List open/close and return are elided here.
# NOTE(review): 'update2'/'installinit2' share patterns with earlier entries;
# pexpect matching order presumably disambiguates -- confirm before reordering.
542 def getBootManagerStepPatterns(self):
544 ('bminit' , 'Initializing the BootManager.'),
545 ('cfg' , 'Reading node configuration file.'),
546 ('auth' , 'Authenticating node with PLC.'),
547 ('getplc' , 'Retrieving details of node from PLC.'),
548 ('update' , 'Updating node boot state at PLC.'),
549 ('hardware' , 'Checking if hardware requirements met.'),
550 ('installinit' , 'Install: Initializing.'),
551 ('installdisk' , 'Install: partitioning disks.'),
552 ('installbootfs', 'Install: bootstrapfs tarball.'),
553 ('installcfg' , 'Install: Writing configuration files.'),
554 ('installstop' , 'Install: Shutting down installer.'),
555 ('update2' , 'Updating node boot state at PLC.'),
556 ('installinit2' , 'Install: Initializing.'),
557 ('validate' , 'Validating node installation.'),
558 ('rebuildinitrd', 'Rebuilding initrd'),
559 ('netcfg' , 'Install: Writing Network Configuration files.'),
560 ('update3' , 'Updating node configuration.'),
561 ('disk' , 'Checking for unused disks to add to LVM.'),
562 ('update4' , 'Sending hardware configuration to PLC.'),
563 ('debug' , 'Starting debug mode'),
564 ('bmexceptmount', 'BootManagerException during mount'),
565 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
566 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
567 ('exception' , 'Exception'),
568 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
569 ('protoerror' , 'XML RPC protocol error'),
570 ('nodehostname' , 'Configured node hostname does not resolve'),
571 ('implementerror', 'Implementation Error'),
572 ('readonlyfs' , '[Errno 30] Read-only file system'),
573 ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
574 ('noinstall' , 'notinstalled'),
575 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
576 ('noblockdev' , "No block devices detected."),
577 ('dnserror' , 'Name or service not known'),
578 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
579 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
580 ('hardwarerequirefail' , 'Hardware requirements not met'),
581 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
582 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
583 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
584 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
585 ('modulefail' , 'Unable to get list of system modules'),
586 ('writeerror' , 'write error: No space left on device'),
587 ('nospace' , "No space left on device"),
588 ('nonode' , 'Failed to authenticate call: No such node'),
589 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
590 ('bootcheckfail' , 'BootCheckAuthentication'),
591 ('bootupdatefail' , 'BootUpdateNode'),
# Walk a pexpect-spawned bm.log stream, translating each pattern match to a
# step id (via index_to_id) until "done"/EOF; the loop, accumulator list, and
# return are elided from this fragment.  restore() joins the result with "-".
595 def getBootManagerSequenceFromLog(self, steps, child):
599 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
600 id = index_to_id(steps,index)
603 if id == "exception":
604 print "...Found An Exception!!!"
605 elif id == "done": #index == len(steps_to_list(steps)):
# Top-level recovery routine for a node in debug state: check for an
# out-of-date BootCD, connect, scan dmesg for disk errors, reconstruct the
# BootManager step sequence from bm.log, and dispatch the mapped action
# (restart, key repair, or an operator notification).  Many control-flow
# lines (try/except, else branches, returns) are elided in this fragment.
612 def restore(sitehist, hostname, config=None, forced_action=None):
614 # NOTE: Nothing works if the bootcd is REALLY old.
615 # So, this is the first step.
617 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
618 recent_actions = sitehist.getRecentActions(hostname=hostname)
620 if fbnode['observed_category'] == "OLDBOOTCD":
621 print "\t...Notify owner to update BootImage!!!"
# Rate-limit the notice to once per 3 (presumably days -- see found_within).
623 if not found_within(recent_actions, 'newbootcd_notice', 3):
624 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
626 print "\tDisabling %s due to out-of-date BootImage" % hostname
627 api.UpdateNode(hostname, {'boot_state' : 'disable'})
629 # NOTE: nothing else is possible.
632 debugnode = DebugInterface(hostname)
633 conn = debugnode.getConnection()
634 #print "conn: %s" % conn
635 #print "trying to use conn after returning it."
636 #print conn.c.modules.sys.path
637 #print conn.c.modules.os.path.exists('/tmp/source')
# getConnection() returns False on failure; bail out here.
638 if type(conn) == type(False): return False
640 #if forced_action == "reboot":
641 # conn.restart_node('rins')
# Only act on nodes actually in debug state; "boot" counts as success.
644 boot_state = conn.get_boot_state()
645 if boot_state != "debug":
646 print "... %s in %s state: skipping..." % (hostname , boot_state)
647 return boot_state == "boot"
649 if conn.bootmanager_running():
650 print "...BootManager is currently running. Skipping host %s" %hostname
653 # Read persistent flags, tagged on one week intervals.
654 #pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
# --- Phase 1: scan dmesg for disk-error signatures -------------------------
656 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
657 dmesg = conn.get_dmesg()
658 child = fdpexpect.fdspawn(dmesg)
660 steps = debugnode.getDiskSteps()
661 sequence = debugnode.getDiskSequence(steps, child)
# `s` is presumably set(sequence) in an elided line -- TODO confirm.
664 if config and not config.quiet: print "\tSET: ", s
667 print "...Potential drive errors on %s" % hostname
# A lone floppy error (plus EOF id?) is tolerated; anything else disables
# the node and notifies the owner.
668 if len(s) == 2 and 'floppyerror' in s:
669 print "...Should investigate. Continuing with node."
671 print "...Should investigate. Skipping node."
672 # TODO: send message related to these errors.
674 if not found_within(recent_actions, 'newbootcd_notice', 3):
676 log=conn.get_dmesg().read()
677 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
678 conn.set_nodestate('disable')
# --- Phase 2: reconstruct the BootManager step sequence --------------------
682 print "...Downloading bm.log from %s" %hostname
683 log = conn.get_bootmanager_log()
684 child = fdpexpect.fdspawn(log)
# --collect mode: logs fetched, stop before taking any action.
686 if hasattr(config, 'collect') and config.collect: return True
688 if config and not config.quiet: print "...Scanning bm.log for errors"
692 steps = debugnode.getBootManagerStepPatterns()
693 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
695 s = "-".join(sequence)
696 print " FOUND SEQUENCE: ", s
698 # NOTE: We get or set the flag based on the current sequence identifier.
699 # By using the sequence identifier, we guarantee that there will be no
700 # frequent loops. I'm guessing there is a better way to track loops,
703 sequences = debugnode.getSequences()
# Unknown sequence: notify operators with the full bm.log, then retry a
# plain boot.
706 if s not in sequences:
707 print " HOST %s" % hostname
708 print " UNKNOWN SEQUENCE: %s" % s
711 args['hostname'] = hostname
713 args['bmlog'] = conn.get_bootmanager_log().read()
714 args['viart'] = False
716 sitehist.sendMessage('unknownsequence_notice', **args)
718 conn.restart_bootmanager('boot')
720 # NOTE: Do not set the pflags value for this sequence if it's unknown.
721 # This way, we can check it again after we've fixed it.
# Known sequence: dispatch on the mapped action name.
726 if sequences[s] == "restart_bootmanager_boot":
727 print "...Restarting BootManager.py on %s "%hostname
728 conn.restart_bootmanager('boot')
729 elif sequences[s] == "restart_bootmanager_rins":
730 print "...Restarting BootManager.py on %s "%hostname
731 conn.restart_bootmanager('rins')
732 elif sequences[s] == "restart_node_rins":
733 conn.restart_node('rins')
734 elif sequences[s] == "restart_node_boot":
735 conn.restart_node('boot')
736 elif sequences[s] == "repair_node_keys":
737 if conn.compare_and_repair_nodekeys():
738 # the keys either are in sync or were forced in sync.
739 # so try to reboot the node again.
740 conn.restart_bootmanager('rins')
743 # there was some failure to synchronize the keys.
744 print "...Unable to repair node keys on %s" %hostname
746 elif sequences[s] == "suspect_error_email":
748 args['hostname'] = hostname
750 args['bmlog'] = conn.get_bootmanager_log().read()
751 args['viart'] = False
753 sitehist.sendMessage('unknownsequence_notice', **args)
754 conn.restart_bootmanager('boot')
756 # TODO: differentiate this and the 'nodenetwork_email' actions.
757 elif sequences[s] == "update_node_config_email":
759 if not found_within(recent_actions, 'nodeconfig_notice', 3):
761 args['hostname'] = hostname
762 sitehist.sendMessage('nodeconfig_notice', **args)
763 conn.dump_plconf_file()
765 elif sequences[s] == "nodenetwork_email":
767 if not found_within(recent_actions, 'nodeconfig_notice', 3):
769 args['hostname'] = hostname
770 args['bmlog'] = conn.get_bootmanager_log().read()
771 sitehist.sendMessage('nodeconfig_notice', **args)
772 conn.dump_plconf_file()
774 elif sequences[s] == "update_bootcd_email":
776 if not found_within(recent_actions, 'newalphacd_notice', 3):
778 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
779 args['hostname'] = hostname
781 sitehist.sendMessage('newalphacd_notice', **args)
783 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
785 elif sequences[s] == "broken_hardware_email":
786 # MAKE An ACTION record that this host has failed hardware. May
787 # require either an exception "/minhw" or other manual intervention.
788 # Definitely need to send out some more EMAIL.
789 # TODO: email notice of broken hardware
790 if not found_within(recent_actions, 'baddisk_notice', 1):
791 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
793 args['hostname'] = hostname
794 args['log'] = conn.get_dmesg().read()
796 sitehist.sendMessage('baddisk_notice', **args)
797 conn.set_nodestate('disable')
799 elif sequences[s] == "update_hardware_email":
800 if not found_within(recent_actions, 'minimalhardware_notice', 1):
801 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
803 args['hostname'] = hostname
804 args['bmlog'] = conn.get_bootmanager_log().read()
805 sitehist.sendMessage('minimalhardware_notice', **args)
807 elif sequences[s] == "bad_dns_email":
808 if not found_within(recent_actions, 'baddns_notice', 1):
809 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# Look up the node's network record so the notice can include its config.
812 node = api.GetNodes(hostname)[0]
813 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
816 print traceback.print_exc()
817 # TODO: api error. skip email, b/c all info is not available,
818 # flag_set will not be recorded.
820 nodenet_str = network_config_to_str(net)
822 args['hostname'] = hostname
823 args['network_config'] = nodenet_str
824 args['nodenetwork_id'] = net['nodenetwork_id']
826 sitehist.sendMessage('baddns_notice', **args)
831 # MAIN -------------------------------------------------------------------
# Command-line entry: build the option parser (extended with the shared
# 'nodesets'/'defaults' option groups) and resolve the node list.  The
# surrounding if/else for nodelist-vs-single-node is elided here.
834 from monitor import parser as parsermodule
835 parser = parsermodule.getParser()
837 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
838 force=None, quiet=False)
839 parser.add_option("", "--child", dest="child", action="store_true",
840 help="This is the child mode of this process.")
841 parser.add_option("", "--force", dest="force", metavar="boot_state",
842 help="Force a boot state passed to BootManager.py.")
843 parser.add_option("", "--quiet", dest="quiet", action="store_true",
844 help="Extra quiet output messages.")
845 parser.add_option("", "--verbose", dest="verbose", action="store_true",
846 help="Extra debug output messages.")
847 parser.add_option("", "--nonet", dest="nonet", action="store_true",
848 help="Do not setup the network, use existing log files to re-run a test pass.")
849 parser.add_option("", "--collect", dest="collect", action="store_true",
850 help="No action, just collect dmesg, and bm.log")
# NOTE(review): "orginary" typo ("ordinary") in the user-visible help string
# below; left as-is here since doc-only edits must not alter runtime strings.
851 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
852 help="Do not perform the orginary setup phase.")
854 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
855 config = parsermodule.parse_args(parser)
858 nodes = config.getListFromFile(config.nodelist)
860 nodes = [ config.node ]
868 if __name__ == "__main__":