3 # Attempt to reboot a node in debug state.
16 from monitor.getsshkeys import SSHKnownHosts
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
api = plc.getAuthAPI()  # module-wide authenticated PLCAPI handle, used by all helpers below
class ExceptionDoubleSSHError(Exception):
    """Raised when SSH login to a node fails twice in a row, even after
    refreshing the node's entry in known_hosts."""
    pass
    def __init__(self, connection, node, config):
        # Wrap a live Rpyc connection to a node that is in debug state.
        # connection: Rpyc SocketConnection; node: hostname; config: runtime options.
        # NOTE(review): attribute assignments (self.c, self.node, ...) are on
        # lines not visible in this view — confirm against the full source.
    def get_boot_state(self):
        # Classify what the node is actually running by probing its filesystem
        # over Rpyc: /tmp/source implies the BootManager/debug environment,
        # /vservers implies a production boot.
        if self.c.modules.os.path.exists('/tmp/source'):
        elif self.c.modules.os.path.exists('/vservers'):
        # NOTE(review): the returned state strings and remaining branches are
        # on lines not visible in this view — confirm in the full source.
        print self.c.modules.sys.path
        # (fragment — the enclosing `def get_dmesg(self):` line is not visible
        # in this view; presumably this method returns the open local file.)
        # Dump the node's kernel ring buffer to a file on the node, copy it
        # back locally, and open the local copy for reading.
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
        log = open("log/dmesg.%s.log" % self.node, 'r')
    def get_bootmanager_log(self):
        # Fetch the BootManager log from the node and open a local copy.
        download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
        #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
        # NOTE(review): the .gz file is copied verbatim to a .log name — no
        # decompression (zcat above is commented out). Presumably /tmp/bm.log
        # is not actually gzipped despite the extension; confirm before relying
        # on the .log copy being plain text.
        os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
        log = open("log/bm.%s.log" % self.node, 'r')
    def dump_plconf_file(self):
        # Re-run BootManager's initialization and node-configuration-reading
        # steps remotely (via Rpyc) and print the resulting VARS, so an admin
        # can inspect the node's effective configuration file.
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')
        # NOTE(review): bare `c` below (vs. self.c) suggests a `c = self.c`
        # alias on a line not visible in this view — confirm in full source.
        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')
        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        # NOTE(review): the except-handler lines are not visible in this view.
        print " Possibly, unable to find valid configuration file"
        for key in bm.VARS.keys():
            print key, " == ", bm.VARS[key]
        print " Unable to read Node Configuration"
    def compare_and_repair_nodekeys(self):
        # Read NODE_KEY from the node's configuration (via BootManager) and
        # compare it with the key recorded at PLC; if they differ, push the
        # node's key up to PLC so the node can authenticate again.
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')
        # NOTE(review): bare `c` (vs. self.c) — presumably aliased on a line
        # not visible here; confirm in the full source.
        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')
        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        plcnode = plccache.GetNodeByName(self.node)
        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        # NOTE(review): except-handler lines not visible in this view.
        print " Possibly, unable to find valid configuration file"
        print " NODE: %s" % bm.VARS['NODE_KEY']
        print " PLC : %s" % plcnode['key']
        if bm.VARS['NODE_KEY'] == plcnode['key']:
        # NOTE(review): the matched-keys branch body and the else that leads to
        # the UpdateNode call below are not visible in this view.
        if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
            print " Successfully updated NODE_KEY with PLC"
        #for key in bm.VARS.keys():
        #    print key, " == ", bm.VARS[key]
        print " Unable to retrieve NODE_KEY"
    def bootmanager_running(self):
        # Detect an in-progress BootManager run via its /tmp/BM_RUNNING marker
        # file on the node (created/removed by restart_bootmanager below).
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
        # NOTE(review): the return statements are not visible in this view —
        # presumably True/False; confirm in the full source.
157 def set_nodestate(self, state='boot'):
158 return api.UpdateNode(self.node, {'boot_state' : state})
    def restart_node(self, state='boot'):
        # Set the node's next boot_state at PLC, then reboot it: first try a
        # "gentle" kill of slice processes plus `shutdown -r`, and on a repeat
        # attempt (within the persist-flag window) force a sysrq s/u/b reboot.
        api.UpdateNode(self.node, {'boot_state' : state})
        # One-day flag window prevents endlessly re-trying the gentle path.
        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print " Killing all slice processes... : %s" % self.node
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print " Restarting %s : %s" % ( self.node, cmd)
            self.c.modules.os.system(cmd)
            pflags.setRecentFlag('gentlekill')
            # NOTE(review): lines between the gentle path and the sysrq path
            # are missing from this view — the sysrq block below is presumably
            # inside an `else:` branch; confirm in the full source.
            print " Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)
    def restart_bootmanager(self, forceState):
        # Launch BootManager.py on the node with the given forced boot state
        # (e.g. 'boot' or 'reinstall'), guarded by the /tmp/BM_RUNNING marker
        # so two runs never overlap.
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print " BootManager is already running: try again soon..."
        # NOTE(review): the else branch line is missing from this view — the
        # launch below presumably only happens when BM is NOT already running.
        print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
        cmd = "( touch /tmp/BM_RUNNING ; " + \
              " python ./BootManager.py %s &> server.log < /dev/null ; " + \
              " rm -f /tmp/BM_RUNNING " + \
        # NOTE(review): the closing fragment of the command string is missing
        # from this view (presumably ") &" plus backgrounding).
        cmd = cmd % forceState
        self.c.modules.os.system(cmd)
class PlanetLabSession:
    # Manages an SSH tunnel + remote Rpyc forking server on a node, through
    # which NodeConnection operates. Each session claims a distinct local
    # port derived from the randomized class-level counter below.
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        self.verbose = verbose
        # NOTE(review): several __init__ lines (self.node, setup call, ...)
        # are missing from this view.
        self.nosetup = nosetup

    def get_connection(self, config):
        # Wrap the locally-forwarded Rpyc port in a NodeConnection.
        conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        #		print i, conn.c.modules.sys.path
        #		print conn.c.modules.os.path.exists('/tmp/source')
        # NOTE(review): the return statement is not visible in this view.

    def setup_host(self):
        # Copy the Rpyc library to the node, (re)start the remote forking
        # server, and open a local SSH port-forward to it.
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1
        # NOTE(review): `args = {}`, timeout and ssh_port setup lines are
        # missing from this view.
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        print "Skipping setup"
        # NOTE(review): the conditional/return around "Skipping setup" (driven
        # by self.nosetup) is not visible in this view.

        # COPY Rpyc files to host
        cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        if self.verbose: print cmd
        localos = moncommands.CMD()
        ret = localos.system(cmd, timeout)
        # On an unknown-host-key failure, refresh known_hosts once and retry;
        # a second failure is fatal for this session.
        print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
        #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
        ret = localos.system(cmd, timeout)
        print "\tFAILED TWICE"
        raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        # KILL any already running servers.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
echo "kill server" >> out.log
ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
echo "export" >> out.log
export PYTHONPATH=$HOME ;
echo "start server" >> out.log
python Rpyc/Servers/forking_server.py &> server.log &
echo "done" >> out.log
        #cmd = """ssh %(user)s@%(hostname)s """ + \
        #	 """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
        #if self.verbose: print cmd
        #print localos.system(cmd,timeout)

        ## START a new rpyc server.
        #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
        #	 """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
        #if self.verbose: print cmd
        #print localos.system(cmd,timeout)
        print "setup rpyc server over ssh"

        # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
        # and the following options seems to work well.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        if self.verbose: print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely. Need a better
        # approach therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)
        # NOTE: There is still a slight race for machines that are slow...
        self.timeout = 2*(t2-t1)
        print "Sleeping for %s sec" % self.timeout
        time.sleep(self.timeout)
        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        # NOTE(review): the teardown method's def line is missing from this
        # view; the two prints below belong to it.
        if self.verbose: print "Killing SSH session %s" % self.port
        print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Project column *index* out of a list of (id, pattern) step tuples.

    The default (index=1) yields the pattern strings, in order, suitable
    for passing to pexpect's expect().
    """
    return [step[index] for step in steps]
def index_to_id(steps,index):
    # Translate a pexpect match index back to the step identifier
    # (element 0 of the (id, pattern) tuple at that position).
    if index < len(steps):
        return steps[index][0]
    # NOTE(review): the out-of-range branch (index == len(steps), i.e. the
    # appended pexpect.EOF matched) is not visible in this view — presumably
    # it returns a sentinel such as "done"; confirm in the full source.
class DebugInterface:
    # High-level driver for diagnosing a node stuck in debug state: opens a
    # PlanetLabSession, classifies dmesg / bm.log output against known step
    # patterns, and maps observed step sequences to repair actions.
    def __init__(self, hostname):
        self.hostname = hostname

    def getConnection(self):
        # Establish (or re-establish) a session + NodeConnection to the node.
        print "Creating session for %s" % self.hostname
        # update known_hosts file (in case the node has rebooted since last run)
        k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
        # NOTE(review): surrounding try/except lines are missing from this
        # view; indentation below is approximate.
        print traceback.print_exc()
        self.session = PlanetLabSession(self.hostname, False, True)
        self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
        except ExceptionDoubleSSHError, e:
            msg = "ERROR setting up session for %s" % self.hostname
        traceback.print_exc()
        conn = self.session.get_connection(config)
        # NOTE: sometimes the wait in setup_host() is not long enough.
        # So, here we try to wait a little longer before giving up entirely.
        time.sleep(self.session.timeout*5)
        conn = self.session.get_connection(config)
        # failed twice... no need to report this really, it's just in a
        traceback.print_exc()
        email_exception(self.hostname)
        #print "trying to use conn before returning it."
        #print conn.c.modules.sys.path
        #print conn.c.modules.os.path.exists('/tmp/source')
        #print "conn: %s" % conn

    def getSequences(self):
        # Map known BootManager step-sequence signatures to repair actions.
        # TODO: This can be replaced with a DB definition at a future time.
        # This would make it possible for an admin to introduce new
        # patterns without touching code.
        # NOTE(review): `sequences = {}` initialization and several list
        # opener/closer lines are missing from this view.

        # restart_bootmanager_boot
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-debug-done",
                "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
                "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
                "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
                "bminit-cfg-auth-protoerror-exception-update-debug-done",
                "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
                "bminit-cfg-auth-getplc-implementerror-update-debug-done",
            sequences.update({n : "restart_bootmanager_boot"})

        # conn.restart_bootmanager('reinstall')
        for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
                # actual solution appears to involve removing the bad files, and
                # continually trying to boot the node.
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
            sequences.update({n : "restart_bootmanager_rins"})

        # repair_node_keys
        for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
                "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
            sequences.update({n: "repair_node_keys"})

        # conn.restart_node('reinstall')
        for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
            sequences.update({n : "restart_node_rins"})

        # restart_node_boot
        for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
                "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
                "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
                "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
                "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
            sequences.update({n: "restart_node_boot"})

        # update_node_config_email
        for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
                "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
                "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
                "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
            sequences.update({n : "update_node_config_email"})

        for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
                "bminit-cfg-update-exception-nodehostname-update-debug-done",
            sequences.update({n : "nodenetwork_email"})

        # update_bootcd_email
        for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
                "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
            sequences.update({n : "update_bootcd_email"})

        for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
            sequences.update({n: "suspect_error_email"})

        # update_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
        sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})

        # broken_hardware_email
        sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})

        # bad_dns_email (NOTE(review): the `for n in [` opener line is missing
        # from this view)
                "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
                "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
            sequences.update( { n : "bad_dns_email"})

    def getDiskSteps(self):
        # Regex patterns for disk/hardware errors expected in dmesg output.
        # NOTE(review): the `steps = [` opener and `return steps` lines are
        # missing from this view.
            ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
            ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
            ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
            ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
            ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
            ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
            ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
            ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
            ('floppytimeout','floppy0: floppy timeout called'),
            ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),

            # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
            # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263

            # floppy0: floppy timeout called
            # end_request: I/O error, dev fd0, sector 0

            # Buffer I/O error on device dm-2, logical block 8888896
            # ata1: status=0x51 { DriveReady SeekComplete Error }
            # ata1: error=0x40 { UncorrectableError }
            # SCSI error : <0 0 0 0> return code = 0x8000002
            # sda: Current: sense key: Medium Error
            # Additional sense: Unrecovered read error - auto reallocate failed

            # SCSI error : <0 2 0 0> return code = 0x40001
            # end_request: I/O error, dev sda, sector 572489600

    def getDiskSequence(self, steps, child):
        # Scan the spawned dmesg stream for each disk-error pattern until EOF,
        # collecting the matched step ids.
        # NOTE(review): the accumulator setup and loop/return lines are
        # missing from this view.
        id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))

    def getBootManagerStepPatterns(self):
        # (step-id, regex) pairs marking each recognizable BootManager log
        # milestone; their observed order forms the "sequence" signature.
        # NOTE(review): the `steps = [` opener and `return steps` lines are
        # missing from this view.
            ('bminit' , 'Initializing the BootManager.'),
            ('cfg' , 'Reading node configuration file.'),
            ('auth' , 'Authenticating node with PLC.'),
            ('getplc' , 'Retrieving details of node from PLC.'),
            ('update' , 'Updating node boot state at PLC.'),
            ('hardware' , 'Checking if hardware requirements met.'),
            ('installinit' , 'Install: Initializing.'),
            ('installdisk' , 'Install: partitioning disks.'),
            ('installbootfs', 'Install: bootstrapfs tarball.'),
            ('installcfg' , 'Install: Writing configuration files.'),
            ('installstop' , 'Install: Shutting down installer.'),
            ('update2' , 'Updating node boot state at PLC.'),
            ('installinit2' , 'Install: Initializing.'),
            ('validate' , 'Validating node installation.'),
            ('rebuildinitrd', 'Rebuilding initrd'),
            ('netcfg' , 'Install: Writing Network Configuration files.'),
            ('update3' , 'Updating node configuration.'),
            ('disk' , 'Checking for unused disks to add to LVM.'),
            ('update4' , 'Sending hardware configuration to PLC.'),
            ('debug' , 'Starting debug mode'),
            ('bmexceptmount', 'BootManagerException during mount'),
            ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
            ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
            ('exception' , 'Exception'),
            ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
            ('protoerror' , 'XML RPC protocol error'),
            ('nodehostname' , 'Configured node hostname does not resolve'),
            ('implementerror', 'Implementation Error'),
            ('readonlyfs' , '[Errno 30] Read-only file system'),
            ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
            ('noinstall' , 'notinstalled'),
            ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
            ('noblockdev' , "No block devices detected."),
            ('dnserror' , 'Name or service not known'),
            ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
            ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
            ('hardwarerequirefail' , 'Hardware requirements not met'),
            ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
            ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
            ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
            ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
            ('modulefail' , 'Unable to get list of system modules'),
            ('writeerror' , 'write error: No space left on device'),
            ('nospace' , "No space left on device"),
            ('nonode' , 'Failed to authenticate call: No such node'),
            ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
            ('bootcheckfail' , 'BootCheckAuthentication'),
            ('bootupdatefail' , 'BootUpdateNode'),

    def getBootManagerSequenceFromLog(self, steps, child):
        # Walk the spawned bm.log stream, collecting step ids in the order
        # their patterns match, until EOF / the 'done' sentinel.
        # NOTE(review): the accumulator and loop lines are missing from this
        # view; indentation below is approximate.
        index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
        id = index_to_id(steps,index)
        if id == "exception":
            print "...Found An Exception!!!"
        elif id == "done": #index == len(steps_to_list(steps)):
def restore(sitehist, hostname, config=None, forced_action=None):
    # Diagnose a node stuck in debug state and take the mapped repair action:
    # check for an out-of-date BootCD, scan dmesg for disk errors, classify
    # the BootManager log's step sequence, then restart/reinstall/notify as
    # dictated by DebugInterface.getSequences(). Returns False on failure
    # paths; other returns are on lines not fully visible in this view.

    # NOTE: Nothing works if the bootcd is REALLY old.
    # So, this is the first step.
    fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
    recent_actions = sitehist.getRecentActions(hostname=hostname)

    if fbnode['observed_category'] == "OLDBOOTCD":
        print "\t...Notify owner to update BootImage!!!"
        if not found_within(recent_actions, 'newbootcd_notice', 3):
            sitehist.sendMessage('newbootcd_notice', hostname=hostname)
            print "\tDisabling %s due to out-of-date BootImage" % hostname
            api.UpdateNode(hostname, {'boot_state' : 'disabled'})
        # NOTE: nothing else is possible.

    debugnode = DebugInterface(hostname)
    conn = debugnode.getConnection()
    #print "conn: %s" % conn
    #print "trying to use conn after returning it."
    #print conn.c.modules.sys.path
    #print conn.c.modules.os.path.exists('/tmp/source')
    if type(conn) == type(False): return False

    #if forced_action == "reboot":
    #	conn.restart_node('reinstall')

    boot_state = conn.get_boot_state()
    if boot_state != "debug":
        print "... %s in %s state: skipping..." % (hostname , boot_state)
        return boot_state == "boot"

    if conn.bootmanager_running():
        print "...BootManager is currently running.  Skipping host %s" %hostname

    # Read persistent flags, tagged on one week intervals.

    if config and not config.quiet: print "...downloading dmesg from %s" %hostname
    dmesg = conn.get_dmesg()
    child = fdpexpect.fdspawn(dmesg)

    steps = debugnode.getDiskSteps()
    sequence = debugnode.getDiskSequence(steps, child)
    # NOTE(review): the construction of set `s` from `sequence` is on lines
    # missing from this view.
    if config and not config.quiet: print "\tSET: ", s

    print "...Potential drive errors on %s" % hostname
    if len(s) == 2 and 'floppyerror' in s:
        print "...Should investigate.  Continuing with node."
    print "...Should investigate.  Skipping node."
    # TODO: send message related to these errors.

    if not found_within(recent_actions, 'newbootcd_notice', 3):
        log=conn.get_dmesg().read()
        sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
        conn.set_nodestate('disabled')

    print "...Downloading bm.log from %s" %hostname
    log = conn.get_bootmanager_log()
    child = fdpexpect.fdspawn(log)

    if hasattr(config, 'collect') and config.collect: return True

    if config and not config.quiet: print "...Scanning bm.log for errors"

    steps = debugnode.getBootManagerStepPatterns()
    sequence = debugnode.getBootManagerSequenceFromLog(steps, child)

    s = "-".join(sequence)
    print "   FOUND SEQUENCE: ", s

    # NOTE: We get or set the flag based on the current sequence identifier.
    #  By using the sequence identifier, we guarantee that there will be no
    #  frequent loops.  I'm guessing there is a better way to track loops,

    sequences = debugnode.getSequences()

    if s not in sequences:
        print "   HOST %s" % hostname
        print "   UNKNOWN SEQUENCE: %s" % s
        # NOTE(review): `args = {}` initialization is on a missing line.
        args['hostname'] = hostname
        args['bmlog'] = conn.get_bootmanager_log().read()
        args['viart'] = False
        sitehist.sendMessage('unknownsequence_notice', **args)
        conn.restart_bootmanager('boot')

        # NOTE: Do not set the pflags value for this sequence if it's unknown.
        # This way, we can check it again after we've fixed it.

    # Dispatch on the action name that the observed sequence maps to.
    if sequences[s] == "restart_bootmanager_boot":
        print "...Restarting BootManager.py on %s "%hostname
        conn.restart_bootmanager('boot')
    elif sequences[s] == "restart_bootmanager_rins":
        print "...Restarting BootManager.py on %s "%hostname
        conn.restart_bootmanager('reinstall')
    elif sequences[s] == "restart_node_rins":
        conn.restart_node('reinstall')
    elif sequences[s] == "restart_node_boot":
        conn.restart_node('boot')
    elif sequences[s] == "repair_node_keys":
        if conn.compare_and_repair_nodekeys():
            # the keys either are in sync or were forced in sync.
            # so try to reboot the node again.
            # TODO: why was this originally 'reinstall' instead of 'boot'??
            conn.restart_bootmanager('boot')
        # NOTE(review): the else line is missing from this view.
            # there was some failure to synchronize the keys.
            print "...Unable to repair node keys on %s" %hostname

    elif sequences[s] == "suspect_error_email":
        args['hostname'] = hostname
        args['bmlog'] = conn.get_bootmanager_log().read()
        args['viart'] = False
        sitehist.sendMessage('unknownsequence_notice', **args)
        conn.restart_bootmanager('boot')

    # TODO: differentiate this and the 'nodenetwork_email' actions.
    elif sequences[s] == "update_node_config_email":
        if not found_within(recent_actions, 'nodeconfig_notice', 3):
            args['hostname'] = hostname
            sitehist.sendMessage('nodeconfig_notice', **args)
            conn.dump_plconf_file()

    elif sequences[s] == "nodenetwork_email":
        if not found_within(recent_actions, 'nodeconfig_notice', 3):
            args['hostname'] = hostname
            args['bmlog'] = conn.get_bootmanager_log().read()
            sitehist.sendMessage('nodeconfig_notice', **args)
            conn.dump_plconf_file()

    elif sequences[s] == "update_bootcd_email":
        if not found_within(recent_actions, 'newalphacd_notice', 3):
            args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
            args['hostname'] = hostname
            sitehist.sendMessage('newalphacd_notice', **args)
            print "\tDisabling %s due to out-of-date BOOTCD" % hostname

    elif sequences[s] == "broken_hardware_email":
        # MAKE An ACTION record that this host has failed hardware.  May
        # require either an exception "/minhw" or other manual intervention.
        # Definitely need to send out some more EMAIL.
        # TODO: email notice of broken hardware
        if not found_within(recent_actions, 'baddisk_notice', 1):
            print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
            args['hostname'] = hostname
            args['log'] = conn.get_dmesg().read()
            sitehist.sendMessage('baddisk_notice', **args)
            conn.set_nodestate('disabled')

    elif sequences[s] == "update_hardware_email":
        if not found_within(recent_actions, 'minimalhardware_notice', 1):
            print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
            args['hostname'] = hostname
            args['bmlog'] = conn.get_bootmanager_log().read()
            sitehist.sendMessage('minimalhardware_notice', **args)

    elif sequences[s] == "bad_dns_email":
        if not found_within(recent_actions, 'baddns_notice', 1):
            print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
            # NOTE(review): enclosing try line is missing from this view.
            node = plccache.GetNodeByName(hostname)
            net = api.GetInterfaces(node['interface_ids'])[0]
            print traceback.print_exc()
            # TODO: api error. skip email, b/c all info is not available,
            # flag_set will not be recorded.
            nodenet_str = network_config_to_str(net)
            args['hostname'] = hostname
            args['network_config'] = nodenet_str
            args['interface_id'] = net['interface_id']
            sitehist.sendMessage('baddns_notice', **args)
# MAIN -------------------------------------------------------------------
# NOTE(review): this section is a fragment — the enclosing function (likely
# main()) and several control-flow lines (if/else on config.nodelist, the
# for-loop over nodes, the __main__ body) are not visible in this view, so
# the indentation below is approximate.
from monitor import parser as parsermodule
parser = parsermodule.getParser()

# Command-line options controlling the diagnose/repair pass.
parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
                    force=None, quiet=False)
parser.add_option("", "--child", dest="child", action="store_true",
                    help="This is the child mode of this process.")
parser.add_option("", "--force", dest="force", metavar="boot_state",
                    help="Force a boot state passed to BootManager.py.")
parser.add_option("", "--quiet", dest="quiet", action="store_true",
                    help="Extra quiet output messages.")
parser.add_option("", "--verbose", dest="verbose", action="store_true",
                    help="Extra debug output messages.")
parser.add_option("", "--nonet", dest="nonet", action="store_true",
                    help="Do not setup the network, use existing log files to re-run a test pass.")
parser.add_option("", "--collect", dest="collect", action="store_true",
                    help="No action, just collect dmesg, and bm.log")
parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                    help="Do not perform the orginary setup phase.")

parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
config = parsermodule.parse_args(parser)

# Node list comes either from a file or a single --node argument.
nodes = config.getListFromFile(config.nodelist)
nodes = [ config.node ]

# Per-node: resolve its site and run the restore pass.
lb = plccache.plcdb_hn2lb[node]
sitehist = SiteInterface.get_or_make(loginbase=lb)
#reboot(node, config)
restore(sitehist, node, config=None, forced_action=None)

if __name__ == "__main__":
# NOTE(review): the __main__ body is not visible in this view.