3 # Attempt to reboot a node in debug state.
16 from monitor.getsshkeys import SSHKnownHosts
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
# Module-level authenticated PLC XML-RPC handle, shared by every method below.
41 api = plc.getAuthAPI()
class ExceptionDoubleSSHError(Exception):
    """Raised when ssh login to a node fails twice in a row.

    The second attempt is made after refreshing the node's known_hosts
    entry, so a second failure means the node is genuinely unreachable.
    """
    pass
# NodeConnection constructor: stores the Rpyc connection, node hostname and
# config object.  Body lines are not visible in this excerpt -- presumably
# simple attribute assignments (self.c, self.node, self.config); TODO confirm.
48 def __init__(self, connection, node, config):
# Classify the node's current boot state by probing its filesystem over the
# Rpyc connection: /tmp/source exists => the BootManager (debug) environment,
# /vservers exists => node booted into its normal image.  The actual return
# values live on lines not shown in this excerpt -- TODO confirm.
53 def get_boot_state(self):
55 if self.c.modules.os.path.exists('/tmp/source'):
57 elif self.c.modules.os.path.exists('/vservers'):
# NOTE(review): diagnostic dump of the remote python path; presumably in a
# fallback/unknown branch -- verify against the full file.
63 print self.c.modules.sys.path
# Fragment of get_dmesg() (def line not visible here): snapshot the remote
# kernel ring buffer to a file, pull it back locally, and open it for reading.
# The return of the open handle is on a line not shown -- TODO confirm.
71 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
72 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
73 log = open("log/dmesg.%s.log" % self.node, 'r')
# Fetch the BootManager log from the node and open a local copy for reading.
76 def get_bootmanager_log(self):
77 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
78 #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
# NOTE(review): despite the .gz name the file is only cp'd, not decompressed
# (zcat line above is commented out) -- presumably /tmp/bm.log is plain text;
# confirm, otherwise downstream pexpect scanning reads gzip bytes.
79 os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
80 log = open("log/bm.%s.log" % self.node, 'r')
# Run the BootManager initialization + node-config read steps remotely and
# print the resulting bm.VARS, i.e. dump the node's plnode.txt configuration.
83 def dump_plconf_file(self):
85 self.c.modules.sys.path.append("/tmp/source/")
86 self.c.modules.os.chdir('/tmp/source')
# NOTE(review): the following lines use bare `c` where the rest of the class
# uses `self.c` -- verify against the full file whether `c` is bound on a
# hidden line or this is a latent NameError.
88 log = c.modules.BootManager.log('/tmp/new.log')
89 bm = c.modules.BootManager.BootManager(log,'boot')
91 BootManagerException = c.modules.Exceptions.BootManagerException
92 InitializeBootManager = c.modules.BootManager.InitializeBootManager
93 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
96 InitializeBootManager.Run(bm.VARS, bm.LOG)
97 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
101 print " Possibly, unable to find valid configuration file"
# Success path: print every VAR the BootManager read from the config file.
104 for key in bm.VARS.keys():
105 print key, " == ", bm.VARS[key]
107 print " Unable to read Node Configuration"
# Compare the NODE_KEY stored on the node (as read by BootManager) against
# the key PLC has on record; if they differ, push the node's key to PLC.
# Returns success/failure on lines not visible here -- TODO confirm.
110 def compare_and_repair_nodekeys(self):
112 self.c.modules.sys.path.append("/tmp/source/")
113 self.c.modules.os.chdir('/tmp/source')
# NOTE(review): bare `c` instead of `self.c`, same concern as in
# dump_plconf_file -- confirm against the full file.
115 log = c.modules.BootManager.log('/tmp/new.log')
116 bm = c.modules.BootManager.BootManager(log,'boot')
118 BootManagerException = c.modules.Exceptions.BootManagerException
119 InitializeBootManager = c.modules.BootManager.InitializeBootManager
120 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
123 plcnode = plccache.GetNodeByName(self.node)
125 InitializeBootManager.Run(bm.VARS, bm.LOG)
126 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
131 print " Possibly, unable to find valid configuration file"
134 print " NODE: %s" % bm.VARS['NODE_KEY']
135 print " PLC : %s" % plcnode['key']
137 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Keys disagree: overwrite PLC's record with the key found on the node.
140 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
141 print " Successfully updated NODE_KEY with PLC"
146 #for key in bm.VARS.keys():
147 # print key, " == ", bm.VARS[key]
149 print " Unable to retrieve NODE_KEY"
# True iff a BootManager instance is already running on the node, detected
# via its /tmp/BM_RUNNING marker file (return lines not visible here).
151 def bootmanager_running(self):
152 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Record a new boot_state for this node at PLC (default 'boot')."""
    update = {'boot_state': state}
    return api.UpdateNode(self.node, update)
# Reboot the node into `state`.  First attempt within a 24h window is a
# "gentle" kill of slice processes + scheduled shutdown; if that was already
# tried recently (gentlekill flag), fall back to a forced sysrq s-u-b reboot.
160 def restart_node(self, state='boot'):
161 api.UpdateNode(self.node, {'boot_state' : state})
# Persistent flag with a 1-day window tracks whether gentle kill was tried.
163 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
164 if not pflags.getRecentFlag('gentlekill'):
165 print " Killing all slice processes... : %s" % self.node
166 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
167 self.c.modules.os.system(cmd_slicekill)
168 cmd = """ shutdown -r +1 & """
169 print " Restarting %s : %s" % ( self.node, cmd)
170 self.c.modules.os.system(cmd)
172 pflags.setRecentFlag('gentlekill')
# Fallback path: sync ('s'), unmount ('u'), reboot ('b') via magic sysrq.
175 print " Restarting with sysrq 'sub' %s" % self.node
176 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
177 self.c.modules.os.system(cmd)
# Re-run BootManager.py on the node with the given forced state ('boot',
# 'reinstall', ...), guarded by the /tmp/BM_RUNNING marker so two instances
# never run at once.
181 def restart_bootmanager(self, forceState):
183 self.c.modules.os.chdir('/tmp/source')
184 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
185 print " BootManager is already running: try again soon..."
187 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
# Marker file is created before and removed after the run; the closing of
# this shell command string is on a line not visible here -- TODO confirm.
188 cmd = "( touch /tmp/BM_RUNNING ; " + \
189 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
190 " rm -f /tmp/BM_RUNNING " + \
192 cmd = cmd % forceState
193 self.c.modules.os.system(cmd)
# Manages one node session: an ssh tunnel to the node plus a remote Rpyc
# forking server, through which NodeConnection drives the node.
198 class PlanetLabSession:
# Class-wide counter for the local tunnel port; randomized at import time to
# reduce collisions between concurrent monitor processes.
199 globalport = 22000 + int(random.random()*1000)
# Session constructor fragment: verbose toggles debug output, nosetup skips
# the rsync/server-start phase of setup_host().  Lines storing `node` and
# invoking setup_host() are not visible in this excerpt -- TODO confirm.
201 def __init__(self, node, nosetup, verbose):
202 self.verbose = verbose
205 self.nosetup = nosetup
209 def get_connection(self, config):
210 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
213 # print i, conn.c.modules.sys.path
214 # print conn.c.modules.os.path.exists('/tmp/source')
# Prepare the node for a session: pick a fresh local port, rsync the Rpyc
# package to the node (repairing the known_hosts entry once on ssh-key
# failure), restart the remote forking_server, then open a synchronous ssh
# port-forward tunnel to it.  Raises ExceptionDoubleSSHError after two login
# failures, generic Exception when the tunnel cannot be established.
219 def setup_host(self):
220 self.port = PlanetLabSession.globalport
221 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
224 args['port'] = self.port
225 args['user'] = 'root'
226 args['hostname'] = self.node
227 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
231 print "Skipping setup"
234 # COPY Rpyc files to host
235 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
236 if self.verbose: print cmd
240 localos = moncommands.CMD()
242 ret = localos.system(cmd, timeout)
# First failure: assume a stale host key, refresh known_hosts and retry once.
245 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
246 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
247 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
248 ret = localos.system(cmd, timeout)
251 print "\tFAILED TWICE"
253 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
256 # KILL any already running servers.
# Heredoc run on the node: kill stale Rpyc servers, then start a fresh
# forking_server in the background with PYTHONPATH pointing at $HOME.
257 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
258 (ov,ev) = ssh.run_noexcept2("""<<\EOF
260 echo "kill server" >> out.log
261 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
262 echo "export" >> out.log
263 export PYTHONPATH=$HOME ;
264 echo "start server" >> out.log
265 python Rpyc/Servers/forking_server.py &> server.log &
266 echo "done" >> out.log
268 #cmd = """ssh %(user)s@%(hostname)s """ + \
269 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
271 #if self.verbose: print cmd
273 #print localos.system(cmd,timeout)
275 ## START a new rpyc server.
276 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
277 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
279 #if self.verbose: print cmd
280 #print localos.system(cmd,timeout)
281 print "setup rpyc server over ssh"
285 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
286 # and the following options seems to work well.
# LocalCommand prints READY on our side once the forward is up; 18812 is the
# Rpyc server's default listening port on the node.
287 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
288 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
289 """-o ConnectTimeout=120 """ + \
290 """-n -N -L %(port)s:localhost:18812 """ + \
291 """%(user)s@%(hostname)s"""
293 if self.verbose: print cmd
295 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
296 # TODO: the read() here may block indefinitely. Need a better
297 # approach therefore, that includes a timeout.
298 #ret = self.command.stdout.read(5)
299 ret = moncommands.read_t(self.command.stdout, 5)
303 # NOTE: There is still a slight race for machines that are slow...
# Wait roughly twice the observed connect time (t1/t2 are set on hidden
# lines -- presumably timestamps around the connect; TODO confirm).
304 self.timeout = 2*(t2-t1)
305 print "Sleeping for %s sec" % self.timeout
306 time.sleep(self.timeout)
# A returncode means the ssh child already exited, i.e. the tunnel died.
309 if self.command.returncode is not None:
310 print "Failed to establish tunnel!"
311 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
313 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Fragment of the session-teardown method (its def line is not visible in
# this excerpt): announces that the ssh tunnel on self.port is being killed.
317 if self.verbose: print "Killing SSH session %s" % self.port
# NOTE(review): unconditional duplicate of the verbose print above --
# presumably intentional logging, but worth confirming in the full file.
318 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Project one column out of a list of step tuples.

    `steps` is a sequence of tuples (id, pattern, ...); `index` selects
    which element to collect (default 1, the pexpect pattern).  Always
    returns a list, since callers concatenate the result with
    [pexpect.EOF] -- the original map()+lambda form silently breaks on
    Python 3 where map() yields an iterator.
    """
    return [step[index] for step in steps]
# Map a pexpect match index back to the step's symbolic id.  The out-of-range
# branch (index == len(steps), i.e. the appended pexpect.EOF matched) is on
# lines not visible here -- presumably returns a sentinel such as "done".
325 def index_to_id(steps,index):
326 if index < len(steps):
327 return steps[index][0]
# High-level driver for diagnosing one node in debug state: owns the
# PlanetLabSession, the known failure-sequence table, and the pexpect
# pattern sets used to classify dmesg and bm.log output.
331 class DebugInterface:
332 def __init__(self, hostname):
333 self.hostname = hostname
# Establish a session + NodeConnection to self.hostname.  On tunnel hiccups
# retries get_connection() once after a longer sleep; returns False (checked
# by restore() via `type(conn) == type(False)`) on unrecoverable failure.
336 def getConnection(self):
337 print "Creating session for %s" % self.hostname
338 # update known_hosts file (in case the node has rebooted since last run)
340 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
343 print traceback.print_exc()
# NOTE(review): two alternative session constructions are visible; the
# hard-coded (False, True) variant is presumably a debug leftover -- confirm
# which branch guards each in the full file.
348 self.session = PlanetLabSession(self.hostname, False, True)
350 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
351 except ExceptionDoubleSSHError, e:
352 msg = "ERROR setting up session for %s" % self.hostname
356 traceback.print_exc()
361 conn = self.session.get_connection(config)
363 # NOTE: sometimes the wait in setup_host() is not long enough.
364 # So, here we try to wait a little longer before giving up entirely.
366 time.sleep(self.session.timeout*5)
367 conn = self.session.get_connection(config)
369 # failed twice... no need to report this really, it's just in a
373 traceback.print_exc()
374 email_exception(self.hostname)
376 #print "trying to use conn before returning it."
377 #print conn.c.modules.sys.path
378 #print conn.c.modules.os.path.exists('/tmp/source')
381 #print "conn: %s" % conn
# Build the table mapping known BootManager failure "sequences" (dash-joined
# step ids extracted from bm.log) to the repair action restore() should take.
384 def getSequences(self):
386 # TODO: This can be replaced with a DB definition at a future time.
387 # This would make it possible for an admin to introduce new
388 # patterns without touching code.
391 # restart_bootmanager_boot
# Sequences that are fixed by simply re-running BootManager in 'boot' mode.
392 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
393 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
394 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
396 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
398 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
399 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
400 "bminit-cfg-auth-getplc-update-debug-done",
401 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
402 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
403 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
404 "bminit-cfg-auth-protoerror-exception-update-debug-done",
405 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
406 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
408 sequences.update({n : "restart_bootmanager_boot"})
410 # conn.restart_bootmanager('reinstall')
# Sequences that require re-running BootManager in 'reinstall' mode.
411 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
412 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
413 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
414 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
415 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
416 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
417 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
418 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
419 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
420 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
421 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
422 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
423 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
424 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
425 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
426 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
427 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
428 # actual solution appears to involve removing the bad files, and
429 # continually trying to boot the node.
430 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
431 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
432 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
433 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
434 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
436 sequences.update({n : "restart_bootmanager_rins"})
# Authentication failures: attempt to re-sync node keys with PLC.
439 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
440 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
442 sequences.update({n: "repair_node_keys"})
444 # conn.restart_node('reinstall')
# Sequences needing a full node reboot into reinstall.
445 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
446 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
447 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
448 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
449 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
450 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
451 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
452 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
453 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
454 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
455 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
456 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
458 sequences.update({n : "restart_node_rins"})
# Sequences needing a plain reboot into 'boot'.
461 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
462 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
463 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
464 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
465 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
466 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
467 "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
469 sequences.update({n: "restart_node_boot"})
471 # update_node_config_email
472 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
473 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
474 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
476 sequences.update({n : "update_node_config_email"})
# Hostname-resolution problems: mail the site about its network config.
478 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
479 "bminit-cfg-update-exception-nodehostname-update-debug-done",
481 sequences.update({n : "nodenetwork_email"})
483 # update_bootcd_email
484 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
485 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
486 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
487 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
488 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
490 sequences.update({n : "update_bootcd_email"})
492 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
494 sequences.update({n: "suspect_error_email"})
496 # update_hardware_email
497 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
498 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
500 # broken_hardware_email
501 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# DNS failures during boot: mail the site with its network configuration.
505 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
506 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
508 sequences.update( { n : "bad_dns_email"})
# Return the (id, regex) pairs used to spot disk-hardware trouble in dmesg
# output; sample matching kernel messages are kept below for reference.
512 def getDiskSteps(self):
514 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
515 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
516 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
518 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
520 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
521 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
523 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
524 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
526 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
527 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
529 ('floppytimeout','floppy0: floppy timeout called'),
530 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
# Reference copies of the raw kernel lines the regexes above target:
532 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
533 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
535 # floppy0: floppy timeout called
536 # end_request: I/O error, dev fd0, sector 0
538 # Buffer I/O error on device dm-2, logical block 8888896
539 # ata1: status=0x51 { DriveReady SeekComplete Error }
540 # ata1: error=0x40 { UncorrectableError }
541 # SCSI error : <0 0 0 0> return code = 0x8000002
542 # sda: Current: sense key: Medium Error
543 # Additional sense: Unrecovered read error - auto reallocate failed
545 # SCSI error : <0 2 0 0> return code = 0x40001
546 # end_request: I/O error, dev sda, sector 572489600
# Scan the dmesg pexpect child for disk-error patterns; each match index is
# mapped back to its id (EOF terminates the scan).  The surrounding loop and
# result accumulation are on lines not visible here -- TODO confirm.
550 def getDiskSequence(self, steps, child):
553 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
560 def getBootManagerStepPatterns(self):
562 ('bminit' , 'Initializing the BootManager.'),
563 ('cfg' , 'Reading node configuration file.'),
564 ('auth' , 'Authenticating node with PLC.'),
565 ('getplc' , 'Retrieving details of node from PLC.'),
566 ('update' , 'Updating node boot state at PLC.'),
567 ('hardware' , 'Checking if hardware requirements met.'),
568 ('installinit' , 'Install: Initializing.'),
569 ('installdisk' , 'Install: partitioning disks.'),
570 ('installbootfs', 'Install: bootstrapfs tarball.'),
571 ('installcfg' , 'Install: Writing configuration files.'),
572 ('installstop' , 'Install: Shutting down installer.'),
573 ('update2' , 'Updating node boot state at PLC.'),
574 ('installinit2' , 'Install: Initializing.'),
575 ('validate' , 'Validating node installation.'),
576 ('rebuildinitrd', 'Rebuilding initrd'),
577 ('netcfg' , 'Install: Writing Network Configuration files.'),
578 ('update3' , 'Updating node configuration.'),
579 ('disk' , 'Checking for unused disks to add to LVM.'),
580 ('update4' , 'Sending hardware configuration to PLC.'),
581 ('debug' , 'Starting debug mode'),
582 ('bmexceptmount', 'BootManagerException during mount'),
583 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
584 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
585 ('exception' , 'Exception'),
586 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
587 ('protoerror' , 'XML RPC protocol error'),
588 ('nodehostname' , 'Configured node hostname does not resolve'),
589 ('implementerror', 'Implementation Error'),
590 ('readonlyfs' , '[Errno 30] Read-only file system'),
591 ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
592 ('noinstall' , 'notinstalled'),
593 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
594 ('noblockdev' , "No block devices detected."),
595 ('dnserror' , 'Name or service not known'),
596 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
597 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
598 ('hardwarerequirefail' , 'Hardware requirements not met'),
599 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
600 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
601 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
602 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
603 ('modulefail' , 'Unable to get list of system modules'),
604 ('writeerror' , 'write error: No space left on device'),
605 ('nospace' , "No space left on device"),
606 ('nonode' , 'Failed to authenticate call: No such node'),
607 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
608 ('bootcheckfail' , 'BootCheckAuthentication'),
609 ('bootupdatefail' , 'BootUpdateNode'),
# Walk the bm.log pexpect child, collecting the step ids in order until the
# "done" sentinel / EOF; the loop scaffolding and the returned sequence list
# are on lines not visible in this excerpt -- TODO confirm.
613 def getBootManagerSequenceFromLog(self, steps, child):
617 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
618 id = index_to_id(steps,index)
621 if id == "exception":
622 print "...Found An Exception!!!"
623 elif id == "done": #index == len(steps_to_list(steps)):
# Top-level repair driver for one node in debug state: connect, collect
# dmesg and bm.log, classify the failure sequence, then either apply the
# mapped repair action or notify the site.  Returns True/False-ish status
# (several return lines are not visible in this excerpt).
630 def restore(sitehist, hostname, config=None, forced_action=None):
632 # NOTE: Nothing works if the bootcd is REALLY old.
633 # So, this is the first step.
635 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
636 recent_actions = sitehist.getRecentActions(hostname=hostname)
# Obsolete BootCD: notify once per 3-day window, then disable the node.
638 if fbnode['observed_category'] == "OLDBOOTCD":
639 print "\t...Notify owner to update BootImage!!!"
641 if not found_within(recent_actions, 'newbootcd_notice', 3):
642 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
644 print "\tDisabling %s due to out-of-date BootImage" % hostname
645 api.UpdateNode(hostname, {'boot_state' : 'disable'})
647 # NOTE: nothing else is possible.
650 debugnode = DebugInterface(hostname)
651 conn = debugnode.getConnection()
652 #print "conn: %s" % conn
653 #print "trying to use conn after returning it."
654 #print conn.c.modules.sys.path
655 #print conn.c.modules.os.path.exists('/tmp/source')
# getConnection() returns False on failure; bail out early.
656 if type(conn) == type(False): return False
658 #if forced_action == "reboot":
659 # conn.restart_node('reinstall')
662 boot_state = conn.get_boot_state()
663 if boot_state != "debug":
664 print "... %s in %s state: skipping..." % (hostname , boot_state)
665 return boot_state == "boot"
667 if conn.bootmanager_running():
668 print "...BootManager is currently running. Skipping host %s" %hostname
671 # Read persistent flags, tagged on one week intervals.
# Phase 1: scan dmesg for disk-hardware errors before touching anything.
673 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
674 dmesg = conn.get_dmesg()
675 child = fdpexpect.fdspawn(dmesg)
677 steps = debugnode.getDiskSteps()
678 sequence = debugnode.getDiskSequence(steps, child)
681 if config and not config.quiet: print "\tSET: ", s
684 print "...Potential drive errors on %s" % hostname
# A lone floppy error is tolerated; anything more disables the node.
685 if len(s) == 2 and 'floppyerror' in s:
686 print "...Should investigate. Continuing with node."
688 print "...Should investigate. Skipping node."
689 # TODO: send message related to these errors.
691 if not found_within(recent_actions, 'newbootcd_notice', 3):
693 log=conn.get_dmesg().read()
694 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
695 conn.set_nodestate('disable')
# Phase 2: classify the BootManager log into a known sequence.
699 print "...Downloading bm.log from %s" %hostname
700 log = conn.get_bootmanager_log()
701 child = fdpexpect.fdspawn(log)
# --collect mode: stop after gathering logs, take no action.
703 if hasattr(config, 'collect') and config.collect: return True
705 if config and not config.quiet: print "...Scanning bm.log for errors"
709 steps = debugnode.getBootManagerStepPatterns()
710 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
712 s = "-".join(sequence)
713 print " FOUND SEQUENCE: ", s
715 # NOTE: We get or set the flag based on the current sequence identifier.
716 # By using the sequence identifier, we guarantee that there will be no
717 # frequent loops. I'm guessing there is a better way to track loops,
720 sequences = debugnode.getSequences()
# Unknown sequence: mail the details and fall back to a plain boot retry.
723 if s not in sequences:
724 print " HOST %s" % hostname
725 print " UNKNOWN SEQUENCE: %s" % s
728 args['hostname'] = hostname
730 args['bmlog'] = conn.get_bootmanager_log().read()
731 args['viart'] = False
733 sitehist.sendMessage('unknownsequence_notice', **args)
735 conn.restart_bootmanager('boot')
737 # NOTE: Do not set the pflags value for this sequence if it's unknown.
738 # This way, we can check it again after we've fixed it.
# Known sequence: dispatch on the mapped repair action.
743 if sequences[s] == "restart_bootmanager_boot":
744 print "...Restarting BootManager.py on %s "%hostname
745 conn.restart_bootmanager('boot')
746 elif sequences[s] == "restart_bootmanager_rins":
747 print "...Restarting BootManager.py on %s "%hostname
748 conn.restart_bootmanager('reinstall')
749 elif sequences[s] == "restart_node_rins":
750 conn.restart_node('reinstall')
751 elif sequences[s] == "restart_node_boot":
752 conn.restart_node('boot')
753 elif sequences[s] == "repair_node_keys":
754 if conn.compare_and_repair_nodekeys():
755 # the keys either are in sync or were forced in sync.
756 # so try to reboot the node again.
757 # TODO: why was this originally 'reinstall' instead of 'boot'??
758 conn.restart_bootmanager('boot')
761 # there was some failure to synchronize the keys.
762 print "...Unable to repair node keys on %s" %hostname
764 elif sequences[s] == "suspect_error_email":
766 args['hostname'] = hostname
768 args['bmlog'] = conn.get_bootmanager_log().read()
769 args['viart'] = False
771 sitehist.sendMessage('unknownsequence_notice', **args)
772 conn.restart_bootmanager('boot')
774 # TODO: differentiate this and the 'nodenetwork_email' actions.
775 elif sequences[s] == "update_node_config_email":
777 if not found_within(recent_actions, 'nodeconfig_notice', 3):
779 args['hostname'] = hostname
780 sitehist.sendMessage('nodeconfig_notice', **args)
781 conn.dump_plconf_file()
783 elif sequences[s] == "nodenetwork_email":
785 if not found_within(recent_actions, 'nodeconfig_notice', 3):
787 args['hostname'] = hostname
788 args['bmlog'] = conn.get_bootmanager_log().read()
789 sitehist.sendMessage('nodeconfig_notice', **args)
790 conn.dump_plconf_file()
792 elif sequences[s] == "update_bootcd_email":
794 if not found_within(recent_actions, 'newalphacd_notice', 3):
796 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
797 args['hostname'] = hostname
799 sitehist.sendMessage('newalphacd_notice', **args)
801 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
803 elif sequences[s] == "broken_hardware_email":
804 # MAKE An ACTION record that this host has failed hardware. May
805 # require either an exception "/minhw" or other manual intervention.
806 # Definitely need to send out some more EMAIL.
807 # TODO: email notice of broken hardware
808 if not found_within(recent_actions, 'baddisk_notice', 1):
809 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
811 args['hostname'] = hostname
812 args['log'] = conn.get_dmesg().read()
814 sitehist.sendMessage('baddisk_notice', **args)
815 conn.set_nodestate('disable')
817 elif sequences[s] == "update_hardware_email":
818 if not found_within(recent_actions, 'minimalhardware_notice', 1):
819 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
821 args['hostname'] = hostname
822 args['bmlog'] = conn.get_bootmanager_log().read()
823 sitehist.sendMessage('minimalhardware_notice', **args)
825 elif sequences[s] == "bad_dns_email":
826 if not found_within(recent_actions, 'baddns_notice', 1):
827 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
830 node = plccache.GetNodeByName(hostname)
831 net = api.GetInterfaces(node['interface_ids'])[0]
834 print traceback.print_exc()
835 # TODO: api error. skip email, b/c all info is not available,
836 # flag_set will not be recorded.
838 nodenet_str = network_config_to_str(net)
840 args['hostname'] = hostname
841 args['network_config'] = nodenet_str
842 args['interface_id'] = net['interface_id']
844 sitehist.sendMessage('baddns_notice', **args)
849 # MAIN -------------------------------------------------------------------
# Command-line entry: build the option parser, resolve the target node list,
# and run restore() per node.  The excerpt is truncated after the
# __main__ guard, so the loop/body structure is partly inferred.
852 from monitor import parser as parsermodule
853 parser = parsermodule.getParser()
855 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
856 force=None, quiet=False)
857 parser.add_option("", "--child", dest="child", action="store_true",
858 help="This is the child mode of this process.")
859 parser.add_option("", "--force", dest="force", metavar="boot_state",
860 help="Force a boot state passed to BootManager.py.")
861 parser.add_option("", "--quiet", dest="quiet", action="store_true",
862 help="Extra quiet output messages.")
863 parser.add_option("", "--verbose", dest="verbose", action="store_true",
864 help="Extra debug output messages.")
865 parser.add_option("", "--nonet", dest="nonet", action="store_true",
866 help="Do not setup the network, use existing log files to re-run a test pass.")
867 parser.add_option("", "--collect", dest="collect", action="store_true",
868 help="No action, just collect dmesg, and bm.log")
869 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
870 help="Do not perform the orginary setup phase.")
872 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
873 config = parsermodule.parse_args(parser)
876 nodes = config.getListFromFile(config.nodelist)
878 nodes = [ config.node ]
# Per-node: look up the site (loginbase) and run restore() against it.
885 lb = plccache.plcdb_hn2lb[node]
886 sitehist = SiteInterface.get_or_make(loginbase=lb)
887 #reboot(node, config)
# NOTE(review): config=None is passed despite the parsed `config` above --
# presumably deliberate (quiet one-off run), but worth confirming.
888 restore(sitehist, node, config=None, forced_action=None)
890 if __name__ == "__main__":