3 # Attempt to reboot a node in debug state.
16 from getsshkeys import SSHKnownHosts
18 from Rpyc import SocketConnection, Async
19 from Rpyc.Utils import *
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.wrapper import plc
28 from monitor.wrapper import plccache
29 from monitor.wrapper.emailTxt import mailtxt
31 from pcucontrol.util import command as moncommands
32 from pcucontrol.util.command import Sopen
33 from pcucontrol.transports.ssh import pxssh as pxssh
34 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
35 from pcucontrol.transports.ssh import pexpect as pexpect
37 from nodeconfig import network_config_to_str
40 api = plc.getAuthAPI()
# NodeConnection.__init__ — records the Rpyc connection, the target node
# hostname, and the config object on the instance (body elided in this excerpt).
45 def __init__(self, connection, node, config):
# Classify the node's current state by probing its filesystem over the remote
# Rpyc connection: /tmp/source present implies the BootManager/debug
# environment; /vservers present implies a normally-booted node.
# (The per-branch return statements fall in lines elided from this excerpt.)
50 def get_boot_state(self):
52 if self.c.modules.os.path.exists('/tmp/source'):
54 elif self.c.modules.os.path.exists('/vservers'):
# Diagnostic fallback: dump the remote interpreter's sys.path (Python 2 print).
60 print self.c.modules.sys.path
# Interior of a dmesg-collection helper (its 'def' line is elided here;
# presumably get_dmesg — TODO confirm): snapshot the remote kernel ring buffer
# to a file, pull it down with the Rpyc 'download' helper, then open the local
# copy (presumably returned by the enclosing method).
67 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
68 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
69 log = open("log/dmesg.%s.log" % self.node, 'r')
# Fetch the node's BootManager log and open a local, uncompressed copy.
72 def get_bootmanager_log(self):
# NOTE(review): the remote /tmp/bm.log is saved locally with a .gz suffix and
# piped through zcat — assumes the remote file is gzip-compressed; confirm.
73 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
74 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
# Open the decompressed local copy (presumably returned by an elided line).
75 log = open("log/bm.%s.log" % self.node, 'r')
# Run the BootManager configuration-reading steps *on the node* (via Rpyc
# remote modules) and print every resulting bm.VARS entry, so an operator can
# inspect what the node's plnode.txt/planet.cnf resolved to.
78 def dump_plconf_file(self):
80 self.c.modules.sys.path.append("/tmp/source/")
81 self.c.modules.os.chdir('/tmp/source')
# NOTE(review): these lines use bare 'c' rather than 'self.c' — either an
# alias is assigned in an elided line, or this is a latent NameError; confirm.
83 log = c.modules.BootManager.log('/tmp/new.log')
84 bm = c.modules.BootManager.BootManager(log,'boot')
86 BootManagerException = c.modules.Exceptions.BootManagerException
87 InitializeBootManager = c.modules.BootManager.InitializeBootManager
88 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
91 InitializeBootManager.Run(bm.VARS, bm.LOG)
# Failure here usually means no usable node config file was found.
92 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
96 print " Possibly, unable to find valid configuration file"
# Success path: dump every variable BootManager derived from the config.
99 for key in bm.VARS.keys():
100 print key, " == ", bm.VARS[key]
102 print " Unable to read Node Configuration"
# Compare the NODE_KEY stored on the node (read via BootManager on the node
# itself) against the key PLC has on record; on mismatch, push the node's key
# up to PLC with api.UpdateNode. Used to recover from authentication failures.
105 def compare_and_repair_nodekeys(self):
107 self.c.modules.sys.path.append("/tmp/source/")
108 self.c.modules.os.chdir('/tmp/source')
# NOTE(review): bare 'c' instead of 'self.c' — same concern as in
# dump_plconf_file; confirm an alias exists in the elided lines.
110 log = c.modules.BootManager.log('/tmp/new.log')
111 bm = c.modules.BootManager.BootManager(log,'boot')
113 BootManagerException = c.modules.Exceptions.BootManagerException
114 InitializeBootManager = c.modules.BootManager.InitializeBootManager
115 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's cached view of this node, for the authoritative 'key' field.
118 plcnode = plccache.GetNodeByName(self.node)
120 InitializeBootManager.Run(bm.VARS, bm.LOG)
121 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
126 print " Possibly, unable to find valid configuration file"
129 print " NODE: %s" % bm.VARS['NODE_KEY']
130 print " PLC : %s" % plcnode['key']
132 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Keys disagree: overwrite PLC's record with the key found on the node.
135 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
136 print " Successfully updated NODE_KEY with PLC"
141 #for key in bm.VARS.keys():
142 # print key, " == ", bm.VARS[key]
144 print " Unable to retrieve NODE_KEY"
# Report whether a BootManager instance is already active on the node, by
# testing for the /tmp/BM_RUNNING sentinel file that restart_bootmanager
# creates (the return statements are in lines elided from this excerpt).
146 def bootmanager_running(self):
147 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Record a new boot_state for this node at PLC.

    state: the PLC boot state to set (default 'boot').
    Returns whatever api.UpdateNode returns (truthy on success).
    """
    update = {'boot_state': state}
    return api.UpdateNode(self.node, update)
# Reboot the node, escalating over a 24-hour window tracked by PersistFlags:
# the first attempt is "gentle" (kill all slice processes, then a scheduled
# 'shutdown -r +1'); if a gentle kill was already tried recently, fall back to
# a hard reboot via the kernel sysrq-trigger (sync, unmount, boot).
155 def restart_node(self, state='boot'):
# Make sure the node comes back in the requested boot state.
156 api.UpdateNode(self.node, {'boot_state' : state})
158 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
159 if not pflags.getRecentFlag('gentlekill'):
160 print " Killing all slice processes... : %s" % self.node
# vkill -9 every vserver context found under /proc/virtual.
161 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
162 self.c.modules.os.system(cmd_slicekill)
163 cmd = """ shutdown -r +1 & """
164 print " Restarting %s : %s" % ( self.node, cmd)
165 self.c.modules.os.system(cmd)
# Remember that the gentle path was used, so the next attempt escalates.
167 pflags.setRecentFlag('gentlekill')
# Hard path: 's'ync, 'u'nmount, re'b'oot via /proc/sysrq-trigger.
170 print " Restarting with sysrq 'sub' %s" % self.node
171 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
172 self.c.modules.os.system(cmd)
# Re-run BootManager.py on the node with the given forced state (e.g. 'boot'
# or 'rins'), guarded by the /tmp/BM_RUNNING sentinel so two instances never
# run at once.
176 def restart_bootmanager(self, forceState):
178 self.c.modules.os.chdir('/tmp/source')
179 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
180 print " BootManager is already running: try again soon..."
182 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
# Shell snippet: create the sentinel, run BootManager with the forced state,
# then remove the sentinel. (The closing piece of the string is elided here;
# the '%s' below is filled in by the 'cmd % forceState' on the next line.)
183 cmd = "( touch /tmp/BM_RUNNING ; " + \
184 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
185 " rm -f /tmp/BM_RUNNING " + \
187 cmd = cmd % forceState
188 self.c.modules.os.system(cmd)
# PlanetLabSession — sets up a local TCP tunnel to an Rpyc forking server on
# the remote node: rsync the Rpyc package over, (re)start the server via ssh,
# then open an 'ssh -L' port-forward from a locally-chosen port to the remote
# server's fixed port 18812. get_connection() wraps the tunnel endpoint in a
# NodeConnection. Note: this excerpt elides many interior lines.
193 class PlanetLabSession:
# Base local forward port; randomized per process, incremented per session.
194 globalport = 22000 + int(random.random()*1000)
196 def __init__(self, node, nosetup, verbose):
197 self.verbose = verbose
200 self.nosetup = nosetup
# Build a NodeConnection over the locally-forwarded port.
204 def get_connection(self, config):
205 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
208 # print i, conn.c.modules.sys.path
209 # print conn.c.modules.os.path.exists('/tmp/source')
# Perform the remote setup: copy Rpyc, restart the server, open the tunnel.
214 def setup_host(self):
# Claim a unique local port for this session's forward.
215 self.port = PlanetLabSession.globalport
216 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
219 args['port'] = self.port
220 args['user'] = 'root'
221 args['hostname'] = self.node
222 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
# With --nosetup, skip the copy/server phases (tunnel-only).
226 print "Skipping setup"
229 # COPY Rpyc files to host
230 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
231 if self.verbose: print cmd
235 localos = moncommands.CMD()
237 ret = localos.system(cmd, timeout)
# On an ssh host-key failure, refresh known_hosts for this node and retry once.
240 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
241 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
242 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
243 ret = localos.system(cmd, timeout)
246 print "\tFAILED TWICE"
248 raise Exception("Failed twice trying to login with updated ssh host key")
251 # KILL any already running servers.
252 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
# Remote heredoc: kill stale Rpyc servers, then launch a fresh forking_server
# in the background, logging progress markers to out.log.
253 (ov,ev) = ssh.run_noexcept2("""<<\EOF
255 echo "kill server" >> out.log
256 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
257 echo "export" >> out.log
258 export PYTHONPATH=$HOME ;
259 echo "start server" >> out.log
260 python Rpyc/Servers/forking_server.py &> server.log &
261 echo "done" >> out.log
263 #cmd = """ssh %(user)s@%(hostname)s """ + \
264 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
266 #if self.verbose: print cmd
268 #print localos.system(cmd,timeout)
270 ## START a new rpyc server.
271 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
272 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
274 #if self.verbose: print cmd
275 #print localos.system(cmd,timeout)
276 print "setup rpyc server over ssh"
280 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
281 # and the following options seems to work well.
# LocalCommand prints "READY" on our side once the connection is up, giving a
# synchronization point; -N/-L make this a pure port-forward process.
282 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
283 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
284 """-o ConnectTimeout=120 """ + \
285 """-n -N -L %(port)s:localhost:18812 """ + \
286 """%(user)s@%(hostname)s"""
288 if self.verbose: print cmd
290 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
291 # TODO: the read() here may block indefinitely. Need a better
292 # approach therefore, that includes a timeout.
293 #ret = self.command.stdout.read(5)
# Wait (bounded) for the "READY" marker from LocalCommand.
294 ret = moncommands.read_t(self.command.stdout, 5)
298 # NOTE: There is still a slight race for machines that are slow...
# Give the remote server settling time proportional to the observed latency.
299 self.timeout = 2*(t2-t1)
300 print "Sleeping for %s sec" % self.timeout
301 time.sleep(self.timeout)
# If the ssh process already exited, the tunnel never came up.
304 if self.command.returncode is not None:
305 print "Failed to establish tunnel!"
306 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
# ssh still running but "READY" was never seen: state unknown, bail out.
308 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Teardown path: kill the forwarding ssh process for this session's port.
312 if self.verbose: print "Killing SSH session %s" % self.port
313 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Project one column out of a list of step tuples.

    Each entry of *steps* is a tuple such as (id, pattern); with the default
    index=1 this yields the pattern column, ready to hand to pexpect's
    expect() (equivalent to Python 2's map(lambda x: x[index], steps)).
    """
    return [entry[index] for entry in steps]
# Map a pexpect match index back to its step identifier (steps[index][0]).
# The branch handling index == len(steps) — the appended pexpect.EOF match —
# is in lines elided from this excerpt (presumably returning "done").
320 def index_to_id(steps,index):
321 if index < len(steps):
322 return steps[index][0]
# DebugInterface — drives the diagnosis of a single node stuck in debug
# state: builds a PlanetLabSession/NodeConnection, and knows the pexpect
# patterns and known log "sequences" used to pick a repair action.
326 class DebugInterface:
327 def __init__(self, hostname):
328 self.hostname = hostname
# Establish a NodeConnection to this host: refresh known_hosts, build a
# PlanetLabSession, then fetch a connection — retrying once with a longer
# wait, since setup_host()'s settling sleep is sometimes too short.
331 def getConnection(self):
332 print "Creating session for %s" % self.hostname
333 # update known_hosts file (in case the node has rebooted since last run)
335 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
337 print traceback.print_exc() below is inside an elided except — see source
# Build the lookup table mapping a known BootManager log "sequence" string
# (the dash-joined step ids produced in restore()) to the repair action that
# restore() should take for it.
373 def getSequences(self):
375 # TODO: This can be replaced with a DB definition at a future time.
376 # This would make it possible for an admin to introduce new
377 # patterns without touching code.
380 # restart_bootmanager_boot
381 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
382 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
383 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
385 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
387 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
388 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
389 "bminit-cfg-auth-getplc-update-debug-done",
390 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
391 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
392 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
393 "bminit-cfg-auth-protoerror-exception-update-debug-done",
394 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
395 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
397 sequences.update({n : "restart_bootmanager_boot"})
# Sequences that warrant a reinstall ('rins') via BootManager.
399 # conn.restart_bootmanager('rins')
400 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
401 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
402 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
403 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
404 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
405 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
406 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
407 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
408 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
409 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
410 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
411 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
412 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
413 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
414 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
415 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
416 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
417 # actual solution appears to involve removing the bad files, and
418 # continually trying to boot the node.
419 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
420 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
421 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
423 sequences.update({n : "restart_bootmanager_rins"})
# Authentication failure pattern: attempt a node-key repair.
426 sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
# Sequences that warrant a full node restart into reinstall.
428 # conn.restart_node('rins')
429 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
430 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
431 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
432 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
433 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
434 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
435 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
436 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
437 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
438 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
439 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
440 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
442 sequences.update({n : "restart_node_rins"})
# Sequences that warrant a plain restart back into 'boot'.
445 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
446 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
447 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
448 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
449 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
450 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
452 sequences.update({n: "restart_node_boot"})
454 # update_node_config_email
455 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
456 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
457 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
459 sequences.update({n : "update_node_config_email"})
# Hostname-resolution problems: notify about network configuration.
461 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
462 "bminit-cfg-update-exception-nodehostname-update-debug-done",
464 sequences.update({n : "nodenetwork_email"})
466 # update_bootcd_email
467 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
468 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
469 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
470 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
471 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
473 sequences.update({n : "update_bootcd_email"})
# Suspicious-but-unclassified failure: send the suspect-error notice.
475 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
477 sequences.update({n: "suspect_error_email"})
479 # update_hardware_email
480 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
481 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
483 # broken_hardware_email
484 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# DNS-failure patterns: notify owners their DNS is broken.
488 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
489 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
491 sequences.update( { n : "bad_dns_email"})
# Return the (id, regex) pairs used to scan dmesg output for disk-related
# failures; the ids feed into the sequence set examined in restore().
495 def getDiskSteps(self):
497 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
498 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
499 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
501 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
503 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
504 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
506 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
507 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
509 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
510 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
512 ('floppytimeout','floppy0: floppy timeout called'),
513 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
# The comments below are sample dmesg lines the patterns above were built from.
515 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
516 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
518 # floppy0: floppy timeout called
519 # end_request: I/O error, dev fd0, sector 0
521 # Buffer I/O error on device dm-2, logical block 8888896
522 # ata1: status=0x51 { DriveReady SeekComplete Error }
523 # ata1: error=0x40 { UncorrectableError }
524 # SCSI error : <0 0 0 0> return code = 0x8000002
525 # sda: Current: sense key: Medium Error
526 # Additional sense: Unrecovered read error - auto reallocate failed
528 # SCSI error : <0 2 0 0> return code = 0x40001
529 # end_request: I/O error, dev sda, sector 572489600
# Repeatedly match the disk-error patterns against the dmesg stream (a
# fdpexpect child) and collect the ids of every pattern seen; the loop and
# accumulator lines surrounding this expect() call are elided in this excerpt.
533 def getDiskSequence(self, steps, child):
536 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
# Return the ordered (id, pattern) pairs that identify each BootManager log
# milestone or error; matched ids get dash-joined into the sequence strings
# that getSequences() keys on.
543 def getBootManagerStepPatterns(self):
545 ('bminit' , 'Initializing the BootManager.'),
546 ('cfg' , 'Reading node configuration file.'),
547 ('auth' , 'Authenticating node with PLC.'),
548 ('getplc' , 'Retrieving details of node from PLC.'),
549 ('update' , 'Updating node boot state at PLC.'),
550 ('hardware' , 'Checking if hardware requirements met.'),
551 ('installinit' , 'Install: Initializing.'),
552 ('installdisk' , 'Install: partitioning disks.'),
553 ('installbootfs', 'Install: bootstrapfs tarball.'),
554 ('installcfg' , 'Install: Writing configuration files.'),
555 ('installstop' , 'Install: Shutting down installer.'),
556 ('update2' , 'Updating node boot state at PLC.'),
557 ('installinit2' , 'Install: Initializing.'),
558 ('validate' , 'Validating node installation.'),
559 ('rebuildinitrd', 'Rebuilding initrd'),
560 ('netcfg' , 'Install: Writing Network Configuration files.'),
561 ('update3' , 'Updating node configuration.'),
562 ('disk' , 'Checking for unused disks to add to LVM.'),
563 ('update4' , 'Sending hardware configuration to PLC.'),
564 ('debug' , 'Starting debug mode'),
565 ('bmexceptmount', 'BootManagerException during mount'),
566 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
567 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
568 ('exception' , 'Exception'),
569 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
570 ('protoerror' , 'XML RPC protocol error'),
571 ('nodehostname' , 'Configured node hostname does not resolve'),
572 ('implementerror', 'Implementation Error'),
573 ('readonlyfs' , '[Errno 30] Read-only file system'),
574 ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
575 ('noinstall' , 'notinstalled'),
576 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
577 ('noblockdev' , "No block devices detected."),
578 ('dnserror' , 'Name or service not known'),
579 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
580 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
581 ('hardwarerequirefail' , 'Hardware requirements not met'),
582 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
583 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
584 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
585 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
586 ('modulefail' , 'Unable to get list of system modules'),
587 ('writeerror' , 'write error: No space left on device'),
588 ('nospace' , "No space left on device"),
589 ('nonode' , 'Failed to authenticate call: No such node'),
590 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
591 ('bootcheckfail' , 'BootCheckAuthentication'),
592 ('bootupdatefail' , 'BootUpdateNode'),
# Walk the bm.log stream (a fdpexpect child), repeatedly matching the step
# patterns and translating each match index to a step id; the enclosing loop,
# accumulator, and return are in lines elided from this excerpt.
596 def getBootManagerSequenceFromLog(self, steps, child):
600 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
601 id = index_to_id(steps,index)
604 if id == "exception":
605 print "...Found An Exception!!!"
# EOF (index == len(steps)) maps to "done", terminating the scan.
606 elif id == "done": #index == len(steps_to_list(steps)):
# restore — the main per-node recovery driver. Given the site history record
# and a hostname: bail out if the boot CD is too old (notify + disable),
# otherwise connect to the node, scan dmesg for disk errors, scan bm.log into
# a dash-joined step sequence, and dispatch the repair action that
# DebugInterface.getSequences() maps that sequence to (restart BootManager,
# reinstall, repair keys, or send one of several owner notifications).
# Returns False on connection failure; other returns are in elided lines.
613 def restore(sitehist, hostname, config=None, forced_action=None):
615 # NOTE: Nothing works if the bootcd is REALLY old.
616 # So, this is the first step.
618 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
619 recent_actions = sitehist.getRecentActions(hostname=hostname)
# Old boot image: notify (rate-limited to once per 3 days) and disable.
621 if fbnode['observed_category'] == "OLDBOOTCD":
622 print "\t...Notify owner to update BootImage!!!"
624 if not found_within(recent_actions, 'newbootcd_notice', 3):
625 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
627 print "\tDisabling %s due to out-of-date BootImage" % hostname
628 api.UpdateNode(hostname, {'boot_state' : 'disable'})
630 # NOTE: nothing else is possible.
633 debugnode = DebugInterface(hostname)
634 conn = debugnode.getConnection()
635 #print "conn: %s" % conn
636 #print "trying to use conn after returning it."
637 #print conn.c.modules.sys.path
638 #print conn.c.modules.os.path.exists('/tmp/source')
# getConnection returns False on failure; propagate that.
639 if type(conn) == type(False): return False
641 #if forced_action == "reboot":
642 # conn.restart_node('rins')
# Only nodes actually sitting in 'debug' state are handled here.
645 boot_state = conn.get_boot_state()
646 if boot_state != "debug":
647 print "... %s in %s state: skipping..." % (hostname , boot_state)
648 return boot_state == "boot"
650 if conn.bootmanager_running():
651 print "...BootManager is currently running. Skipping host %s" %hostname
654 # Read persistent flags, tagged on one week intervals.
# Phase 1: scan dmesg for disk-hardware errors.
656 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
657 dmesg = conn.get_dmesg()
658 child = fdpexpect.fdspawn(dmesg)
660 steps = debugnode.getDiskSteps()
661 sequence = debugnode.getDiskSequence(steps, child)
664 if config and not config.quiet: print "\tSET: ", s
# Drive errors found: a lone floppy error is tolerated, anything else gets
# a (rate-limited) baddisk notice and the node disabled.
667 print "...Potential drive errors on %s" % hostname
668 if len(s) == 2 and 'floppyerror' in s:
669 print "...Should investigate. Continuing with node."
671 print "...Should investigate. Skipping node."
672 # TODO: send message related to these errors.
674 if not found_within(recent_actions, 'newbootcd_notice', 3):
676 log=conn.get_dmesg().read()
677 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
678 conn.set_nodestate('disable')
# Phase 2: scan bm.log into a step sequence.
682 print "...Downloading bm.log from %s" %hostname
683 log = conn.get_bootmanager_log()
684 child = fdpexpect.fdspawn(log)
# --collect mode: stop after gathering logs, take no action.
686 if hasattr(config, 'collect') and config.collect: return True
688 if config and not config.quiet: print "...Scanning bm.log for errors"
692 steps = debugnode.getBootManagerStepPatterns()
693 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
695 s = "-".join(sequence)
696 print " FOUND SEQUENCE: ", s
698 # NOTE: We get or set the flag based on the current sequence identifier.
699 # By using the sequence identifier, we guarantee that there will be no
700 # frequent loops. I'm guessing there is a better way to track loops,
703 sequences = debugnode.getSequences()
# Unknown sequence: notify with the full bm.log and retry a plain boot.
706 if s not in sequences:
707 print " HOST %s" % hostname
708 print " UNKNOWN SEQUENCE: %s" % s
711 args['hostname'] = hostname
713 args['bmlog'] = conn.get_bootmanager_log().read()
714 args['viart'] = False
716 sitehist.sendMessage('unknownsequence_notice', **args)
718 conn.restart_bootmanager('boot')
720 # NOTE: Do not set the pflags value for this sequence if it's unknown.
721 # This way, we can check it again after we've fixed it.
# Known sequence: dispatch on the mapped action name.
726 if sequences[s] == "restart_bootmanager_boot":
727 print "...Restarting BootManager.py on %s "%hostname
728 conn.restart_bootmanager('boot')
729 elif sequences[s] == "restart_bootmanager_rins":
730 print "...Restarting BootManager.py on %s "%hostname
731 conn.restart_bootmanager('rins')
732 elif sequences[s] == "restart_node_rins":
733 conn.restart_node('rins')
734 elif sequences[s] == "restart_node_boot":
735 conn.restart_node('boot')
736 elif sequences[s] == "repair_node_keys":
737 if conn.compare_and_repair_nodekeys():
738 # the keys either are in sync or were forced in sync.
739 # so try to reboot the node again.
740 conn.restart_bootmanager('rins')
743 # there was some failure to synchronize the keys.
744 print "...Unable to repair node keys on %s" %hostname
746 elif sequences[s] == "suspect_error_email":
748 args['hostname'] = hostname
750 args['bmlog'] = conn.get_bootmanager_log().read()
751 args['viart'] = False
753 sitehist.sendMessage('unknownsequence_notice', **args)
754 conn.restart_bootmanager('boot')
756 # TODO: differentiate this and the 'nodenetwork_email' actions.
757 elif sequences[s] == "update_node_config_email":
759 if not found_within(recent_actions, 'nodeconfig_notice', 3):
761 args['hostname'] = hostname
762 sitehist.sendMessage('nodeconfig_notice', **args)
763 conn.dump_plconf_file()
765 elif sequences[s] == "nodenetwork_email":
767 if not found_within(recent_actions, 'nodeconfig_notice', 3):
769 args['hostname'] = hostname
770 args['bmlog'] = conn.get_bootmanager_log().read()
771 sitehist.sendMessage('nodeconfig_notice', **args)
772 conn.dump_plconf_file()
774 elif sequences[s] == "update_bootcd_email":
776 if not found_within(recent_actions, 'newalphacd_notice', 3):
778 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
779 args['hostname'] = hostname
781 sitehist.sendMessage('newalphacd_notice', **args)
783 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
785 elif sequences[s] == "broken_hardware_email":
786 # MAKE An ACTION record that this host has failed hardware. May
787 # require either an exception "/minhw" or other manual intervention.
788 # Definitely need to send out some more EMAIL.
789 # TODO: email notice of broken hardware
790 if not found_within(recent_actions, 'baddisk_notice', 1):
791 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
793 args['hostname'] = hostname
794 args['log'] = conn.get_dmesg().read()
796 sitehist.sendMessage('baddisk_notice', **args)
797 conn.set_nodestate('disable')
799 elif sequences[s] == "update_hardware_email":
800 if not found_within(recent_actions, 'minimalhardware_notice', 1):
801 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
803 args['hostname'] = hostname
804 args['bmlog'] = conn.get_bootmanager_log().read()
805 sitehist.sendMessage('minimalhardware_notice', **args)
807 elif sequences[s] == "bad_dns_email":
808 if not found_within(recent_actions, 'baddns_notice', 1):
809 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# Look up the node's network record at PLC to include in the notice.
812 node = plccache.GetNodeByName(hostname)
813 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
816 print traceback.print_exc()
817 # TODO: api error. skip email, b/c all info is not available,
818 # flag_set will not be recorded.
820 nodenet_str = network_config_to_str(net)
822 args['hostname'] = hostname
823 args['network_config'] = nodenet_str
824 args['nodenetwork_id'] = net['nodenetwork_id']
826 sitehist.sendMessage('baddns_notice', **args)
831 # MAIN -------------------------------------------------------------------
# Command-line entry setup: build the option parser, layer on the shared
# 'nodesets'/'defaults' option groups, parse into the module-level config,
# and derive the node list either from --nodelist (a file) or a single node.
834 from monitor import parser as parsermodule
835 parser = parsermodule.getParser()
837 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
838 force=None, quiet=False)
839 parser.add_option("", "--child", dest="child", action="store_true",
840 help="This is the child mode of this process.")
841 parser.add_option("", "--force", dest="force", metavar="boot_state",
842 help="Force a boot state passed to BootManager.py.")
843 parser.add_option("", "--quiet", dest="quiet", action="store_true",
844 help="Extra quiet output messages.")
845 parser.add_option("", "--verbose", dest="verbose", action="store_true",
846 help="Extra debug output messages.")
847 parser.add_option("", "--nonet", dest="nonet", action="store_true",
848 help="Do not setup the network, use existing log files to re-run a test pass.")
849 parser.add_option("", "--collect", dest="collect", action="store_true",
850 help="No action, just collect dmesg, and bm.log")
851 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
852 help="Do not perform the orginary setup phase.")
854 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
855 config = parsermodule.parse_args(parser)
# Node list source: a file of hostnames, or the single --node argument.
858 nodes = config.getListFromFile(config.nodelist)
860 nodes = [ config.node ]
868 if __name__ == "__main__":