3 # Attempt to reboot a node in debug state.
16 from monitor.getsshkeys import SSHKnownHosts
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
# Module-level authenticated PLC XML-RPC handle, created once at import time
# and shared by every helper below (UpdateNode / GetNodeNetworks calls).
41 api = plc.getAuthAPI()
# NodeConnection.__init__ — stores the Rpyc connection, the node hostname,
# and the monitor config for the remote-control helpers below.
# NOTE(review): the assignment body is not visible in this excerpt; confirm
# against the full source.
46 def __init__(self, connection, node, config):
# Classify the node's current state by probing its filesystem through the
# Rpyc proxy modules: /tmp/source present suggests the BootManager/debug
# environment; /vservers present suggests a normally-booted node.
51 def get_boot_state(self):
53 if self.c.modules.os.path.exists('/tmp/source'):
55 elif self.c.modules.os.path.exists('/vservers'):
# Diagnostic fallback; the branch bodies / return values are on lines not
# visible in this excerpt — presumably returning 'debug'/'boot'/unknown.
61 print self.c.modules.sys.path
# Tail of get_dmesg (its def line is not visible in this excerpt): snapshot
# the node's kernel ring buffer to a file on the node, download it into the
# local log/ directory, and open the local copy for reading.
69 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
70 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
71 log = open("log/dmesg.%s.log" % self.node, 'r')
# Fetch /tmp/bm.log (the BootManager run log) from the node and open a local
# copy for parsing with fdpexpect.
74 def get_bootmanager_log(self):
75 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
# NOTE(review): despite the .gz name the file is only copied, not
# decompressed (the zcat line is deliberately commented out) — presumably
# bm.log is plain text on the node; confirm.
76 #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
77 os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
78 log = open("log/bm.%s.log" % self.node, 'r')
# Run BootManager's initialization and node-configuration-read steps on the
# node (remote modules via Rpyc) and dump the resulting bm.VARS for
# debugging the node's on-disk/floppy configuration file.
81 def dump_plconf_file(self):
83 self.c.modules.sys.path.append("/tmp/source/")
84 self.c.modules.os.chdir('/tmp/source')
# NOTE(review): the following lines use a bare `c` where sibling methods use
# `self.c` — presumably `c = self.c` on a line not visible in this excerpt,
# otherwise this is a NameError; confirm against the full source.
86 log = c.modules.BootManager.log('/tmp/new.log')
87 bm = c.modules.BootManager.BootManager(log,'boot')
89 BootManagerException = c.modules.Exceptions.BootManagerException
90 InitializeBootManager = c.modules.BootManager.InitializeBootManager
91 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
94 InitializeBootManager.Run(bm.VARS, bm.LOG)
95 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
# The exception handlers for this try are on lines not shown here.
99 print " Possibly, unable to find valid configuration file"
# Success path: dump every VAR the BootManager read from the config file.
102 for key in bm.VARS.keys():
103 print key, " == ", bm.VARS[key]
105 print " Unable to read Node Configuration"
# Compare the NODE_KEY configured on the node with the key PLC has on
# record; on mismatch, push the node's key up to PLC via api.UpdateNode.
# Visible success paths suggest a truthy return when keys match or were
# repaired; the return statements themselves are not shown in this excerpt.
108 def compare_and_repair_nodekeys(self):
110 self.c.modules.sys.path.append("/tmp/source/")
111 self.c.modules.os.chdir('/tmp/source')
# NOTE(review): bare `c` here (vs `self.c` elsewhere) — presumably aliased
# on a line not visible in this excerpt; confirm.
113 log = c.modules.BootManager.log('/tmp/new.log')
114 bm = c.modules.BootManager.BootManager(log,'boot')
116 BootManagerException = c.modules.Exceptions.BootManagerException
117 InitializeBootManager = c.modules.BootManager.InitializeBootManager
118 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's cached record supplies the authoritative 'key' for comparison.
121 plcnode = plccache.GetNodeByName(self.node)
123 InitializeBootManager.Run(bm.VARS, bm.LOG)
124 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
129 print " Possibly, unable to find valid configuration file"
132 print " NODE: %s" % bm.VARS['NODE_KEY']
133 print " PLC : %s" % plcnode['key']
135 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Mismatch path: overwrite PLC's record with the key the node actually has.
138 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
139 print " Successfully updated NODE_KEY with PLC"
144 #for key in bm.VARS.keys():
145 # print key, " == ", bm.VARS[key]
147 print " Unable to retrieve NODE_KEY"
# Report whether a BootManager instance is already active on the node, via
# its /tmp/BM_RUNNING lock file (created/removed by restart_bootmanager).
# The return statements are on lines not visible in this excerpt.
149 def bootmanager_running(self):
150 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Record the requested boot_state (default 'boot') for this node at PLC."""
    update = {'boot_state': state}
    return api.UpdateNode(self.node, update)
# Reboot the node, gently first. Records the desired boot_state at PLC,
# then: if no 'gentlekill' flag was set recently, kill all slice processes
# and schedule a clean `shutdown -r`; the alternate path escalates to a raw
# sysrq s/u/b (sync, unmount, reboot) sequence.
158 def restart_node(self, state='boot'):
159 api.UpdateNode(self.node, {'boot_state' : state})
# Persist flags expire after one day, bounding how often the gentle path
# is retried per node.
161 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
162 if not pflags.getRecentFlag('gentlekill'):
163 print " Killing all slice processes... : %s" % self.node
# vkill -s 9 every vserver context id found under /proc/virtual.
164 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
165 self.c.modules.os.system(cmd_slicekill)
166 cmd = """ shutdown -r +1 & """
167 print " Restarting %s : %s" % ( self.node, cmd)
168 self.c.modules.os.system(cmd)
# Remember that a gentle kill was attempted recently.
# NOTE(review): lines between the two branches are missing from this
# excerpt; confirm which branch sets the flag in the full source.
170 pflags.setRecentFlag('gentlekill')
# Harsh path: trigger sync/unmount/reboot through /proc/sysrq-trigger.
173 print " Restarting with sysrq 'sub' %s" % self.node
174 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
175 self.c.modules.os.system(cmd)
# Re-run BootManager.py on the node with a forced boot state, refusing to
# start while another run holds the /tmp/BM_RUNNING lock file.
179 def restart_bootmanager(self, forceState):
181 self.c.modules.os.chdir('/tmp/source')
182 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
183 print " BootManager is already running: try again soon..."
185 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
# The lock file brackets the BootManager run; removing it re-enables later
# restarts. The command's closing piece is on a line not shown here.
186 cmd = "( touch /tmp/BM_RUNNING ; " + \
187 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
188 " rm -f /tmp/BM_RUNNING " + \
190 cmd = cmd % forceState
191 self.c.modules.os.system(cmd)
# Manages one monitoring session to a node: rsyncs the Rpyc server code to
# the host, (re)starts a forking Rpyc server there, and opens a local SSH
# port-forward to it so NodeConnection can talk to localhost:<port>.
196 class PlanetLabSession:
# Base local port for tunnels; randomized so concurrent monitor processes
# are unlikely to collide, then incremented once per session.
197 globalport = 22000 + int(random.random()*1000)
199 def __init__(self, node, nosetup, verbose):
200 self.verbose = verbose
203 self.nosetup = nosetup
# Wrap the established tunnel in a NodeConnection bound to this session's
# forwarded local port.
207 def get_connection(self, config):
208 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
211 # print i, conn.c.modules.sys.path
212 # print conn.c.modules.os.path.exists('/tmp/source')
# Full host setup: claim a local port, copy Rpyc to the node, restart the
# remote Rpyc server, then open the SSH tunnel and wait for it to be ready.
217 def setup_host(self):
218 self.port = PlanetLabSession.globalport
219 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
# args dict initialization is on a line not shown in this excerpt.
222 args['port'] = self.port
223 args['user'] = 'root'
224 args['hostname'] = self.node
225 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
# nosetup short-circuit (its guard is on a line not shown here).
229 print "Skipping setup"
232 # COPY Rpyc files to host
233 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
234 if self.verbose: print cmd
238 localos = moncommands.CMD()
240 ret = localos.system(cmd, timeout)
# On an ssh host-key failure, refresh known_hosts for this node once and
# retry; a second failure is fatal.
243 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
244 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
245 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
246 ret = localos.system(cmd, timeout)
249 print "\tFAILED TWICE"
251 raise Exception("Failed twice trying to login with updated ssh host key")
254 # KILL any already running servers.
255 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
# Heredoc executed on the node: kill stale Rpyc servers, point PYTHONPATH
# at $HOME, and start a fresh forking Rpyc server in the background.
256 (ov,ev) = ssh.run_noexcept2("""<<\EOF
258 echo "kill server" >> out.log
259 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
260 echo "export" >> out.log
261 export PYTHONPATH=$HOME ;
262 echo "start server" >> out.log
263 python Rpyc/Servers/forking_server.py &> server.log &
264 echo "done" >> out.log
266 #cmd = """ssh %(user)s@%(hostname)s """ + \
267 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
269 #if self.verbose: print cmd
271 #print localos.system(cmd,timeout)
273 ## START a new rpyc server.
274 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
275 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
277 #if self.verbose: print cmd
278 #print localos.system(cmd,timeout)
279 print "setup rpyc server over ssh"
283 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
284 # and the following options seems to work well.
# LocalCommand='echo "READY"' prints once the connection is up, giving a
# synchronous readiness signal; ExitOnForwardFailure makes a failed -L
# forward fatal; -N keeps the session open without running a command.
285 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
286 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
287 """-o ConnectTimeout=120 """ + \
288 """-n -N -L %(port)s:localhost:18812 """ + \
289 """%(user)s@%(hostname)s"""
291 if self.verbose: print cmd
293 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
294 # TODO: the read() here may block indefinitely. Need a better
295 # approach therefore, that includes a timeout.
296 #ret = self.command.stdout.read(5)
# read_t: bounded read (5s) expecting the "READY" marker from LocalCommand.
297 ret = moncommands.read_t(self.command.stdout, 5)
301 # NOTE: There is still a slight race for machines that are slow...
# t1/t2 (assigned on lines not shown) bracket the connect; sleep twice that
# long so slow hosts can finish coming up before we use the tunnel.
302 self.timeout = 2*(t2-t1)
303 print "Sleeping for %s sec" % self.timeout
304 time.sleep(self.timeout)
# If ssh already exited, the tunnel never came up; otherwise it is running
# but never printed READY, which is treated as an unknown failure.
307 if self.command.returncode is not None:
308 print "Failed to establish tunnel!"
309 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
311 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Session teardown (its def line is not visible in this excerpt): kill the
# SSH tunnel process for this session's port.
315 if self.verbose: print "Killing SSH session %s" % self.port
316 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Project one field from each (id, pattern) step tuple.

    steps : sequence of tuples, e.g. [('bminit', 'Initializing...'), ...]
    index : which tuple field to extract (default 1, the regex pattern).

    Returns a real list rather than a lazy map object, because callers
    concatenate the result with another list, e.g.
        child.expect(steps_to_list(steps) + [pexpect.EOF])
    Under Python 3 the previous map()-based version would return an
    iterator and break that concatenation; the list comprehension is
    byte-identical in behavior under Python 2 and correct under Python 3.
    """
    return [step[index] for step in steps]
# Map a pexpect match index back to its step identifier (steps[i][0]);
# indexes at or past len(steps) correspond to the appended pexpect.EOF
# entry in the callers' pattern lists.
323 def index_to_id(steps,index):
324 if index < len(steps):
325 return steps[index][0]
# NOTE(review): the out-of-range branch (presumably returning a sentinel
# such as "done") is on lines not visible in this excerpt — confirm.
# Drives diagnosis of a node stuck in debug state: builds a PlanetLabSession
# to it, scans dmesg for disk errors and bm.log for the sequence of
# BootManager steps reached, and maps that sequence to a repair action.
329 class DebugInterface:
330 def __init__(self, hostname):
331 self.hostname = hostname
# Create (or recreate) a PlanetLabSession and return a NodeConnection, with
# one retry after a longer sleep when the tunnel was not ready in time.
334 def getConnection(self):
335 print "Creating session for %s" % self.hostname
336 # update known_hosts file (in case the node has rebooted since last run)
338 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
340 print traceback.print_exc()
# NOTE(review): two alternative session constructions are visible — one
# hard-coded (False, True) and one driven by config; the guard selecting
# between them is on lines not shown in this excerpt.
346 self.session = PlanetLabSession(self.hostname, False, True)
348 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
350 msg = "ERROR setting up session for %s" % self.hostname
352 traceback.print_exc()
357 conn = self.session.get_connection(config)
359 # NOTE: sometimes the wait in setup_host() is not long enough.
360 # So, here we try to wait a little longer before giving up entirely.
362 time.sleep(self.session.timeout*5)
363 conn = self.session.get_connection(config)
365 traceback.print_exc()
366 email_exception(self.hostname)
368 #print "trying to use conn before returning it."
369 #print conn.c.modules.sys.path
370 #print conn.c.modules.os.path.exists('/tmp/source')
373 #print "conn: %s" % conn
# Build the table mapping each known BootManager step sequence (joined with
# '-') to the named repair action executed by restore().
376 def getSequences(self):
378 # TODO: This can be replaced with a DB definition at a future time.
379 # This would make it possible for an admin to introduce new
380 # patterns without touching code.
383 # restart_bootmanager_boot
384 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
385 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
386 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
388 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
390 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
391 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
392 "bminit-cfg-auth-getplc-update-debug-done",
393 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
394 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
395 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
396 "bminit-cfg-auth-protoerror-exception-update-debug-done",
397 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
398 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
400 sequences.update({n : "restart_bootmanager_boot"})
# Sequences that warrant a reinstall via BootManager ('rins').
402 # conn.restart_bootmanager('rins')
403 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
404 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
405 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
406 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
407 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
408 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
409 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
410 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
411 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
412 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
413 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
414 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
415 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
416 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
417 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
418 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
419 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
420 # actual solution appears to involve removing the bad files, and
421 # continually trying to boot the node.
422 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
423 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
424 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
426 sequences.update({n : "restart_bootmanager_rins"})
# Node key mismatch: repair keys before anything else.
429 sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
# Full node restart into reinstall state.
431 # conn.restart_node('rins')
432 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
433 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
434 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
435 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
436 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
437 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
438 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
439 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
440 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
441 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
442 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
443 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
445 sequences.update({n : "restart_node_rins"})
# Full node restart back into normal boot state.
448 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
449 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
450 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
451 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
452 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
453 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
455 sequences.update({n: "restart_node_boot"})
457 # update_node_config_email
458 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
459 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
460 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
462 sequences.update({n : "update_node_config_email"})
# Hostname-resolution problems in the node config: notify the site.
464 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
465 "bminit-cfg-update-exception-nodehostname-update-debug-done",
467 sequences.update({n : "nodenetwork_email"})
469 # update_bootcd_email
470 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
471 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
472 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
473 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
474 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
476 sequences.update({n : "update_bootcd_email"})
# Suspicious-but-unclassified failure: escalate via email.
478 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
480 sequences.update({n: "suspect_error_email"})
482 # update_hardware_email
483 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
484 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
486 # broken_hardware_email
487 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# DNS failures seen during boot: notify the site.
491 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
492 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
494 sequences.update( { n : "bad_dns_email"})
# (id, regex) pairs matched against dmesg to detect failing disks and other
# hardware errors; the commented examples below show sample raw log lines.
498 def getDiskSteps(self):
500 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
501 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
502 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
504 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
506 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
507 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
509 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
510 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
512 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
513 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
515 ('floppytimeout','floppy0: floppy timeout called'),
516 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
518 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
519 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
521 # floppy0: floppy timeout called
522 # end_request: I/O error, dev fd0, sector 0
524 # Buffer I/O error on device dm-2, logical block 8888896
525 # ata1: status=0x51 { DriveReady SeekComplete Error }
526 # ata1: error=0x40 { UncorrectableError }
527 # SCSI error : <0 0 0 0> return code = 0x8000002
528 # sda: Current: sense key: Medium Error
529 # Additional sense: Unrecovered read error - auto reallocate failed
531 # SCSI error : <0 2 0 0> return code = 0x40001
532 # end_request: I/O error, dev sda, sector 572489600
# Scan the dmesg stream (a pexpect child) collecting every disk-error step
# id matched, until EOF; the surrounding loop is on lines not shown here.
536 def getDiskSequence(self, steps, child):
539 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
# (id, pattern) pairs, in on-screen order, for every recognizable line the
# BootManager writes to bm.log; matched ids become the sequence string that
# getSequences() keys repair actions on.
546 def getBootManagerStepPatterns(self):
548 ('bminit' , 'Initializing the BootManager.'),
549 ('cfg' , 'Reading node configuration file.'),
550 ('auth' , 'Authenticating node with PLC.'),
551 ('getplc' , 'Retrieving details of node from PLC.'),
552 ('update' , 'Updating node boot state at PLC.'),
553 ('hardware' , 'Checking if hardware requirements met.'),
554 ('installinit' , 'Install: Initializing.'),
555 ('installdisk' , 'Install: partitioning disks.'),
556 ('installbootfs', 'Install: bootstrapfs tarball.'),
557 ('installcfg' , 'Install: Writing configuration files.'),
558 ('installstop' , 'Install: Shutting down installer.'),
559 ('update2' , 'Updating node boot state at PLC.'),
560 ('installinit2' , 'Install: Initializing.'),
561 ('validate' , 'Validating node installation.'),
562 ('rebuildinitrd', 'Rebuilding initrd'),
563 ('netcfg' , 'Install: Writing Network Configuration files.'),
564 ('update3' , 'Updating node configuration.'),
565 ('disk' , 'Checking for unused disks to add to LVM.'),
566 ('update4' , 'Sending hardware configuration to PLC.'),
567 ('debug' , 'Starting debug mode'),
568 ('bmexceptmount', 'BootManagerException during mount'),
569 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
570 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
571 ('exception' , 'Exception'),
572 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
573 ('protoerror' , 'XML RPC protocol error'),
574 ('nodehostname' , 'Configured node hostname does not resolve'),
575 ('implementerror', 'Implementation Error'),
576 ('readonlyfs' , '[Errno 30] Read-only file system'),
577 ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
578 ('noinstall' , 'notinstalled'),
579 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
580 ('noblockdev' , "No block devices detected."),
581 ('dnserror' , 'Name or service not known'),
582 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
583 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
584 ('hardwarerequirefail' , 'Hardware requirements not met'),
585 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
586 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
587 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
588 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
589 ('modulefail' , 'Unable to get list of system modules'),
590 ('writeerror' , 'write error: No space left on device'),
591 ('nospace' , "No space left on device"),
592 ('nonode' , 'Failed to authenticate call: No such node'),
593 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
594 ('bootcheckfail' , 'BootCheckAuthentication'),
595 ('bootupdatefail' , 'BootUpdateNode'),
# Walk bm.log (a pexpect child) translating each matched pattern into its
# step id until the 'done' sentinel / EOF; the accumulation and return are
# on lines not visible in this excerpt.
599 def getBootManagerSequenceFromLog(self, steps, child):
603 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
604 id = index_to_id(steps,index)
607 if id == "exception":
608 print "...Found An Exception!!!"
609 elif id == "done": #index == len(steps_to_list(steps)):
# Top-level repair driver for one node in debug state: refuse to act on
# out-of-date BootCDs, connect, scan dmesg for disk errors, derive the
# BootManager step sequence from bm.log, and dispatch the repair action
# (restart/reinstall/notify) that getSequences() maps that sequence to.
616 def restore(sitehist, hostname, config=None, forced_action=None):
618 # NOTE: Nothing works if the bootcd is REALLY old.
619 # So, this is the first step.
621 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
622 recent_actions = sitehist.getRecentActions(hostname=hostname)
# Old BootImage: notify (at most once per 3 units of the found_within
# window), disable the node at PLC, and stop — nothing else can work.
624 if fbnode['observed_category'] == "OLDBOOTCD":
625 print "\t...Notify owner to update BootImage!!!"
627 if not found_within(recent_actions, 'newbootcd_notice', 3):
628 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
630 print "\tDisabling %s due to out-of-date BootImage" % hostname
631 api.UpdateNode(hostname, {'boot_state' : 'disable'})
633 # NOTE: nothing else is possible.
636 debugnode = DebugInterface(hostname)
637 conn = debugnode.getConnection()
638 #print "conn: %s" % conn
639 #print "trying to use conn after returning it."
640 #print conn.c.modules.sys.path
641 #print conn.c.modules.os.path.exists('/tmp/source')
# getConnection() returns False on failure; bail out here.
642 if type(conn) == type(False): return False
644 #if forced_action == "reboot":
645 # conn.restart_node('rins')
# Only nodes actually in debug state are processed; a node already in
# 'boot' counts as success.
648 boot_state = conn.get_boot_state()
649 if boot_state != "debug":
650 print "... %s in %s state: skipping..." % (hostname , boot_state)
651 return boot_state == "boot"
653 if conn.bootmanager_running():
654 print "...BootManager is currently running. Skipping host %s" %hostname
657 # Read persistent flags, tagged on one week intervals.
# Phase 1: scan dmesg for disk-error signatures.
659 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
660 dmesg = conn.get_dmesg()
661 child = fdpexpect.fdspawn(dmesg)
663 steps = debugnode.getDiskSteps()
664 sequence = debugnode.getDiskSequence(steps, child)
# `s` (the de-duplicated error set, built on lines not shown here) decides
# whether to continue: floppy-only errors are tolerated, anything else
# disables the node and notifies the site.
667 if config and not config.quiet: print "\tSET: ", s
670 print "...Potential drive errors on %s" % hostname
671 if len(s) == 2 and 'floppyerror' in s:
672 print "...Should investigate. Continuing with node."
674 print "...Should investigate. Skipping node."
675 # TODO: send message related to these errors.
# NOTE(review): this bad-disk path reuses the 'newbootcd_notice' throttle
# key while sending 'baddisk_notice' — confirm this is intentional.
677 if not found_within(recent_actions, 'newbootcd_notice', 3):
679 log=conn.get_dmesg().read()
680 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
681 conn.set_nodestate('disable')
# Phase 2: parse bm.log into a BootManager step sequence.
685 print "...Downloading bm.log from %s" %hostname
686 log = conn.get_bootmanager_log()
687 child = fdpexpect.fdspawn(log)
# --collect mode: logs gathered, take no repair action.
689 if hasattr(config, 'collect') and config.collect: return True
691 if config and not config.quiet: print "...Scanning bm.log for errors"
695 steps = debugnode.getBootManagerStepPatterns()
696 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
698 s = "-".join(sequence)
699 print " FOUND SEQUENCE: ", s
701 # NOTE: We get or set the flag based on the current sequence identifier.
702 # By using the sequence identifier, we guarantee that there will be no
703 # frequent loops. I'm guessing there is a better way to track loops,
706 sequences = debugnode.getSequences()
# Unknown sequence: email the details and make a conservative attempt to
# boot anyway; deliberately no pflags so it is re-examined next pass.
709 if s not in sequences:
710 print " HOST %s" % hostname
711 print " UNKNOWN SEQUENCE: %s" % s
714 args['hostname'] = hostname
716 args['bmlog'] = conn.get_bootmanager_log().read()
717 args['viart'] = False
719 sitehist.sendMessage('unknownsequence_notice', **args)
721 conn.restart_bootmanager('boot')
723 # NOTE: Do not set the pflags value for this sequence if it's unknown.
724 # This way, we can check it again after we've fixed it.
# Known sequence: dispatch on the mapped action name.
729 if sequences[s] == "restart_bootmanager_boot":
730 print "...Restarting BootManager.py on %s "%hostname
731 conn.restart_bootmanager('boot')
732 elif sequences[s] == "restart_bootmanager_rins":
733 print "...Restarting BootManager.py on %s "%hostname
734 conn.restart_bootmanager('rins')
735 elif sequences[s] == "restart_node_rins":
736 conn.restart_node('rins')
737 elif sequences[s] == "restart_node_boot":
738 conn.restart_node('boot')
739 elif sequences[s] == "repair_node_keys":
740 if conn.compare_and_repair_nodekeys():
741 # the keys either are in sync or were forced in sync.
742 # so try to reboot the node again.
743 conn.restart_bootmanager('rins')
746 # there was some failure to synchronize the keys.
747 print "...Unable to repair node keys on %s" %hostname
749 elif sequences[s] == "suspect_error_email":
751 args['hostname'] = hostname
753 args['bmlog'] = conn.get_bootmanager_log().read()
754 args['viart'] = False
756 sitehist.sendMessage('unknownsequence_notice', **args)
757 conn.restart_bootmanager('boot')
759 # TODO: differentiate this and the 'nodenetwork_email' actions.
760 elif sequences[s] == "update_node_config_email":
762 if not found_within(recent_actions, 'nodeconfig_notice', 3):
764 args['hostname'] = hostname
765 sitehist.sendMessage('nodeconfig_notice', **args)
766 conn.dump_plconf_file()
768 elif sequences[s] == "nodenetwork_email":
770 if not found_within(recent_actions, 'nodeconfig_notice', 3):
772 args['hostname'] = hostname
773 args['bmlog'] = conn.get_bootmanager_log().read()
774 sitehist.sendMessage('nodeconfig_notice', **args)
775 conn.dump_plconf_file()
777 elif sequences[s] == "update_bootcd_email":
779 if not found_within(recent_actions, 'newalphacd_notice', 3):
781 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
782 args['hostname'] = hostname
784 sitehist.sendMessage('newalphacd_notice', **args)
786 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
788 elif sequences[s] == "broken_hardware_email":
789 # MAKE An ACTION record that this host has failed hardware. May
790 # require either an exception "/minhw" or other manual intervention.
791 # Definitely need to send out some more EMAIL.
792 # TODO: email notice of broken hardware
793 if not found_within(recent_actions, 'baddisk_notice', 1):
794 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
796 args['hostname'] = hostname
797 args['log'] = conn.get_dmesg().read()
799 sitehist.sendMessage('baddisk_notice', **args)
800 conn.set_nodestate('disable')
802 elif sequences[s] == "update_hardware_email":
803 if not found_within(recent_actions, 'minimalhardware_notice', 1):
804 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
806 args['hostname'] = hostname
807 args['bmlog'] = conn.get_bootmanager_log().read()
808 sitehist.sendMessage('minimalhardware_notice', **args)
810 elif sequences[s] == "bad_dns_email":
811 if not found_within(recent_actions, 'baddns_notice', 1):
812 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
815 node = plccache.GetNodeByName(hostname)
816 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
817 print traceback.print_exc()
# NOTE(review): the line above is numbered 819 in the original; the
# surrounding try/except structure is not visible in this excerpt.
820 # TODO: api error. skip email, b/c all info is not available,
821 # flag_set will not be recorded.
823 nodenet_str = network_config_to_str(net)
825 args['hostname'] = hostname
826 args['network_config'] = nodenet_str
827 args['nodenetwork_id'] = net['nodenetwork_id']
829 sitehist.sendMessage('baddns_notice', **args)
834 # MAIN -------------------------------------------------------------------
# Command-line entry point: build the option parser, resolve the node list,
# and run restore() against each node's site-history record.
837 from monitor import parser as parsermodule
838 parser = parsermodule.getParser()
840 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
841 force=None, quiet=False)
842 parser.add_option("", "--child", dest="child", action="store_true",
843 help="This is the child mode of this process.")
844 parser.add_option("", "--force", dest="force", metavar="boot_state",
845 help="Force a boot state passed to BootManager.py.")
846 parser.add_option("", "--quiet", dest="quiet", action="store_true",
847 help="Extra quiet output messages.")
848 parser.add_option("", "--verbose", dest="verbose", action="store_true",
849 help="Extra debug output messages.")
850 parser.add_option("", "--nonet", dest="nonet", action="store_true",
851 help="Do not setup the network, use existing log files to re-run a test pass.")
852 parser.add_option("", "--collect", dest="collect", action="store_true",
853 help="No action, just collect dmesg, and bm.log")
854 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
855 help="Do not perform the orginary setup phase.")
857 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
858 config = parsermodule.parse_args(parser)
# Node list comes from a file (--nodelist) or a single --node argument; the
# selecting conditional is on lines not visible in this excerpt.
861 nodes = config.getListFromFile(config.nodelist)
863 nodes = [ config.node ]
# Per-node loop body: map hostname -> site loginbase, then repair.
870 lb = plccache.plcdb_hn2lb[node]
871 sitehist = SiteInterface.get_or_make(loginbase=lb)
872 #reboot(node, config)
# NOTE(review): config=None is passed even though a parsed `config` exists
# above — confirm whether restore() should receive it.
873 restore(sitehist, node, config=None, forced_action=None)
875 if __name__ == "__main__":