3 # Attempt to reboot a node in debug state.
16 from monitor.getsshkeys import SSHKnownHosts
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
# Module-global authenticated PLC XML-RPC API handle, used by every method below.
41 api = plc.getAuthAPI()
# Raised by PlanetLabSession.setup_host() when ssh login fails twice,
# even after refreshing the node's known_hosts entry.
45 class ExceptionDoubleSSHError(Exception): pass
# NodeConnection constructor: binds an Rpyc connection, the node hostname and
# the monitor config object.  (Assignment body, original lines 49-52, is
# elided from this listing -- presumably self.c/self.node/self.config.)
48 def __init__(self, connection, node, config):
# Classify the node's current state by probing its filesystem over Rpyc:
# /tmp/source present -> bootmanager/debug environment, /vservers present ->
# normally booted node.  NOTE(review): the return statements and the fallback
# branch are elided from this listing; the exact return values are not visible.
53 def get_boot_state(self):
55 if self.c.modules.os.path.exists('/tmp/source'):
57 elif self.c.modules.os.path.exists('/vservers'):
63 print self.c.modules.sys.path
# Body fragment of get_dmesg() (the def line is elided): dump the node's dmesg
# to a file on the node, download it into the timestamped history directory,
# copy it to a stable per-node filename, and open that copy for reading.
# NOTE(review): the trailing return (presumably `return log`) is elided.
71 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
72 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
73 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
74 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
75 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# Download the node's BootManager log (/tmp/bm.log) into the timestamped
# history directory, copy it to a stable per-node filename, and open that
# copy for reading.  NOTE(review): the trailing return is elided.
78 def get_bootmanager_log(self):
79 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
80 download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
81 os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
82 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
86 # def get_dmesg(self):
87 # self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
88 # download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
89 # log = open("log/dmesg.%s.log" % self.node, 'r')
92 # def get_bootmanager_log(self):
93 # download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
94 # #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
95 # os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
96 # log = open("log/bm.%s.log" % self.node, 'r')
# Run BootManager's InitializeBootManager + ReadNodeConfiguration remotely
# (via the Rpyc module proxy) and print the resulting node configuration
# variables; prints a diagnostic if no valid config file can be read.
# NOTE(review): original lines 104-109 reference bare `c` rather than
# `self.c` -- either a bug or `c` is bound on an elided line (e.g. 100/103);
# confirm against the full file.
99 def dump_plconf_file(self):
101 self.c.modules.sys.path.append("/tmp/source/")
102 self.c.modules.os.chdir('/tmp/source')
104 log = c.modules.BootManager.log('/tmp/new.log')
105 bm = c.modules.BootManager.BootManager(log,'boot')
107 BootManagerException = c.modules.Exceptions.BootManagerException
108 InitializeBootManager = c.modules.BootManager.InitializeBootManager
109 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
112 InitializeBootManager.Run(bm.VARS, bm.LOG)
113 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
# Exception handler lines are elided; this print is presumably inside it.
117 print " Possibly, unable to find valid configuration file"
120 for key in bm.VARS.keys():
121 print key, " == ", bm.VARS[key]
123 print " Unable to read Node Configuration"
# Compare the NODE_KEY read from the node's on-disk configuration with the
# key PLC has on record; when they differ, push the node's key up to PLC via
# api.UpdateNode.  Return value indicates whether the keys are (now) in sync
# -- the actual return statements are elided from this listing.
# NOTE(review): same bare-`c`-vs-`self.c` concern as dump_plconf_file().
126 def compare_and_repair_nodekeys(self):
128 self.c.modules.sys.path.append("/tmp/source/")
129 self.c.modules.os.chdir('/tmp/source')
131 log = c.modules.BootManager.log('/tmp/new.log')
132 bm = c.modules.BootManager.BootManager(log,'boot')
134 BootManagerException = c.modules.Exceptions.BootManagerException
135 InitializeBootManager = c.modules.BootManager.InitializeBootManager
136 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's cached view of this node, for the key comparison below.
139 plcnode = plccache.GetNodeByName(self.node)
141 InitializeBootManager.Run(bm.VARS, bm.LOG)
142 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
147 print " Possibly, unable to find valid configuration file"
150 print " NODE: %s" % bm.VARS['NODE_KEY']
151 print " PLC : %s" % plcnode['key']
153 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Keys differ: overwrite PLC's key with the one the node actually has.
156 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
157 print " Successfully updated NODE_KEY with PLC"
162 #for key in bm.VARS.keys():
163 # print key, " == ", bm.VARS[key]
165 print " Unable to retrieve NODE_KEY"
# Report whether BootManager is currently active on the node, detected via
# its /tmp/BM_RUNNING lockfile.  (The True/False return lines are elided.)
167 def bootmanager_running(self):
168 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
# Set the node's boot_state at PLC (default 'boot'); returns the API result.
173 def set_nodestate(self, state='boot'):
174 return api.UpdateNode(self.node, {'boot_state' : state})
# Set the target boot_state at PLC, then reboot the node.  First attempt
# within 24h (no recent 'gentlekill' persist-flag): kill all vserver slice
# processes and schedule a normal `shutdown -r +1`.  The alternate branch
# (flag already set; the `else:`/flag-save lines are elided) forces a hard
# reboot via the sysrq-trigger s/u/b sequence.
176 def restart_node(self, state='boot'):
177 api.UpdateNode(self.node, {'boot_state' : state})
# Persist-flag DB remembers (for 24h) that a gentle kill was already tried.
179 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
180 if not pflags.getRecentFlag('gentlekill'):
181 print " Killing all slice processes... : %s" % self.node
182 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
183 self.c.modules.os.system(cmd_slicekill)
184 cmd = """ shutdown -r +1 & """
185 print " Restarting %s : %s" % ( self.node, cmd)
186 self.c.modules.os.system(cmd)
188 pflags.setRecentFlag('gentlekill')
# Hard-reboot path: sync (s), remount read-only (u), reboot (b) via sysrq.
191 print " Restarting with sysrq 'sub' %s" % self.node
192 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
193 self.c.modules.os.system(cmd)
# Launch BootManager.py on the node with the given forced state ('boot',
# 'reinstall', ...).  Skips if /tmp/BM_RUNNING indicates a run in progress;
# otherwise runs BootManager in the background guarded by that lockfile.
# NOTE(review): the closing piece of the shell command string (original
# line 207, presumably ") &") is elided from this listing.
197 def restart_bootmanager(self, forceState):
199 self.c.modules.os.chdir('/tmp/source')
200 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
201 print " BootManager is already running: try again soon..."
203 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
204 cmd = "( touch /tmp/BM_RUNNING ; " + \
205 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
206 " rm -f /tmp/BM_RUNNING " + \
208 cmd = cmd % forceState
209 self.c.modules.os.system(cmd)
# Manages an ssh tunnel to a node plus a remote Rpyc forking server, giving
# NodeConnection transparent access to the node's Python modules.
214 class PlanetLabSession:
# Class-wide counter for local tunnel ports; randomized base reduces
# collisions between concurrent monitor processes.
215 globalport = 22000 + int(random.random()*1000)
# Store session options; the elided lines (219-220, 222+) presumably bind
# self.node and invoke setup_host().
217 def __init__(self, node, nosetup, verbose):
218 self.verbose = verbose
221 self.nosetup = nosetup
# Build a NodeConnection over the local tunnel endpoint (localhost:self.port).
# The return statement is elided from this listing.
225 def get_connection(self, config):
226 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
229 # print i, conn.c.modules.sys.path
230 # print conn.c.modules.os.path.exists('/tmp/source')
# Prepare the node for Rpyc access: allocate a local tunnel port, rsync the
# Rpyc package to the node (retrying once after refreshing the ssh host key;
# raises ExceptionDoubleSSHError on a second failure), kill/restart the
# remote forking Rpyc server, then open a synchronous ssh port-forward
# (local self.port -> node 18812) and verify it reported "READY".
# NOTE(review): many lines are elided here (args/timeout initialization,
# the nosetup early-return branch, t1/t2 timing, the READY check) -- the
# visible flow below is incomplete.
235 def setup_host(self):
236 self.port = PlanetLabSession.globalport
237 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
240 args['port'] = self.port
241 args['user'] = 'root'
242 args['hostname'] = self.node
243 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
247 print "Skipping setup"
250 # COPY Rpyc files to host
251 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
252 if self.verbose: print cmd
256 localos = moncommands.CMD()
258 ret = localos.system(cmd, timeout)
# First attempt failed (branch condition elided): refresh the host key
# directly and retry once before giving up.
261 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
262 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
263 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
264 ret = localos.system(cmd, timeout)
267 print "\tFAILED TWICE"
269 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
272 # KILL any already running servers.
273 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
274 (ov,ev) = ssh.run_noexcept2("""<<\EOF
276 echo "kill server" >> out.log
277 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
278 echo "export" >> out.log
279 export PYTHONPATH=$HOME ;
280 echo "start server" >> out.log
281 python Rpyc/Servers/forking_server.py &> server.log &
282 echo "done" >> out.log
284 #cmd = """ssh %(user)s@%(hostname)s """ + \
285 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
287 #if self.verbose: print cmd
289 #print localos.system(cmd,timeout)
291 ## START a new rpyc server.
292 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
293 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
295 #if self.verbose: print cmd
296 #print localos.system(cmd,timeout)
297 print "setup rpyc server over ssh"
301 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
302 # and the following options seems to work well.
303 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
304 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
305 """-o ConnectTimeout=120 """ + \
306 """-n -N -L %(port)s:localhost:18812 """ + \
307 """%(user)s@%(hostname)s"""
309 if self.verbose: print cmd
311 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
312 # TODO: the read() here may block indefinitely. Need a better
313 # approach therefore, that includes a timeout.
314 #ret = self.command.stdout.read(5)
315 ret = moncommands.read_t(self.command.stdout, 5)
319 # NOTE: There is still a slight race for machines that are slow...
320 self.timeout = 2*(t2-t1)
321 print "Sleeping for %s sec" % self.timeout
322 time.sleep(self.timeout)
# Tunnel process exited early, or never printed READY: fail loudly.
325 if self.command.returncode is not None:
326 print "Failed to establish tunnel!"
327 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
329 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Fragment of the session-teardown method (its def line, ~original 332, is
# elided): logs and presumably kills the forwarding ssh process for this
# session's tunnel port -- confirm against the full file.
333 if self.verbose: print "Killing SSH session %s" % self.port
334 print "Killing SSH session %s" % self.port
# Project column `index` out of a list of step tuples; with the default
# index=1 this yields the regex-pattern column used for pexpect matching.
338 def steps_to_list(steps, index=1):
339 return map(lambda x: x[index], steps)
# Map a pexpect match index back to the step identifier (column 0).  Indices
# beyond len(steps) correspond to the appended pexpect.EOF sentinel; that
# fallback branch (original lines 344+) is elided from this listing.
341 def index_to_id(steps,index):
342 if index < len(steps):
343 return steps[index][0]
# High-level debugging helper for a single node: owns the PlanetLabSession,
# and knows the bm.log/dmesg patterns and the sequence->action mapping.
347 class DebugInterface:
348 def __init__(self, hostname):
349 self.hostname = hostname
# Establish a NodeConnection to this host: refresh the known_hosts entry,
# create a PlanetLabSession (ExceptionDoubleSSHError is reported via email),
# then try get_connection() twice -- the retry waits 5x the session timeout
# because setup_host()'s wait is sometimes too short.  On failure the
# elided branches apparently return False (see the type(conn)==type(False)
# check in restore_basic); the success return is also elided.
352 def getConnection(self):
353 print "Creating session for %s" % self.hostname
354 # update known_hosts file (in case the node has rebooted since last run)
356 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
359 print traceback.print_exc()
# NOTE(review): two session constructions are visible; the surrounding
# (elided) conditional presumably picks one based on config availability.
364 self.session = PlanetLabSession(self.hostname, False, True)
366 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
367 except ExceptionDoubleSSHError, e:
368 msg = "ERROR setting up session for %s" % self.hostname
372 traceback.print_exc()
377 conn = self.session.get_connection(config)
379 # NOTE: sometimes the wait in setup_host() is not long enough.
380 # So, here we try to wait a little longer before giving up entirely.
382 time.sleep(self.session.timeout*5)
383 conn = self.session.get_connection(config)
385 # failed twice... no need to report this really, it's just in a
389 traceback.print_exc()
390 email_exception(self.hostname)
392 #print "trying to use conn before returning it."
393 #print conn.c.modules.sys.path
394 #print conn.c.modules.os.path.exists('/tmp/source')
397 #print "conn: %s" % conn
# Build the mapping from observed BootManager step sequences (dash-joined
# step-id strings, as produced in restore_basic) to the remedial action to
# take: restart BootManager, reinstall, repair node keys, or send one of
# several notification emails.  The `sequences` dict initialization and the
# return statement are elided from this listing.
400 def getSequences(self):
402 # TODO: This can be replaced with a DB definition at a future time.
403 # This would make it possible for an admin to introduce new
404 # patterns without touching code.
407 # restart_bootmanager_boot
408 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
409 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
410 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
412 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
414 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
415 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
416 "bminit-cfg-auth-getplc-update-debug-done",
417 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
418 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
419 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
420 "bminit-cfg-auth-protoerror-exception-update-debug-done",
421 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
422 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
424 sequences.update({n : "restart_bootmanager_boot"})
426 # conn.restart_bootmanager('reinstall')
427 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
428 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
429 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
430 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
431 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
432 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
433 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
434 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
435 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
436 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
437 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
438 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
439 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
440 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
441 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
442 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
443 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
444 # actual solution appears to involve removing the bad files, and
445 # continually trying to boot the node.
446 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
447 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
448 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
449 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
450 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
452 sequences.update({n : "restart_bootmanager_rins"})
# Node-key mismatch patterns: attempt a key repair instead of a reinstall.
455 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
456 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
458 sequences.update({n: "repair_node_keys"})
460 # conn.restart_node('reinstall')
461 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
462 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
463 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
464 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
465 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
466 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
467 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
468 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
469 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
470 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
471 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
472 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
474 sequences.update({n : "restart_node_rins"})
# Patterns recoverable by a plain node reboot into 'boot'.
477 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
478 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
479 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
480 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
481 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
482 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
483 "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
485 sequences.update({n: "restart_node_boot"})
487 # update_node_config_email
488 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
489 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
490 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
491 "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
492 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
494 sequences.update({n : "update_node_config_email"})
496 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
497 "bminit-cfg-update-exception-nodehostname-update-debug-done",
499 sequences.update({n : "nodenetwork_email"})
501 # update_bootcd_email
502 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
503 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
504 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
505 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
506 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
508 sequences.update({n : "update_bootcd_email"})
510 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
512 sequences.update({n: "suspect_error_email"})
514 # update_hardware_email
515 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
516 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
518 # broken_hardware_email
519 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
# bad_dns_email patterns (the opening of this `for` list is elided).
523 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
524 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
526 sequences.update( { n : "bad_dns_email"})
# Return the (id, regex) step patterns used to scan a node's dmesg output
# for disk/hardware failure signatures; the list's opening line and the
# return statement are elided from this listing.  The trailing comments are
# sample log lines each pattern is meant to match.
530 def getDiskSteps(self):
532 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
533 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
534 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
536 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
538 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
539 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
541 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
542 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
544 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
545 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
547 ('floppytimeout','floppy0: floppy timeout called'),
548 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
550 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
551 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
553 # floppy0: floppy timeout called
554 # end_request: I/O error, dev fd0, sector 0
556 # Buffer I/O error on device dm-2, logical block 8888896
557 # ata1: status=0x51 { DriveReady SeekComplete Error }
558 # ata1: error=0x40 { UncorrectableError }
559 # SCSI error : <0 0 0 0> return code = 0x8000002
560 # sda: Current: sense key: Medium Error
561 # Additional sense: Unrecovered read error - auto reallocate failed
563 # SCSI error : <0 2 0 0> return code = 0x40001
564 # end_request: I/O error, dev sda, sector 572489600
# Scan a dmesg stream (a pexpect spawn over the file) against the disk-error
# step patterns, collecting matched step ids until EOF.  The surrounding
# accumulator/loop/return lines are elided from this listing.
568 def getDiskSequence(self, steps, child):
571 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
# Return the ordered (id, pattern) pairs used to tokenize a BootManager log
# into the step-id sequences consumed by getSequences().  The list's opening
# line and the return statement are elided from this listing.
578 def getBootManagerStepPatterns(self):
580 ('bminit' , 'Initializing the BootManager.'),
581 ('cfg' , 'Reading node configuration file.'),
582 ('auth' , 'Authenticating node with PLC.'),
583 ('getplc' , 'Retrieving details of node from PLC.'),
584 ('update' , 'Updating node boot state at PLC.'),
585 ('hardware' , 'Checking if hardware requirements met.'),
586 ('installinit' , 'Install: Initializing.'),
587 ('installdisk' , 'Install: partitioning disks.'),
588 ('installbootfs', 'Install: bootstrapfs tarball.'),
589 ('installcfg' , 'Install: Writing configuration files.'),
590 ('installstop' , 'Install: Shutting down installer.'),
591 ('update2' , 'Updating node boot state at PLC.'),
592 ('installinit2' , 'Install: Initializing.'),
593 ('validate' , 'Validating node installation.'),
594 ('rebuildinitrd', 'Rebuilding initrd'),
595 ('netcfg' , 'Install: Writing Network Configuration files.'),
596 ('update3' , 'Updating node configuration.'),
597 ('disk' , 'Checking for unused disks to add to LVM.'),
598 ('update4' , 'Sending hardware configuration to PLC.'),
599 ('debug' , 'Starting debug mode'),
600 ('bmexceptmount', 'BootManagerException during mount'),
601 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
602 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
603 ('exception' , 'Exception'),
604 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
605 ('protoerror' , 'XML RPC protocol error'),
606 ('nodehostname' , 'Configured node hostname does not resolve'),
607 ('implementerror', 'Implementation Error'),
608 ('readonlyfs' , '[Errno 30] Read-only file system'),
609 ('baddisk' , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
610 ('noinstall' , 'notinstalled'),
611 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
612 ('noblockdev' , "No block devices detected."),
613 ('dnserror' , 'Name or service not known'),
614 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
615 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
616 ('hardwarerequirefail' , 'Hardware requirements not met'),
617 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
618 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
619 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
620 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
621 ('modulefail' , 'Unable to get list of system modules'),
622 ('writeerror' , 'write error: No space left on device'),
623 ('nospace' , "No space left on device"),
624 ('nonode' , 'Failed to authenticate call: No such node'),
625 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
626 ('bootcheckfail' , 'BootCheckAuthentication'),
627 ('bootupdatefail' , 'BootUpdateNode'),
# Tokenize a bm.log stream (pexpect spawn) into an ordered list of step ids
# by repeatedly expecting the step patterns until 'done'/EOF.  The loop,
# accumulator and return lines are elided from this listing.
631 def getBootManagerSequenceFromLog(self, steps, child):
635 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
636 id = index_to_id(steps,index)
639 if id == "exception":
640 print "...Found An Exception!!!"
641 elif id == "done": #index == len(steps_to_list(steps)):
# Public entry point: delegate to restore_basic(); the post-processing of
# `ret` (original lines 649+) is elided from this listing.
647 def restore(sitehist, hostname, config=None, forced_action=None):
648 ret = restore_basic(sitehist, hostname, config, forced_action)
# Core recovery routine for one node in debug state:
#   1. bail out (notify + disable) if the BootCD is too old to work with;
#   2. open a debug connection and verify the node is in 'debug' state and
#      BootManager is not already running;
#   3. scan dmesg for disk errors (notify owner / disable on bad disks);
#   4. scan bm.log into a step sequence and dispatch on getSequences()'s
#      mapping: restart BootManager / reinstall / reboot / repair keys /
#      send the appropriate notification email.
# Many elided lines (returns, else-branches, args={} initializers, the
# found_between/persist-flag logic) are missing from this listing.
652 def restore_basic(sitehist, hostname, config=None, forced_action=None):
654 # NOTE: Nothing works if the bootcd is REALLY old.
655 # So, this is the first step.
657 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
658 recent_actions = sitehist.getRecentActions(hostname=hostname)
660 if fbnode['observed_category'] == "OLDBOOTCD":
661 print "\t...Notify owner to update BootImage!!!"
663 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
664 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
666 print "\tDisabling %s due to out-of-date BootImage" % hostname
667 api.UpdateNode(hostname, {'boot_state' : 'disabled'})
669 # NOTE: nothing else is possible.
672 debugnode = DebugInterface(hostname)
673 conn = debugnode.getConnection()
# getConnection() returns False on failure (hence the type check).
674 if type(conn) == type(False): return False
676 boot_state = conn.get_boot_state()
677 if boot_state != "debug":
678 print "... %s in %s state: skipping..." % (hostname , boot_state)
679 return boot_state == "boot"
681 if conn.bootmanager_running():
682 print "...BootManager is currently running. Skipping host %s" %hostname
685 # Read persistent flags, tagged on one week intervals.
687 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
688 dmesg = conn.get_dmesg()
689 child = fdpexpect.fdspawn(dmesg)
691 steps = debugnode.getDiskSteps()
692 sequence = debugnode.getDiskSequence(steps, child)
695 if config and not config.quiet: print "\tSET: ", s
# Drive-error triage: a floppy-only error pair is tolerated, anything else
# skips the node (the branch structure around these prints is elided).
698 print "...Potential drive errors on %s" % hostname
699 if len(s) == 2 and 'floppyerror' in s:
700 print "...Should investigate. Continuing with node."
702 print "...Should investigate. Skipping node."
703 # TODO: send message related to these errors.
705 if not found_within(recent_actions, 'baddisk_notice', 7):
706 print "baddisk_notice not found recently"
708 log=conn.get_dmesg().read()
709 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
710 conn.set_nodestate('disabled')
714 print "...Downloading bm.log from %s" %hostname
715 log = conn.get_bootmanager_log()
716 child = fdpexpect.fdspawn(log)
# --collect mode: logs fetched above are all that's wanted.
718 if hasattr(config, 'collect') and config.collect: return True
720 if config and not config.quiet: print "...Scanning bm.log for errors"
724 steps = debugnode.getBootManagerStepPatterns()
725 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
727 s = "-".join(sequence)
728 print " FOUND SEQUENCE: ", s
730 # NOTE: We get or set the flag based on the current sequence identifier.
731 # By using the sequence identifier, we guarantee that there will be no
732 # frequent loops. I'm guessing there is a better way to track loops,
735 sequences = debugnode.getSequences()
# Unknown sequence: notify admins with the full bm.log, then fall back to
# restarting BootManager in 'boot' mode.
738 if s not in sequences:
739 print " HOST %s" % hostname
740 print " UNKNOWN SEQUENCE: %s" % s
743 args['hostname'] = hostname
745 args['bmlog'] = conn.get_bootmanager_log().read()
746 args['viart'] = False
747 args['saveact'] = True
748 args['ccemail'] = True
750 sitehist.sendMessage('unknownsequence_notice', **args)
752 conn.restart_bootmanager('boot')
754 # NOTE: Do not set the pflags value for this sequence if it's unknown.
755 # This way, we can check it again after we've fixed it.
# Known sequence: dispatch on the mapped action name.
760 if sequences[s] == "restart_bootmanager_boot":
761 print "...Restarting BootManager.py on %s "%hostname
762 conn.restart_bootmanager('boot')
763 elif sequences[s] == "restart_bootmanager_rins":
764 print "...Restarting BootManager.py on %s "%hostname
765 conn.restart_bootmanager('reinstall')
766 elif sequences[s] == "restart_node_rins":
767 conn.restart_node('reinstall')
768 elif sequences[s] == "restart_node_boot":
769 conn.restart_node('boot')
770 elif sequences[s] == "repair_node_keys":
771 if conn.compare_and_repair_nodekeys():
772 # the keys either are in sync or were forced in sync.
773 # so try to reboot the node again.
774 # TODO: why was this originally 'reinstall' instead of 'boot'??
775 conn.restart_bootmanager('boot')
778 # there was some failure to synchronize the keys.
779 print "...Unable to repair node keys on %s" %hostname
781 elif sequences[s] == "suspect_error_email":
783 args['hostname'] = hostname
785 args['bmlog'] = conn.get_bootmanager_log().read()
786 args['viart'] = False
787 args['saveact'] = True
788 args['ccemail'] = True
790 sitehist.sendMessage('unknownsequence_notice', **args)
791 conn.restart_bootmanager('boot')
793 # TODO: differentiate this and the 'nodenetwork_email' actions.
794 elif sequences[s] == "update_node_config_email":
796 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
798 args['hostname'] = hostname
799 sitehist.sendMessage('nodeconfig_notice', **args)
800 conn.dump_plconf_file()
802 elif sequences[s] == "nodenetwork_email":
804 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
806 args['hostname'] = hostname
807 args['bmlog'] = conn.get_bootmanager_log().read()
808 sitehist.sendMessage('nodeconfig_notice', **args)
809 conn.dump_plconf_file()
811 elif sequences[s] == "update_bootcd_email":
813 if not found_within(recent_actions, 'newalphacd_notice', 3.5):
815 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
816 args['hostname'] = hostname
818 sitehist.sendMessage('newalphacd_notice', **args)
820 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
822 elif sequences[s] == "broken_hardware_email":
823 # MAKE An ACTION record that this host has failed hardware. May
824 # require either an exception "/minhw" or other manual intervention.
825 # Definitely need to send out some more EMAIL.
826 # TODO: email notice of broken hardware
827 if not found_within(recent_actions, 'baddisk_notice', 7):
828 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
830 args['hostname'] = hostname
831 args['log'] = conn.get_dmesg().read()
833 sitehist.sendMessage('baddisk_notice', **args)
834 conn.set_nodestate('disabled')
836 elif sequences[s] == "update_hardware_email":
837 if not found_within(recent_actions, 'minimalhardware_notice', 7):
838 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
840 args['hostname'] = hostname
841 args['bmlog'] = conn.get_bootmanager_log().read()
842 sitehist.sendMessage('minimalhardware_notice', **args)
844 elif sequences[s] == "bad_dns_email":
845 if not found_within(recent_actions, 'baddns_notice', 1):
846 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
849 node = plccache.GetNodeByName(hostname)
850 net = api.GetInterfaces(node['interface_ids'])[0]
853 print traceback.print_exc()
854 # TODO: api error. skip email, b/c all info is not available,
855 # flag_set will not be recorded.
857 nodenet_str = network_config_to_str(net)
859 args['hostname'] = hostname
860 args['network_config'] = nodenet_str
861 args['interface_id'] = net['interface_id']
863 sitehist.sendMessage('baddns_notice', **args)
868 # MAIN -------------------------------------------------------------------
# Command-line driver: build the option parser, resolve the node list, and
# run restore() for each node under its site's SiteInterface.  The loop
# header, try/except wrappers and the __main__ body are elided.
871 from monitor import parser as parsermodule
872 parser = parsermodule.getParser()
874 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
875 force=None, quiet=False)
876 parser.add_option("", "--child", dest="child", action="store_true",
877 help="This is the child mode of this process.")
878 parser.add_option("", "--force", dest="force", metavar="boot_state",
879 help="Force a boot state passed to BootManager.py.")
880 parser.add_option("", "--quiet", dest="quiet", action="store_true",
881 help="Extra quiet output messages.")
882 parser.add_option("", "--verbose", dest="verbose", action="store_true",
883 help="Extra debug output messages.")
884 parser.add_option("", "--nonet", dest="nonet", action="store_true",
885 help="Do not setup the network, use existing log files to re-run a test pass.")
886 parser.add_option("", "--collect", dest="collect", action="store_true",
887 help="No action, just collect dmesg, and bm.log")
888 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
889 help="Do not perform the orginary setup phase.")
891 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
892 config = parsermodule.parse_args(parser)
895 nodes = config.getListFromFile(config.nodelist)
897 nodes = [ config.node ]
# Per-node loop body (loop header elided): look up the site login base and
# restore through the site's interface record.
904 lb = plccache.plcdb_hn2lb[node]
905 sitehist = SiteInterface.get_or_make(loginbase=lb)
906 #reboot(node, config)
# NOTE(review): config=None is passed here even though a parsed `config`
# exists above -- quiet/collect/force options are therefore ignored by
# restore(); confirm whether this is intentional.
907 restore(sitehist, node, config=None, forced_action=None)
909 if __name__ == "__main__":