3 # Attempt to reboot a node in debug state.
16 from monitor.getsshkeys import SSHKnownHosts
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
41 api = plc.getAuthAPI()
class ExceptionDoubleSSHError(Exception):
    """Raised when ssh/rsync login fails twice, even after the node's
    ssh host key was refreshed in known_hosts."""
48 def __init__(self, connection, node, config):
53 def get_boot_state(self):
55 if self.c.modules.os.path.exists('/tmp/source'):
57 elif self.c.modules.os.path.exists('/vservers'):
63 print self.c.modules.sys.path
71 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
72 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
73 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
74 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
75 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
78 def get_bootmanager_log(self):
79 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
80 download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
81 os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
82 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
86 # def get_dmesg(self):
87 # self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
88 # download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
89 # log = open("log/dmesg.%s.log" % self.node, 'r')
92 # def get_bootmanager_log(self):
93 # download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
94 # #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
95 # os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
96 # log = open("log/bm.%s.log" % self.node, 'r')
99 def dump_plconf_file(self):
101 self.c.modules.sys.path.append("/tmp/source/")
102 self.c.modules.os.chdir('/tmp/source')
104 log = c.modules.BootManager.log('/tmp/new.log')
105 bm = c.modules.BootManager.BootManager(log,'boot')
107 BootManagerException = c.modules.Exceptions.BootManagerException
108 InitializeBootManager = c.modules.BootManager.InitializeBootManager
109 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
112 InitializeBootManager.Run(bm.VARS, bm.LOG)
113 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
117 print " Possibly, unable to find valid configuration file"
120 for key in bm.VARS.keys():
121 print key, " == ", bm.VARS[key]
123 print " Unable to read Node Configuration"
125 def fsck_repair_node(self):
127 self.c.modules.sys.path.append("/tmp/source/")
128 self.c.modules.os.chdir('/tmp/source')
130 # TODO: set boot state to node's actually boot state.
131 # could be 'boot' or 'safeboot'
132 self.c.modules.os.chdir('/tmp/source')
133 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
134 print "Running MANUAL FSCK already... try again soon."
136 print "Running MANUAL fsck on %s" % self.node
137 cmd = "( touch /tmp/BM_RUNNING ; " + \
138 " fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
139 " fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
140 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
141 " rm -f /tmp/BM_RUNNING " + \
143 cmd = cmd % self.get_nodestate()
144 self.c.modules.os.system(cmd)
145 #self.restart_bootmanager('boot')
148 def compare_and_repair_nodekeys(self):
150 self.c.modules.sys.path.append("/tmp/source/")
151 self.c.modules.os.chdir('/tmp/source')
153 log = c.modules.BootManager.log('/tmp/new.log')
154 bm = c.modules.BootManager.BootManager(log,'boot')
156 BootManagerException = c.modules.Exceptions.BootManagerException
157 InitializeBootManager = c.modules.BootManager.InitializeBootManager
158 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
161 plcnode = plccache.GetNodeByName(self.node)
163 InitializeBootManager.Run(bm.VARS, bm.LOG)
164 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
169 print " Possibly, unable to find valid configuration file"
172 print " NODE: %s" % bm.VARS['NODE_KEY']
173 print " PLC : %s" % plcnode['key']
175 if bm.VARS['NODE_KEY'] == plcnode['key']:
178 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
179 print " Successfully updated NODE_KEY with PLC"
184 #for key in bm.VARS.keys():
185 # print key, " == ", bm.VARS[key]
187 print " Unable to retrieve NODE_KEY"
def bootmanager_running(self):
    # Report whether a BootManager run is already in progress on the node:
    # BootManager drops a /tmp/BM_RUNNING flag file while it runs.
    if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
    # NOTE(review): the return statements for both branches are not visible
    # in this view — presumably `return True` here and `return False` after.
def set_nodestate(self, state='boot'):
    """Set this node's boot_state at PLC; returns the UpdateNode result."""
    update = {'boot_state': state}
    return api.UpdateNode(self.node, update)
198 def get_nodestate(self):
200 return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
202 traceback.print_exc()
203 # NOTE: use last cached value from plc
204 fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
205 return fbnode['plc_node_stats']['boot_state']
208 def restart_node(self, state='boot'):
209 api.UpdateNode(self.node, {'boot_state' : state})
211 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
212 if not pflags.getRecentFlag('gentlekill'):
213 print " Killing all slice processes... : %s" % self.node
214 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
215 self.c.modules.os.system(cmd_slicekill)
216 cmd = """ shutdown -r +1 & """
217 print " Restarting %s : %s" % ( self.node, cmd)
218 self.c.modules.os.system(cmd)
220 pflags.setRecentFlag('gentlekill')
223 print " Restarting with sysrq 'sub' %s" % self.node
224 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
225 self.c.modules.os.system(cmd)
229 def restart_bootmanager(self, forceState):
231 self.c.modules.os.chdir('/tmp/source')
232 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
233 print " BootManager is already running: try again soon..."
235 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
236 cmd = "( touch /tmp/BM_RUNNING ; " + \
237 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
238 " rm -f /tmp/BM_RUNNING " + \
240 cmd = cmd % forceState
241 self.c.modules.os.system(cmd)
246 class PlanetLabSession:
247 globalport = 22000 + int(random.random()*1000)
249 def __init__(self, node, nosetup, verbose):
250 self.verbose = verbose
253 self.nosetup = nosetup
257 def get_connection(self, config):
258 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
261 # print i, conn.c.modules.sys.path
262 # print conn.c.modules.os.path.exists('/tmp/source')
267 def setup_host(self):
268 self.port = PlanetLabSession.globalport
269 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
272 args['port'] = self.port
273 args['user'] = 'root'
274 args['hostname'] = self.node
275 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
279 print "Skipping setup"
282 # COPY Rpyc files to host
283 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
284 cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
285 if self.verbose: print cmd
289 localos = moncommands.CMD()
291 ret = localos.system(cmd, timeout)
294 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
295 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
296 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
297 print "trying: ", cmd
298 print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
299 ret = localos.system(cmd, timeout)
302 print "\tFAILED TWICE"
303 #email_exception("%s rsync failed twice" % self.node)
304 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
307 # KILL any already running servers.
308 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
309 (ov,ev) = ssh.run_noexcept2("""<<\EOF
311 echo "kill server" >> out.log
312 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
313 echo "export" >> out.log
314 export PYTHONPATH=$HOME ;
315 echo "start server" >> out.log
316 python Rpyc/Servers/forking_server.py &> server.log &
317 echo "done" >> out.log
319 print "setup rpyc server over ssh"
323 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
324 # and the following options seems to work well.
325 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
326 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
327 """-o ConnectTimeout=120 """ + \
328 """-n -N -L %(port)s:localhost:18812 """ + \
329 """%(user)s@%(hostname)s"""
331 if self.verbose: print cmd
333 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
334 # TODO: the read() here may block indefinitely. Need a better
335 # approach therefore, that includes a timeout.
336 #ret = self.command.stdout.read(5)
337 ret = moncommands.read_t(self.command.stdout, 5)
341 # NOTE: There is still a slight race for machines that are slow...
342 self.timeout = 2*(t2-t1)
343 print "Sleeping for %s sec" % self.timeout
344 time.sleep(self.timeout)
347 if self.command.returncode is not None:
348 print "Failed to establish tunnel!"
349 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
351 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
355 if self.verbose: print "Killing SSH session %s" % self.port
356 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Project one column out of a list of step tuples.

    steps is a list of (id, pattern) pairs; by default (index=1) this
    returns the pattern strings, suitable for passing to pexpect's
    expect().  A list comprehension is used instead of map()+lambda so
    the result is a real list (len()/indexing-safe) under both Python 2
    and Python 3 — map() returns a lazy iterator on Python 3.
    """
    return [step[index] for step in steps]
def index_to_id(steps,index):
    # Translate a pexpect match index back to the matching step's identifier.
    # steps is a list of (id, pattern) tuples; index comes from child.expect(),
    # which appends pexpect.EOF after the patterns — hence the bounds check.
    if index < len(steps):
        return steps[index][0]
    # NOTE(review): the fall-through branch (index == len(steps), i.e. EOF)
    # is not visible in this view — presumably it returns "done"; confirm.
369 class DebugInterface:
370 def __init__(self, hostname):
371 self.hostname = hostname
374 def getConnection(self):
375 print "Creating session for %s" % self.hostname
376 # update known_hosts file (in case the node has rebooted since last run)
378 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
381 print traceback.print_exc()
384 msg = "ERROR setting up session for %s" % self.hostname
387 self.session = PlanetLabSession(self.hostname, False, True)
389 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
390 except ExceptionDoubleSSHError, e:
394 traceback.print_exc()
399 conn = self.session.get_connection(config)
401 # NOTE: sometimes the wait in setup_host() is not long enough.
402 # So, here we try to wait a little longer before giving up entirely.
404 time.sleep(self.session.timeout*5)
405 conn = self.session.get_connection(config)
407 # failed twice... no need to report this really, it's just in a
411 traceback.print_exc()
412 email_exception(self.hostname)
414 #print "trying to use conn before returning it."
415 #print conn.c.modules.sys.path
416 #print conn.c.modules.os.path.exists('/tmp/source')
419 #print "conn: %s" % conn
422 def getSequences(self):
424 # TODO: This can be replaced with a DB definition at a future time.
425 # This would make it possible for an admin to introduce new
426 # patterns without touching code.
429 # restart_bootmanager_boot
430 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
431 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
432 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
434 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
436 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
437 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
438 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
439 "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
440 "bminit-cfg-auth-getplc-update-debug-done",
441 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
442 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
443 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
444 "bminit-cfg-auth-protoerror-exception-update-debug-done",
445 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
446 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
448 sequences.update({n : "restart_bootmanager_boot"})
450 # conn.restart_bootmanager('reinstall')
451 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
452 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
453 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
454 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
455 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
456 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
457 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
458 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
459 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
460 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
461 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
462 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
463 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
464 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
465 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
466 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
467 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
468 # actual solution appears to involve removing the bad files, and
469 # continually trying to boot the node.
470 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
471 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
472 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
473 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
474 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
475 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
476 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
478 sequences.update({n : "restart_bootmanager_rins"})
481 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
482 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
483 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
484 "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
486 sequences.update({n: "repair_node_keys"})
488 # conn.restart_node('reinstall')
489 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
490 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
491 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
492 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
493 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
494 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
495 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
496 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
497 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
498 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
499 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
500 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
502 sequences.update({n : "restart_node_rins"})
505 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
506 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
507 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
508 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
509 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
510 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
511 "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
513 sequences.update({n: "restart_node_boot"})
516 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
517 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
518 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
519 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
520 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
521 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
522 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
523 "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
524 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
525 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
526 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
527 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
528 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
529 "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
531 sequences.update({n : "fsck_repair"})
534 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
535 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
536 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
537 "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
538 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
539 "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
540 "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
542 sequences.update({n : "nodeconfig_notice"})
544 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
545 "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
546 "bminit-cfg-update-exception-nodehostname-update-debug-done",
547 "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
549 sequences.update({n : "nodenetwork_email"})
551 # noblockdevice_notice
552 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
553 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
554 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
555 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
556 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
558 sequences.update({n : "noblockdevice_notice"})
560 # update_bootcd_email
561 for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
563 sequences.update({n : "update_bootcd_email"})
565 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
567 sequences.update({n: "unknownsequence_notice"})
569 # minimalhardware_notice
570 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
571 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
574 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
578 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
579 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
581 sequences.update( { n : "baddns_notice"})
585 def getDiskSteps(self):
587 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
588 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
589 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
591 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
593 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
594 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
596 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
597 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
599 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
600 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
602 ('floppytimeout','floppy0: floppy timeout called'),
603 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
605 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
606 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
608 # floppy0: floppy timeout called
609 # end_request: I/O error, dev fd0, sector 0
611 # Buffer I/O error on device dm-2, logical block 8888896
612 # ata1: status=0x51 { DriveReady SeekComplete Error }
613 # ata1: error=0x40 { UncorrectableError }
614 # SCSI error : <0 0 0 0> return code = 0x8000002
615 # sda: Current: sense key: Medium Error
616 # Additional sense: Unrecovered read error - auto reallocate failed
618 # SCSI error : <0 2 0 0> return code = 0x40001
619 # end_request: I/O error, dev sda, sector 572489600
623 def getDiskSequence(self, steps, child):
626 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
633 def getBootManagerStepPatterns(self):
635 ('bminit' , 'Initializing the BootManager.'),
636 ('cfg' , 'Reading node configuration file.'),
637 ('auth' , 'Authenticating node with PLC.'),
638 ('getplc' , 'Retrieving details of node from PLC.'),
639 ('update' , 'Updating node boot state at PLC.'),
640 ('hardware' , 'Checking if hardware requirements met.'),
641 ('installinit' , 'Install: Initializing.'),
642 ('installdisk' , 'Install: partitioning disks.'),
643 ('installbootfs', 'Install: bootstrapfs tarball.'),
644 ('installcfg' , 'Install: Writing configuration files.'),
645 ('installstop' , 'Install: Shutting down installer.'),
646 ('update2' , 'Updating node boot state at PLC.'),
647 ('installinit2' , 'Install: Initializing.'),
648 ('validate' , 'Validating node installation.'),
649 ('rebuildinitrd', 'Rebuilding initrd'),
650 ('netcfg' , 'Install: Writing Network Configuration files.'),
651 ('update3' , 'Updating node configuration.'),
652 ('disk' , 'Checking for unused disks to add to LVM.'),
653 ('update4' , 'Sending hardware configuration to PLC.'),
654 ('debug' , 'Starting debug mode'),
655 ('bmexceptmount', 'BootManagerException during mount'),
656 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
657 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
658 ('exception' , 'Exception'),
659 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
660 ('protoerror' , 'XML RPC protocol error'),
661 ('nodehostname' , 'Configured node hostname does not resolve'),
662 ('implementerror', 'Implementation Error'),
663 ('fsckabort' , 'is mounted. e2fsck: Cannot continue, aborting'),
664 ('fsckfail' , 'Running e2fsck -v -p /dev/planetlab/root failed'),
665 ('fsckfail2' , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
666 ('readonlyfs' , '\[Errno 30\] Read-only file system'),
667 ('baddisk' , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
668 ('noinstall' , 'notinstalled'),
669 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
670 ('noblockdev' , "No block devices detected."),
671 ('dnserror' , 'Name or service not known'),
672 ('noconfig' , "Unable to find and read a node configuration file"),
673 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
674 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
675 ('hardwarerequirefail' , 'Hardware requirements not met'),
676 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
677 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
678 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
679 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
680 ('modulefail' , 'Unable to get list of system modules'),
681 ('writeerror' , 'write error: No space left on device'),
682 ('nospace' , "No space left on device"),
683 ('nonode' , 'Failed to authenticate call: No such node'),
684 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
685 ('bootcheckfail' , 'BootCheckAuthentication'),
686 ('bootupdatefail' , 'BootUpdateNode'),
690 def getBootManagerSequenceFromLog(self, steps, child):
694 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
695 id = index_to_id(steps,index)
698 if id == "exception":
699 print "...Found An Exception!!!"
700 elif id == "done": #index == len(steps_to_list(steps)):
def restore(sitehist, hostname, config=None, forced_action=None):
    # Thin wrapper around restore_basic(), which performs the actual
    # debug-state recovery work for `hostname` at site `sitehist`.
    ret = restore_basic(sitehist, hostname, config, forced_action)
    # NOTE(review): the remainder of this wrapper (presumably session
    # cleanup and `return ret`) is not visible in this view — confirm.
711 def restore_basic(sitehist, hostname, config=None, forced_action=None):
713 # NOTE: Nothing works if the bootcd is REALLY old.
714 # So, this is the first step.
716 bootman_action = "unknown"
718 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
719 recent_actions = sitehist.getRecentActions(hostname=hostname)
721 if fbnode['observed_category'] == "OLDBOOTCD":
722 print "\t...Notify owner to update BootImage!!!"
724 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
725 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
727 print "\tDisabling %s due to out-of-date BootImage" % hostname
728 api.UpdateNode(hostname, {'boot_state' : 'disabled'})
730 # NOTE: nothing else is possible.
733 debugnode = DebugInterface(hostname)
734 conn = debugnode.getConnection()
735 if type(conn) == type(False): return "error"
737 boot_state = conn.get_boot_state()
738 if boot_state != "debug":
739 print "... %s in %s state: skipping..." % (hostname , boot_state)
740 return "skipped" #boot_state == "boot"
742 if conn.bootmanager_running():
743 print "...BootManager is currently running. Skipping host %s" %hostname
744 return "skipped" # True
746 # Read persistent flags, tagged on one week intervals.
748 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
749 dmesg = conn.get_dmesg()
750 child = fdpexpect.fdspawn(dmesg)
752 steps = debugnode.getDiskSteps()
753 sequence = debugnode.getDiskSequence(steps, child)
756 if config and not config.quiet: print "\tSET: ", s
759 print "...Potential drive errors on %s" % hostname
760 if len(s) == 2 and 'floppyerror' in s:
761 print "...Should investigate. Continuing with node."
763 print "...Should investigate. Skipping node."
764 # TODO: send message related to these errors.
766 if not found_within(recent_actions, 'baddisk_notice', 7):
767 print "baddisk_notice not found recently"
769 log=conn.get_dmesg().read()
770 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
771 #conn.set_nodestate('disabled')
773 return "skipping_baddisk"
775 print "...Downloading bm.log from %s" %hostname
776 log = conn.get_bootmanager_log()
777 child = fdpexpect.fdspawn(log)
779 if hasattr(config, 'collect') and config.collect: return "collect"
781 if config and not config.quiet: print "...Scanning bm.log for errors"
785 steps = debugnode.getBootManagerStepPatterns()
786 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
788 s = "-".join(sequence)
789 print " FOUND SEQUENCE: ", s
791 # NOTE: We get or set the flag based on the current sequence identifier.
792 # By using the sequence identifier, we guarantee that there will be no
793 # frequent loops. I'm guessing there is a better way to track loops,
796 sequences = debugnode.getSequences()
799 if s not in sequences:
800 print " HOST %s" % hostname
801 print " UNKNOWN SEQUENCE: %s" % s
804 args['hostname'] = hostname
806 args['bmlog'] = conn.get_bootmanager_log().read()
807 args['viart'] = False
808 args['saveact'] = True
809 args['ccemail'] = True
811 sitehist.sendMessage('unknownsequence_notice', **args)
813 conn.restart_bootmanager('boot')
815 bootman_action = "restart_bootmanager"
817 # NOTE: Do not set the pflags value for this sequence if it's unknown.
818 # This way, we can check it again after we've fixed it.
822 bootman_action = sequences[s]
824 if sequences[s] == "restart_bootmanager_boot":
825 print "...Restarting BootManager.py on %s "%hostname
826 conn.restart_bootmanager('boot')
827 elif sequences[s] == "restart_bootmanager_rins":
828 print "...Restarting BootManager.py on %s "%hostname
829 conn.restart_bootmanager('reinstall')
830 elif sequences[s] == "restart_node_rins":
831 conn.restart_node('reinstall')
832 elif sequences[s] == "restart_node_boot":
833 conn.restart_node('boot')
834 elif sequences[s] == "fsck_repair":
835 conn.fsck_repair_node()
836 elif sequences[s] == "repair_node_keys":
837 if conn.compare_and_repair_nodekeys():
838 # the keys either are in sync or were forced in sync.
839 # so try to start BM again.
840 conn.restart_bootmanager(conn.get_nodestate())
843 # there was some failure to synchronize the keys.
844 print "...Unable to repair node keys on %s" %hostname
846 elif sequences[s] == "unknownsequence_notice":
848 args['hostname'] = hostname
850 args['bmlog'] = conn.get_bootmanager_log().read()
851 args['viart'] = False
852 args['saveact'] = True
853 args['ccemail'] = True
855 sitehist.sendMessage('unknownsequence_notice', **args)
856 conn.restart_bootmanager('boot')
858 elif sequences[s] == "nodeconfig_notice":
860 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
862 args['hostname'] = hostname
863 sitehist.sendMessage('nodeconfig_notice', **args)
864 conn.dump_plconf_file()
866 elif sequences[s] == "nodenetwork_email":
868 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
870 args['hostname'] = hostname
871 args['bmlog'] = conn.get_bootmanager_log().read()
872 sitehist.sendMessage('nodeconfig_notice', **args)
873 conn.dump_plconf_file()
875 elif sequences[s] == "noblockdevice_notice":
877 if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
879 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
880 args['hostname'] = hostname
882 sitehist.sendMessage('noblockdevice_notice', **args)
884 elif sequences[s] == "baddisk_notice":
885 # MAKE An ACTION record that this host has failed hardware. May
886 # require either an exception "/minhw" or other manual intervention.
887 # Definitely need to send out some more EMAIL.
888 # TODO: email notice of broken hardware
889 if not found_within(recent_actions, 'baddisk_notice', 7):
890 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
892 args['hostname'] = hostname
893 args['log'] = conn.get_dmesg().read()
895 sitehist.sendMessage('baddisk_notice', **args)
896 #conn.set_nodestate('disabled')
898 elif sequences[s] == "minimalhardware_notice":
899 if not found_within(recent_actions, 'minimalhardware_notice', 7):
900 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
902 args['hostname'] = hostname
903 args['bmlog'] = conn.get_bootmanager_log().read()
904 sitehist.sendMessage('minimalhardware_notice', **args)
906 elif sequences[s] == "baddns_notice":
907 if not found_within(recent_actions, 'baddns_notice', 1):
908 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
911 node = plccache.GetNodeByName(hostname)
912 net = api.GetInterfaces(node['interface_ids'])[0]
915 print traceback.print_exc()
916 # TODO: api error. skip email, b/c all info is not available,
917 # flag_set will not be recorded.
919 nodenet_str = network_config_to_str(net)
921 args['hostname'] = hostname
922 args['network_config'] = nodenet_str
923 args['interface_id'] = net['interface_id']
925 sitehist.sendMessage('baddns_notice', **args)
927 return bootman_action
930 # MAIN -------------------------------------------------------------------
933 from monitor import parser as parsermodule
934 parser = parsermodule.getParser()
936 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
937 force=None, quiet=False)
938 parser.add_option("", "--child", dest="child", action="store_true",
939 help="This is the child mode of this process.")
940 parser.add_option("", "--force", dest="force", metavar="boot_state",
941 help="Force a boot state passed to BootManager.py.")
942 parser.add_option("", "--quiet", dest="quiet", action="store_true",
943 help="Extra quiet output messages.")
944 parser.add_option("", "--verbose", dest="verbose", action="store_true",
945 help="Extra debug output messages.")
946 parser.add_option("", "--nonet", dest="nonet", action="store_true",
947 help="Do not setup the network, use existing log files to re-run a test pass.")
948 parser.add_option("", "--collect", dest="collect", action="store_true",
949 help="No action, just collect dmesg, and bm.log")
950 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
951 help="Do not perform the orginary setup phase.")
953 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
954 config = parsermodule.parse_args(parser)
957 nodes = config.getListFromFile(config.nodelist)
959 nodes = [ config.node ]
966 lb = plccache.plcdb_hn2lb[node]
967 sitehist = SiteInterface.get_or_make(loginbase=lb)
968 #reboot(node, config)
969 restore(sitehist, node, config=None, forced_action=None)
971 if __name__ == "__main__":