3 # Attempt to reboot a node in debug state.
14 from monitor.getsshkeys import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
36 api = plc.getAuthAPI()
def bootmanager_log_name(hostname):
    """Build the relative archive path for a node's BootManager log.

    Returns 'history/<YYYY-MM-DD-HH:MM>-bm.<hostname>.log'.  The
    timestamp has minute granularity, so two collections of the same
    node's log within one minute land on the same name.
    NOTE(review): reconstructed from a garbled numbered listing with
    lost indentation; tokens/behavior unchanged.
    """
    t_stamp = time.strftime("%Y-%m-%d-%H:%M")
    base_filename = "%s-bm.%s.log" % (t_stamp, hostname)
    short_target_filename = os.path.join('history', base_filename)
    return short_target_filename
# Record an ActionRecord marking that a log of *logtype* was collected
# for *hostname* and archived at *short_log_path*.
# NOTE(review): this listing is elided (embedded line numbers jump), so
# the try/except scaffolding and the remaining ActionRecord keyword
# fields are not visible here.
45 def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
# Resolve the node's site login_base from the cached monitoring records.
47 node = FindbadNodeRecord.get_latest_by(hostname=hostname)
48 loginbase = PlcSite.query.get(node.plc_node_stats['site_id']).plc_site_stats['login_base']
# presumably inside an except: handler on an elided line -- TODO confirm.
# The ActionRecord call below continues on elided lines.
52 err = traceback.format_exc()
54 act = ActionRecord(loginbase=loginbase,
58 log_path=short_log_path,
class ExceptionDoubleSSHError(Exception):
    """Raised when ssh/rsync to a node fails twice in a row, even after
    the node's known_hosts entry has been refreshed.
    NOTE(review): reconstructed from a garbled numbered listing;
    behavior unchanged.
    """
    pass
# NodeConnection constructor (the enclosing class header is outside this
# view).  The body -- presumably storing connection/node/config on self
# -- is on elided lines.
66 def __init__(self, connection, node, config):
# Classify the node's current boot state by probing its filesystem over
# the Rpyc connection: /tmp/source exists while the BootManager/boot CD
# is active; /vservers exists on an installed node.  The actual return
# values live on elided lines -- can't tell the exact strings from here.
71 def get_boot_state(self):
73 if self.c.modules.os.path.exists('/tmp/source'):
75 elif self.c.modules.os.path.exists('/vservers'):
# Diagnostic fallback: dump the remote interpreter's sys.path.
81 print self.c.modules.sys.path
# (method header elided -- presumably get_dmesg)  Snapshot the node's
# kernel ring buffer remotely, download it into the local log archive
# (a timestamped copy under history/ plus a "current" copy), and reopen
# the local copy for reading.
89 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
90 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
91 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
92 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
93 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# (return of the open file object is on an elided line)
# Download /tmp/bm.log (the BootManager log) from the node into the
# archive path named by bootmanager_log_name(), record the collection as
# an ActionRecord, keep a "current" bm.<node>.log copy, and reopen it.
96 def get_bootmanager_log(self):
97 bm_name = bootmanager_log_name(self.node)
98 download(self.c, "/tmp/bm.log", "%s/%s" % (config.MONITOR_BOOTMANAGER_LOG, bm_name))
99 #email_exception(self.node, "collected BM log for %s" % self.node)
100 bootmanager_log_action(self.node, bm_name, "collected_bm.log")
101 os.system("cp %s/%s %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, bm_name, config.MONITOR_BOOTMANAGER_LOG, self.node))
102 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# (return of the open file object is on an elided line)
# Drive the node's own BootManager modules over Rpyc to read and dump
# the node configuration (bm.VARS).  NOTE(review): the bare `c` below
# suggests an elided `c = self.c` alias line -- TODO confirm.
105 def dump_plconf_file(self):
107 self.c.modules.sys.path.append("/tmp/source/")
108 self.c.modules.os.chdir('/tmp/source')
110 log = c.modules.BootManager.log('/tmp/new.log')
111 bm = c.modules.BootManager.BootManager(log,'boot')
113 BootManagerException = c.modules.Exceptions.BootManagerException
114 InitializeBootManager = c.modules.BootManager.InitializeBootManager
115 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
118 InitializeBootManager.Run(bm.VARS, bm.LOG)
# try/except around ReadNodeConfiguration; handler bodies partly elided.
119 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
123 print " Possibly, unable to find valid configuration file"
# On success, print every VAR the BootManager resolved.
126 for key in bm.VARS.keys():
127 print key, " == ", bm.VARS[key]
129 print " Unable to read Node Configuration"
# Manually fsck the node's root and vservers volumes, then re-run
# BootManager in the node's current boot state.  Guarded by the
# /tmp/BM_RUNNING lock file so it never races a live BootManager.
131 def fsck_repair_node(self):
133 self.c.modules.sys.path.append("/tmp/source/")
134 self.c.modules.os.chdir('/tmp/source')
136 # TODO: set boot state to node's actually boot state.
137 # could be 'boot' or 'safeboot'
138 self.c.modules.os.chdir('/tmp/source')
139 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
140 print "Running MANUAL FSCK already... try again soon."
142 print "Running MANUAL fsck on %s" % self.node
# Shell pipeline: take the lock, fsck both LVs, re-run BootManager,
# drop the lock.  (The closing segment of the string is elided.)
143 cmd = "( touch /tmp/BM_RUNNING ; " + \
144 " fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
145 " fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
146 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
147 " rm -f /tmp/BM_RUNNING " + \
149 cmd = cmd % self.get_nodestate()
150 self.c.modules.os.system(cmd)
151 #self.restart_bootmanager('boot')
# Compare the node's on-disk NODE_KEY against PLC's record and, when
# they differ, push the node's key to PLC via UpdateNode.  (try/except
# and branch scaffolding are partly elided from this listing; the bare
# `c` suggests an elided `c = self.c` alias -- TODO confirm.)
154 def compare_and_repair_nodekeys(self):
156 self.c.modules.sys.path.append("/tmp/source/")
157 self.c.modules.os.chdir('/tmp/source')
159 log = c.modules.BootManager.log('/tmp/new.log')
160 bm = c.modules.BootManager.BootManager(log,'boot')
162 BootManagerException = c.modules.Exceptions.BootManagerException
163 InitializeBootManager = c.modules.BootManager.InitializeBootManager
164 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's cached view of this node, for the key comparison below.
167 plcnode = plccache.GetNodeByName(self.node)
169 InitializeBootManager.Run(bm.VARS, bm.LOG)
170 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
175 print " Possibly, unable to find valid configuration file"
178 print " NODE: %s" % bm.VARS['NODE_KEY']
179 print " PLC : %s" % plcnode['key']
181 if bm.VARS['NODE_KEY'] == plcnode['key']:
# presumably the mismatch branch: sync PLC to the node's key -- TODO
# confirm against the elided lines.
184 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
185 print " Successfully updated NODE_KEY with PLC"
190 #for key in bm.VARS.keys():
191 # print key, " == ", bm.VARS[key]
193 print " Unable to retrieve NODE_KEY"
# True iff the BootManager lock file exists on the node; the actual
# return statements are on elided lines.
195 def bootmanager_running(self):
196 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Set this node's boot_state at PLC (default 'boot').

    Returns whatever api.UpdateNode returns.  NOTE(review): reformatted
    from a garbled numbered listing; tokens/behavior unchanged.
    """
    return api.UpdateNode(self.node, {'boot_state' : state})
# Fetch the node's current boot_state from PLC; on any API failure
# (the try/except lines are elided) fall back to the last value cached
# by the monitor's FindbadNodeRecord table.
204 def get_nodestate(self):
206 return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
208 traceback.print_exc()
209 # NOTE: use last cached value from plc
210 fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
211 return fbnode['plc_node_stats']['boot_state']
# Set the node's PLC boot_state to *state*, then reboot it.  First pass
# (per-day 'gentlekill' flag unset): kill slice processes and issue a
# clean `shutdown -r`.  Later passes escalate to a sysrq s-u-b reboot.
214 def restart_node(self, state='boot'):
215 api.UpdateNode(self.node, {'boot_state' : state})
# One-day persistent flag decides clean vs. forced restart.
217 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
218 if not pflags.getRecentFlag('gentlekill'):
219 print " Killing all slice processes... : %s" % self.node
220 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
221 self.c.modules.os.system(cmd_slicekill)
222 cmd = """ shutdown -r +1 & """
223 print " Restarting %s : %s" % ( self.node, cmd)
224 self.c.modules.os.system(cmd)
226 pflags.setRecentFlag('gentlekill')
# Forced path: sync disks, remount read-only, reboot via sysrq-trigger.
229 print " Restarting with sysrq 'sub' %s" % self.node
230 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
231 self.c.modules.os.system(cmd)
# Re-run BootManager.py on the node with *forceState* (e.g. 'boot' or
# 'reinstall'), guarded by the /tmp/BM_RUNNING lock file.  The command
# string's closing segment is on an elided line.
235 def restart_bootmanager(self, forceState):
237 self.c.modules.os.chdir('/tmp/source')
238 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
239 print " BootManager is already running: try again soon..."
241 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
242 cmd = "( touch /tmp/BM_RUNNING ; " + \
243 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
244 " rm -f /tmp/BM_RUNNING " + \
246 cmd = cmd % forceState
247 self.c.modules.os.system(cmd)
# Maintains an ssh tunnel to a node plus an Rpyc server on it; hands out
# NodeConnection objects over the tunnel (see get_connection below).
252 class PlanetLabSession:
# Class-wide base port for local tunnel endpoints; randomized per run
# and incremented per session (see setup_host).
253 globalport = 22000 + int(random.random()*1000)
# Constructor; some attribute initialization (and presumably the
# setup_host call) is on elided lines.
255 def __init__(self, node, nosetup, verbose):
256 self.verbose = verbose
259 self.nosetup = nosetup
# Return a NodeConnection speaking Rpyc through the local tunnel port.
# Retried once (the try/except lines are elided) because the first
# connect can race the remote server start.
263 def get_connection(self, config):
265 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
267 # NOTE: try twice since this can sometimes fail the first time. If
268 # it fails again, let it go.
269 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
# Push the Rpyc sources to the node, (re)start the Rpyc forking server
# there, and open a local port-forwarding ssh tunnel to it.  Raises
# ExceptionDoubleSSHError when rsync fails even after refreshing the
# node's known_hosts entry.  (Many scaffolding lines are elided.)
272 def setup_host(self):
# Allocate this session's local tunnel port.
273 self.port = PlanetLabSession.globalport
274 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
277 args['port'] = self.port
278 args['user'] = 'root'
279 args['hostname'] = self.node
280 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
284 print "Skipping setup"
287 # COPY Rpyc files to host
288 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
289 cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
290 if self.verbose: print cmd
294 localos = moncommands.CMD()
296 ret = localos.system(cmd, timeout)
# First failure: assume a stale host key, refresh it, and retry once.
299 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
300 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
301 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
302 print "trying: ", cmd
303 print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
304 ret = localos.system(cmd, timeout)
# Second failure is fatal for this session.
307 print "\tFAILED TWICE"
308 #email_exception("%s rsync failed twice" % self.node)
309 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
312 # KILL any already running servers.
313 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
314 (ov,ev) = ssh.run_noexcept2("""<<\EOF
316 echo "kill server" >> out.log
317 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
318 echo "export" >> out.log
319 export PYTHONPATH=$HOME ;
320 echo "start server" >> out.log
321 python Rpyc/Servers/forking_server.py &> server.log &
322 echo "done" >> out.log
324 print "setup rpyc server over ssh"
328 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
329 # and the following options seems to work well.
330 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
331 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
332 """-o ConnectTimeout=120 """ + \
333 """-n -N -L %(port)s:localhost:18812 """ + \
334 """%(user)s@%(hostname)s"""
336 if self.verbose: print cmd
# Open the tunnel; LocalCommand prints READY on the child's stdout once
# the connection is up.
338 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
339 # TODO: the read() here may block indefinitely. Need a better
340 # approach therefore, that includes a timeout.
341 #ret = self.command.stdout.read(5)
342 ret = moncommands.read_t(self.command.stdout, 5)
346 # NOTE: There is still a slight race for machines that are slow...
347 self.timeout = 2*(t2-t1)
348 print "Sleeping for %s sec" % self.timeout
349 time.sleep(self.timeout)
# Tunnel child exited already => port forwarding failed.
352 if self.command.returncode is not None:
353 print "Failed to establish tunnel!"
354 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
356 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# (method header elided -- session teardown: kills the ssh tunnel child)
360 if self.verbose: print "Killing SSH session %s" % self.port
361 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Project one column out of a list of step tuples.

    *steps* is a sequence of (id, pattern) pairs; index=1 (the default)
    yields the pattern column, index=0 the id column.  Returns a real
    list (the original used Python-2 map(), which returns a list there;
    the comprehension keeps that contract under Python 3 as well).
    NOTE(review): reconstructed from a garbled numbered listing.
    """
    return [step[index] for step in steps]
# Map a pexpect match index back to the step id in *steps*; the branch
# for the pexpect.EOF match (index == len(steps)) is on elided lines and
# presumably returns a sentinel such as "done" -- TODO confirm.
368 def index_to_id(steps,index):
369 if index < len(steps):
370 return steps[index][0]
# Wraps the diagnosis of a single node in debug state: builds the
# PlanetLabSession/NodeConnection and knows the BootManager log
# patterns used to classify failures.
374 class DebugInterface:
375 def __init__(self, hostname):
376 self.hostname = hostname
# (initialization of self.session is on elided lines)
# Establish a PlanetLabSession and return a NodeConnection for this
# host; much of the try/except scaffolding (including the failure
# return, presumably False) is on elided lines.
379 def getConnection(self):
380 print "Creating session for %s" % self.hostname
381 # update known_hosts file (in case the node has rebooted since last run)
383 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
386 print traceback.print_exc()
389 msg = "ERROR setting up session for %s" % self.hostname
# Debug path forces setup+verbose; normal path honors config flags.
392 self.session = PlanetLabSession(self.hostname, False, True)
394 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
395 except ExceptionDoubleSSHError, e:
399 traceback.print_exc()
404 conn = self.session.get_connection(config)
406 # NOTE: sometimes the wait in setup_host() is not long enough.
407 # So, here we try to wait a little longer before giving up entirely.
409 time.sleep(self.session.timeout*5)
410 conn = self.session.get_connection(config)
412 # failed twice... no need to report this really, it's just in a
416 traceback.print_exc()
417 email_exception(self.hostname)
419 #print "trying to use conn before returning it."
420 #print conn.c.modules.sys.path
421 #print conn.c.modules.os.path.exists('/tmp/source')
424 #print "conn: %s" % conn
427 def getSequences(self):
429 # TODO: This can be replaced with a DB definition at a future time.
430 # This would make it possible for an admin to introduce new
431 # patterns without touching code.
434 # restart_bootmanager_boot
435 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
436 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
437 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
439 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
441 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
442 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
443 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
444 "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
445 "bminit-cfg-auth-getplc-update-debug-done",
446 "bminit-cfg-auth-protoerror2-debug-done",
447 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
448 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
449 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
450 "bminit-cfg-auth-protoerror-exception-update-debug-done",
451 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
452 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
453 "bminit-cfg-auth-authfail2-protoerror2-debug-done",
455 sequences.update({n : "restart_bootmanager_boot"})
457 # conn.restart_bootmanager('reinstall')
458 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
459 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
460 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
461 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
462 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
463 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
464 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
465 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
466 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
467 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
468 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
469 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
470 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
471 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
472 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
473 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
474 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
475 # actual solution appears to involve removing the bad files, and
476 # continually trying to boot the node.
477 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
478 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
479 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
480 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
481 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
482 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
483 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
484 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
486 sequences.update({n : "restart_bootmanager_rins"})
489 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
490 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
491 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
492 "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
493 "bminit-cfg-auth-authfail-debug-done",
494 "bminit-cfg-auth-authfail2-authfail-debug-done",
496 sequences.update({n: "repair_node_keys"})
498 # conn.restart_node('reinstall')
499 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
500 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
501 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
502 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
503 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
504 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
505 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
506 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
507 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
508 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
509 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
510 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
512 sequences.update({n : "restart_node_rins"})
515 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
516 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
517 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
518 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
519 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
520 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
521 "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
523 sequences.update({n: "restart_node_boot"})
526 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
527 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
528 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
529 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
530 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
531 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
532 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
533 "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
534 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
535 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
536 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
537 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
538 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
539 "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
541 sequences.update({n : "fsck_repair"})
544 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
545 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
546 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
547 "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
548 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
549 "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
550 "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
552 sequences.update({n : "nodeconfig_notice"})
554 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
555 "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
556 "bminit-cfg-update-exception-nodehostname-update-debug-done",
557 "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
559 sequences.update({n : "nodenetwork_email"})
561 # noblockdevice_notice
562 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
563 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
564 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
565 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
566 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
567 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
568 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
570 sequences.update({n : "noblockdevice_notice"})
572 # update_bootcd_email
573 for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
575 sequences.update({n : "update_bootcd_email"})
577 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
579 sequences.update({n: "unknownsequence_notice"})
581 # minimalhardware_notice
582 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
583 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
586 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
590 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
591 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
593 sequences.update( { n : "baddns_notice"})
597 def getDiskSteps(self):
599 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
600 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
601 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
603 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
605 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
606 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
608 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
609 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
611 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
612 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
614 ('floppytimeout','floppy0: floppy timeout called'),
615 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
617 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
618 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
620 # floppy0: floppy timeout called
621 # end_request: I/O error, dev fd0, sector 0
623 # Buffer I/O error on device dm-2, logical block 8888896
624 # ata1: status=0x51 { DriveReady SeekComplete Error }
625 # ata1: error=0x40 { UncorrectableError }
626 # SCSI error : <0 0 0 0> return code = 0x8000002
627 # sda: Current: sense key: Medium Error
628 # Additional sense: Unrecovered read error - auto reallocate failed
630 # SCSI error : <0 2 0 0> return code = 0x40001
631 # end_request: I/O error, dev sda, sector 572489600
# Scan the dmesg stream (a pexpect child) against the disk-error
# patterns; the loop/accumulator scaffolding is on elided lines.
635 def getDiskSequence(self, steps, child):
638 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
645 def getBootManagerStepPatterns(self):
647 ('bminit' , 'Initializing the BootManager.'),
648 ('cfg' , 'Reading node configuration file.'),
649 ('auth' , 'Authenticating node with PLC.'),
650 ('getplc' , 'Retrieving details of node from PLC.'),
651 ('update' , 'Updating node boot state at PLC.'),
652 ('hardware' , 'Checking if hardware requirements met.'),
653 ('installinit' , 'Install: Initializing.'),
654 ('installdisk' , 'Install: partitioning disks.'),
655 ('installbootfs', 'Install: bootstrapfs tarball.'),
656 ('installcfg' , 'Install: Writing configuration files.'),
657 ('installstop' , 'Install: Shutting down installer.'),
658 ('update2' , 'Updating node boot state at PLC.'),
659 ('installinit2' , 'Install: Initializing.'),
660 ('validate' , 'Validating node installation.'),
661 ('rebuildinitrd', 'Rebuilding initrd'),
662 ('netcfg' , 'Install: Writing Network Configuration files.'),
663 ('update3' , 'Updating node configuration.'),
664 ('disk' , 'Checking for unused disks to add to LVM.'),
665 ('update4' , 'Sending hardware configuration to PLC.'),
666 ('debug' , 'Starting debug mode'),
667 ('bmexceptmount', 'BootManagerException during mount'),
668 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
669 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
670 ('exception' , 'Exception'),
671 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
672 ('protoerror2' , '500 Internal Server Error'),
673 ('protoerror' , 'XML RPC protocol error'),
674 ('nodehostname' , 'Configured node hostname does not resolve'),
675 ('implementerror', 'Implementation Error'),
676 ('fsckabort' , 'is mounted. e2fsck: Cannot continue, aborting'),
677 ('fsckfail' , 'Running e2fsck -v -p /dev/planetlab/root failed'),
678 ('fsckfail2' , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
679 ('readonlyfs' , '\[Errno 30\] Read-only file system'),
680 ('baddisk' , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
681 ('noinstall' , 'notinstalled'),
682 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
683 ('noblockdev' , "No block devices detected."),
684 ('dnserror' , 'Name or service not known'),
685 ('noconfig' , "Unable to find and read a node configuration file"),
686 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
687 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
688 ('hardwarerequirefail' , 'Hardware requirements not met'),
689 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
690 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
691 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
692 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
693 ('modulefail' , 'Unable to get list of system modules'),
694 ('writeerror' , 'write error: No space left on device'),
695 ('nospace' , "No space left on device"),
696 ('nonode' , 'Failed to authenticate call: No such node'),
697 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
698 ('authfail2' , 'Authentication Failed'),
699 ('bootcheckfail' , 'BootCheckAuthentication'),
700 ('bootupdatefail' , 'BootUpdateNode'),
# Walk the bm.log pexpect child through the step patterns, building the
# ordered list of matched step ids; the surrounding loop and the return
# are on elided lines.
704 def getBootManagerSequenceFromLog(self, steps, child):
708 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
709 id = index_to_id(steps,index)
712 if id == "exception":
713 print "...Found An Exception!!!"
714 elif id == "done": #index == len(steps_to_list(steps)):
# Public entry point: run restore_basic and (on elided lines,
# presumably) tear down the session before returning the result.
720 def restore(sitehist, hostname, config=None, forced_action=None):
721 ret = restore_basic(sitehist, hostname, config, forced_action)
# Diagnose and (attempt to) repair a node stuck in the BootManager "debug"
# state.  High-level flow visible in this view:
#   1. refuse nodes with an out-of-date BootCD (notify owner, disable node);
#   2. connect via DebugInterface; bail out on connect failure or if the
#      node is not in "debug" state / BootManager is already running;
#   3. scan dmesg for disk-error patterns and notify/skip on bad disks;
#   4. scan bm.log, join the matched step names into a sequence string `s`,
#      and dispatch a repair action or an owner/operator notice based on
#      the known-sequence table from debugnode.getSequences();
#   5. return a short action-name string describing what was done.
# NOTE(review): the extraction has numbering gaps throughout (e.g. the
# `args = {}` initializations, try/except wrappers, and `else:` lines are
# missing) -- verify every change against the full source file.
725 def restore_basic(sitehist, hostname, config=None, forced_action=None):
727 # NOTE: Nothing works if the bootcd is REALLY old.
728 # So, this is the first step.
# Default return value when no known sequence matches and no action is taken.
730 bootman_action = "unknown"
# Latest monitoring record for this host plus recent action history for the
# site; found_within() below rate-limits repeat notices against this history.
732 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
733 recent_actions = sitehist.getRecentActions(hostname=hostname)
# Old BootCD: notify the owner (at most every 3.5 days) and disable the node.
735 if fbnode['observed_category'] == "OLDBOOTCD":
736 print "\t...Notify owner to update BootImage!!!"
738 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
739 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
741 print "\tDisabling %s due to out-of-date BootImage" % hostname
742 api.UpdateNode(hostname, {'boot_state' : 'disabled'})
744 # NOTE: nothing else is possible.
747 debugnode = DebugInterface(hostname)
748 conn = debugnode.getConnection()
# NOTE(review): getConnection() apparently returns False on failure;
# `type(conn) == type(False)` would be clearer as `conn is False` or
# `isinstance(conn, bool)` -- left as-is in this doc-only pass.
749 if type(conn) == type(False): return "connect_failed"
# Only nodes actually sitting in "debug" state are handled here.
751 boot_state = conn.get_boot_state()
752 if boot_state != "debug":
753 print "... %s in %s state: skipping..." % (hostname , boot_state)
754 return "skipped" #boot_state == "boot"
756 if conn.bootmanager_running():
757 print "...BootManager is currently running. Skipping host %s" %hostname
758 return "skipped" # True
760 # Read persistent flags, tagged on one week intervals.
# --- dmesg scan for disk errors -------------------------------------------
762 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
763 dmesg = conn.get_dmesg()
764 child = fdpexpect.fdspawn(dmesg)
766 steps = debugnode.getDiskSteps()
767 sequence = debugnode.getDiskSequence(steps, child)
# NOTE(review): `s` is used below but its assignment (presumably built from
# `sequence`, original lines 768-769) is missing from this extraction --
# confirm what `s` holds at this point before changing this section.
770 if config and not config.quiet: print "\tSET: ", s
773 print "...Potential drive errors on %s" % hostname
# A lone floppy error (set of exactly two entries including 'floppyerror')
# is tolerated; anything else triggers the baddisk path below.
774 if len(s) == 2 and 'floppyerror' in s:
775 print "...Should investigate. Continuing with node."
777 print "...Should investigate. Skipping node."
778 # TODO: send message related to these errors.
# Bad-disk notice is rate-limited to once per 7 days.
780 if not found_within(recent_actions, 'baddisk_notice', 7):
781 print "baddisk_notice not found recently"
783 log=conn.get_dmesg().read()
784 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
785 #conn.set_nodestate('disabled')
787 return "skipping_baddisk"
# --- bm.log scan and sequence dispatch ------------------------------------
789 print "...Downloading bm.log from %s" %hostname
790 log = conn.get_bootmanager_log()
791 bm_log_data = log.read() # get data
792 log.seek(0) # reset fd pointer for fdspawn
793 child = fdpexpect.fdspawn(log)
# --collect mode: stop after downloading logs, take no action.
795 if hasattr(config, 'collect') and config.collect: return "collect"
797 if config and not config.quiet: print "...Scanning bm.log for errors"
801 steps = debugnode.getBootManagerStepPatterns()
802 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
# The joined step names form the key into the known-sequence table.
804 s = "-".join(sequence)
805 print " FOUND SEQUENCE: ", s
807 # NOTE: We get or set the flag based on the current sequence identifier.
808 # By using the sequence identifier, we guarantee that there will be no
809 # frequent loops. I'm guessing there is a better way to track loops,
812 sequences = debugnode.getSequences()
# Unknown sequence: email the operator with the raw bm.log, then restart
# BootManager in 'boot' mode as a best-effort recovery.
815 if s not in sequences:
816 print " HOST %s" % hostname
817 print " UNKNOWN SEQUENCE: %s" % s
# NOTE(review): the `args = {}` initialization (original lines 818-819) is
# missing from this extraction.
820 args['hostname'] = hostname
822 args['bmlog'] = bm_log_data
823 args['viart'] = False
824 args['saveact'] = True
825 args['ccemail'] = True
827 sitehist.sendMessage('unknownsequence_notice', **args)
829 conn.restart_bootmanager('boot')
831 bootman_action = "restart_bootmanager"
833 # NOTE: Do not set the pflags value for this sequence if it's unknown.
834 # This way, we can check it again after we've fixed it.
# Known sequence: the table value names the action; it is also the value
# returned to the caller.
838 bootman_action = sequences[s]
840 if sequences[s] == "restart_bootmanager_boot":
841 print "...Restarting BootManager.py on %s "%hostname
842 conn.restart_bootmanager('boot')
843 elif sequences[s] == "restart_bootmanager_rins":
844 print "...Restarting BootManager.py on %s "%hostname
845 conn.restart_bootmanager('reinstall')
846 elif sequences[s] == "restart_node_rins":
847 conn.restart_node('reinstall')
848 elif sequences[s] == "restart_node_boot":
849 conn.restart_node('boot')
850 elif sequences[s] == "fsck_repair":
851 conn.fsck_repair_node()
852 elif sequences[s] == "repair_node_keys":
# If the node keys can be brought in sync, restart BootManager in whatever
# state PLC currently records for the node.
853 if conn.compare_and_repair_nodekeys():
854 # the keys either are in sync or were forced in sync.
855 # so try to start BM again.
856 conn.restart_bootmanager(conn.get_nodestate())
858 # there was some failure to synchronize the keys.
859 print "...Unable to repair node keys on %s" %hostname
# Key repair failed: fall back to a (rate-limited) nodeconfig notice.
860 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
862 args['hostname'] = hostname
863 sitehist.sendMessage('nodeconfig_notice', **args)
864 conn.dump_plconf_file()
866 # NOTE: do not add a new action record
869 elif sequences[s] == "unknownsequence_notice":
871 args['hostname'] = hostname
873 args['bmlog'] = bm_log_data
874 args['viart'] = False
875 args['saveact'] = True
876 args['ccemail'] = True
878 sitehist.sendMessage('unknownsequence_notice', **args)
879 conn.restart_bootmanager('boot')
881 elif sequences[s] == "nodeconfig_notice":
883 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
885 args['hostname'] = hostname
886 sitehist.sendMessage('nodeconfig_notice', **args)
887 conn.dump_plconf_file()
889 # NOTE: do not add a new action record
# nodenetwork_email reuses the nodeconfig notice but attaches the bm.log.
892 elif sequences[s] == "nodenetwork_email":
894 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
896 args['hostname'] = hostname
897 args['bmlog'] = bm_log_data
898 sitehist.sendMessage('nodeconfig_notice', **args)
899 conn.dump_plconf_file()
901 # NOTE: do not add a new action record
904 elif sequences[s] == "noblockdevice_notice":
906 if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
908 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
909 args['hostname'] = hostname
911 sitehist.sendMessage('noblockdevice_notice', **args)
913 # NOTE: do not add a new action record
916 elif sequences[s] == "baddisk_notice":
917 # MAKE An ACTION record that this host has failed hardware. May
918 # require either an exception "/minhw" or other manual intervention.
919 # Definitely need to send out some more EMAIL.
920 # TODO: email notice of broken hardware
921 if not found_within(recent_actions, 'baddisk_notice', 7):
922 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
924 args['hostname'] = hostname
# Attach a fresh dmesg dump as evidence of the disk failure.
925 args['log'] = conn.get_dmesg().read()
927 sitehist.sendMessage('baddisk_notice', **args)
928 #conn.set_nodestate('disabled')
930 # NOTE: do not add a new action record
933 elif sequences[s] == "minimalhardware_notice":
934 if not found_within(recent_actions, 'minimalhardware_notice', 7):
935 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
937 args['hostname'] = hostname
938 args['bmlog'] = bm_log_data
939 sitehist.sendMessage('minimalhardware_notice', **args)
941 # NOTE: do not add a new action record
# baddns is rate-limited to once per day (1), tighter than the other notices.
944 elif sequences[s] == "baddns_notice":
945 if not found_within(recent_actions, 'baddns_notice', 1):
946 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# NOTE(review): extraction gaps 947-948 / 951-952 likely held the
# try:/except around these PLC API calls -- confirm against the full file.
949 node = plccache.GetNodeByName(hostname)
950 net = api.GetInterfaces(node['interface_ids'])[0]
# NOTE(review): traceback.print_exc() prints the traceback itself and
# returns None, so this line additionally prints "None" -- probable bug,
# left untouched in this doc-only pass.
953 print traceback.print_exc()
954 # TODO: api error. skip email, b/c all info is not available,
955 # flag_set will not be recorded.
# Render the first interface's config into the text included in the email.
957 nodenet_str = network_config_to_str(net)
959 args['hostname'] = hostname
960 args['network_config'] = nodenet_str
961 args['interface_id'] = net['interface_id']
963 sitehist.sendMessage('baddns_notice', **args)
965 # NOTE: do not add a new action record
# Report which action (or notice) was taken for this host.
968 return bootman_action
971 # MAIN -------------------------------------------------------------------
# Command-line driver: build the option parser, resolve the target node
# list, and run restore() for each node.
# NOTE(review): the extraction omits original lines 972-973; the original
# probably wraps this section in a `def main():` -- confirm before editing.
974 from monitor import parser as parsermodule
975 parser = parsermodule.getParser()
# Defaults for the flags below.  NOTE(review): `nonet` is not listed here
# even though --nonet is defined below; optparse then defaults it to None.
977 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
978 force=None, quiet=False)
979 parser.add_option("", "--child", dest="child", action="store_true",
980 help="This is the child mode of this process.")
981 parser.add_option("", "--force", dest="force", metavar="boot_state",
982 help="Force a boot state passed to BootManager.py.")
983 parser.add_option("", "--quiet", dest="quiet", action="store_true",
984 help="Extra quiet output messages.")
985 parser.add_option("", "--verbose", dest="verbose", action="store_true",
986 help="Extra debug output messages.")
987 parser.add_option("", "--nonet", dest="nonet", action="store_true",
988 help="Do not setup the network, use existing log files to re-run a test pass.")
989 parser.add_option("", "--collect", dest="collect", action="store_true",
990 help="No action, just collect dmesg, and bm.log")
991 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
992 help="Do not perform the orginary setup phase.")
# Layer the shared 'nodesets'/'defaults' option groups on top, then parse.
994 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
995 config = parsermodule.parse_args(parser)
# NOTE(review): the surrounding if/else and the per-node loop header
# (original lines 996-1006) are missing from this extraction: nodes come
# either from --nodelist (a file) or from a single --node argument.
998 nodes = config.getListFromFile(config.nodelist)
1000 nodes = [ config.node ]
# Map hostname -> login_base to fetch the owning site's interface object.
1007 lb = plccache.plcdb_hn2lb[node]
1008 sitehist = SiteInterface.get_or_make(loginbase=lb)
1009 #reboot(node, config)
# NOTE(review): config=None here discards the parsed `config` object, so
# restore_basic() never sees --quiet/--collect from the command line --
# looks like a bug; confirm intent before "fixing".
1010 restore(sitehist, node, config=None, forced_action=None)
1012 if __name__ == "__main__":