3 # Attempt to reboot a node in debug state.
14 from monitor.getsshkeys import SSHKnownHosts
15 from monitor.Rpyc import SocketConnection, Async
16 from monitor.Rpyc.Utils import *
18 from monitor import getconf
19 from monitor import config
20 from monitor import const
21 from monitor.model import *
22 from monitor.common import email_exception, found_within
23 from monitor.database.info.model import *
24 from monitor.database.info.interface import *
25 from monitor.wrapper import plc
26 from monitor.wrapper import plccache
27 from monitor.wrapper.emailTxt import mailtxt
28 from monitor.nodeconfig import network_config_to_str
30 from pcucontrol.util import command as moncommands
31 from pcucontrol.util.command import Sopen
32 from pcucontrol.transports.ssh import pxssh as pxssh
33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
34 from pcucontrol.transports.ssh import pexpect as pexpect
36 api = plc.getAuthAPI()
def bootmanager_log_name(hostname):
    """Return the relative path under which a host's bm.log is archived.

    The result has the form ``history/<YYYY-MM-DD-HH:MM>-bm.<hostname>.log``
    and is stamped with the current local time, so downloads taken in
    different minutes get different names.

    hostname -- node hostname embedded in the file name.
    """
    t_stamp = time.strftime("%Y-%m-%d-%H:%M")
    base_filename = "%s-bm.%s.log" % (t_stamp, hostname)
    return os.path.join('history', base_filename)
45 def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
47 node = FindbadNodeRecord.get_latest_by(hostname=hostname)
48 loginbase = PlcSite.query.get(node.plc_node_stats['site_id']).plc_site_stats['login_base']
52 err = traceback.format_exc()
54 act = ActionRecord(loginbase=loginbase,
58 log_path=short_log_path,
60 session.flush(); session.clear()
64 class ExceptionDoubleSSHError(Exception): pass
67 def __init__(self, connection, node, config):
72 def get_boot_state(self):
74 if self.c.modules.os.path.exists('/tmp/source'):
76 elif self.c.modules.os.path.exists('/vservers'):
82 print self.c.modules.sys.path
90 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
91 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
92 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
93 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
94 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# Collect the node's BootManager log and open the local copy for reading.
# The file handle is bound to `log`; what is returned is not visible in this
# listing.
97 def get_bootmanager_log(self):
# Timestamped name under 'history/' for this particular download.
98 bm_name = bootmanager_log_name(self.node)
# presumably Rpyc's download(connection, remote_path, local_path) -- confirm
99 download(self.c, "/tmp/bm.log", "%s/%s" % (config.MONITOR_BOOTMANAGER_LOG, bm_name))
# Record the collection under the "collected_bm.log" log type.
100 bootmanager_log_action(self.node, bm_name, "collected_bm.log")
# Copy the timestamped download to the fixed name bm.<node>.log.
101 os.system("cp %s/%s %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, bm_name, config.MONITOR_BOOTMANAGER_LOG, self.node))
102 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
106 # def get_dmesg(self):
107 # self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
108 # download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
109 # log = open("log/dmesg.%s.log" % self.node, 'r')
112 # def get_bootmanager_log(self):
113 # download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
114 # #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
115 # os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
116 # log = open("log/bm.%s.log" % self.node, 'r')
119 def dump_plconf_file(self):
121 self.c.modules.sys.path.append("/tmp/source/")
122 self.c.modules.os.chdir('/tmp/source')
124 log = c.modules.BootManager.log('/tmp/new.log')
125 bm = c.modules.BootManager.BootManager(log,'boot')
127 BootManagerException = c.modules.Exceptions.BootManagerException
128 InitializeBootManager = c.modules.BootManager.InitializeBootManager
129 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
132 InitializeBootManager.Run(bm.VARS, bm.LOG)
133 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
137 print " Possibly, unable to find valid configuration file"
140 for key in bm.VARS.keys():
141 print key, " == ", bm.VARS[key]
143 print " Unable to read Node Configuration"
# Run a manual fsck of the node's root and vservers volumes and then start
# BootManager.py, all through os.system on the self.c connection.
# /tmp/BM_RUNNING is used as a marker file: it is touched before the work
# starts and removed afterwards, and its presence is checked first.
145 def fsck_repair_node(self):
147 self.c.modules.sys.path.append("/tmp/source/")
148 self.c.modules.os.chdir('/tmp/source')
150 # TODO: set boot state to the node's actual boot state.
151 # could be 'boot' or 'safeboot'
152 self.c.modules.os.chdir('/tmp/source')
153 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
154 print "Running MANUAL FSCK already... try again soon."
156 print "Running MANUAL fsck on %s" % self.node
# The shell command below: fsck output of both volumes goes to out.fsck,
# BootManager.py output goes to server.log. The single %s is filled with
# the boot state returned by get_nodestate().
157 cmd = "( touch /tmp/BM_RUNNING ; " + \
158 " fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
159 " fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
160 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
161 " rm -f /tmp/BM_RUNNING " + \
163 cmd = cmd % self.get_nodestate()
164 self.c.modules.os.system(cmd)
165 #self.restart_bootmanager('boot')
168 def compare_and_repair_nodekeys(self):
170 self.c.modules.sys.path.append("/tmp/source/")
171 self.c.modules.os.chdir('/tmp/source')
173 log = c.modules.BootManager.log('/tmp/new.log')
174 bm = c.modules.BootManager.BootManager(log,'boot')
176 BootManagerException = c.modules.Exceptions.BootManagerException
177 InitializeBootManager = c.modules.BootManager.InitializeBootManager
178 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
181 plcnode = plccache.GetNodeByName(self.node)
183 InitializeBootManager.Run(bm.VARS, bm.LOG)
184 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
189 print " Possibly, unable to find valid configuration file"
192 print " NODE: %s" % bm.VARS['NODE_KEY']
193 print " PLC : %s" % plcnode['key']
195 if bm.VARS['NODE_KEY'] == plcnode['key']:
198 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
199 print " Successfully updated NODE_KEY with PLC"
204 #for key in bm.VARS.keys():
205 # print key, " == ", bm.VARS[key]
207 print " Unable to retrieve NODE_KEY"
# Test, through the self.c connection, for the /tmp/BM_RUNNING marker file
# that the BootManager start commands in this file touch before running
# BootManager.py and remove afterwards.
209 def bootmanager_running(self):
210 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def set_nodestate(self, state='boot'):
    """Set this node's boot_state at PLC.

    state -- boot state value to store (defaults to 'boot').

    Returns whatever api.UpdateNode returns for the update.
    """
    return api.UpdateNode(self.node, {'boot_state': state})
# Return the node's boot_state.
# Primary source: a GetNodes query against the PLC api.
# Fallback source: the 'plc_node_stats' of the latest FindbadNodeRecord.
# NOTE(review): the control-flow lines that select the fallback are not
# visible in this listing; presumably it runs when the api call raises.
218 def get_nodestate(self):
220 return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
222 traceback.print_exc()
223 # NOTE: use last cached value from plc
224 fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
225 return fbnode['plc_node_stats']['boot_state']
# Set the node's boot_state at PLC to `state`, then reboot the node by
# running shell commands through os.system on the self.c connection.
# Two reboot methods appear below: a 'gentlekill' path (kill slice
# processes, then `shutdown -r +1`) and a sysrq path.
228 def restart_node(self, state='boot'):
229 api.UpdateNode(self.node, {'boot_state' : state})
# 1*60*60*24 is one day in seconds; presumably the lifetime of the flag.
231 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
# Gentle path: taken only when no recent 'gentlekill' flag is recorded.
232 if not pflags.getRecentFlag('gentlekill'):
233 print " Killing all slice processes... : %s" % self.node
# Sends signal 9 via vkill to every numeric context under /proc/virtual.
234 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
235 self.c.modules.os.system(cmd_slicekill)
# Schedule a reboot one minute from now, in the background.
236 cmd = """ shutdown -r +1 & """
237 print " Restarting %s : %s" % ( self.node, cmd)
238 self.c.modules.os.system(cmd)
240 pflags.setRecentFlag('gentlekill')
# sysrq path: after 5 seconds write 's', 'u', 'b' to /proc/sysrq-trigger
# (Linux magic SysRq: sync, remount read-only, reboot).
243 print " Restarting with sysrq 'sub' %s" % self.node
244 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
245 self.c.modules.os.system(cmd)
# Start BootManager.py on the node with `forceState` as its argument,
# through os.system on the self.c connection.
# forceState -- value substituted for the single %s in the command line.
249 def restart_bootmanager(self, forceState):
251 self.c.modules.os.chdir('/tmp/source')
# /tmp/BM_RUNNING is the marker touched and removed by the command below.
252 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
253 print " BootManager is already running: try again soon..."
255 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
# BootManager.py output goes to server.log; stdin is /dev/null.
256 cmd = "( touch /tmp/BM_RUNNING ; " + \
257 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
258 " rm -f /tmp/BM_RUNNING " + \
260 cmd = cmd % forceState
261 self.c.modules.os.system(cmd)
266 class PlanetLabSession:
267 globalport = 22000 + int(random.random()*1000)
269 def __init__(self, node, nosetup, verbose):
270 self.verbose = verbose
273 self.nosetup = nosetup
277 def get_connection(self, config):
279 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
281 # NOTE: try twice since this can sometimes fail the first time. If
282 # it fails again, let it go.
283 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
286 def setup_host(self):
287 self.port = PlanetLabSession.globalport
288 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
291 args['port'] = self.port
292 args['user'] = 'root'
293 args['hostname'] = self.node
294 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
298 print "Skipping setup"
301 # COPY Rpyc files to host
302 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
303 cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
304 if self.verbose: print cmd
308 localos = moncommands.CMD()
310 ret = localos.system(cmd, timeout)
313 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
314 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
315 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
316 print "trying: ", cmd
317 print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
318 ret = localos.system(cmd, timeout)
321 print "\tFAILED TWICE"
322 #email_exception("%s rsync failed twice" % self.node)
323 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
326 # KILL any already running servers.
327 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
328 (ov,ev) = ssh.run_noexcept2("""<<\EOF
330 echo "kill server" >> out.log
331 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
332 echo "export" >> out.log
333 export PYTHONPATH=$HOME ;
334 echo "start server" >> out.log
335 python Rpyc/Servers/forking_server.py &> server.log &
336 echo "done" >> out.log
338 print "setup rpyc server over ssh"
342 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
343 # and the following options seems to work well.
344 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
345 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
346 """-o ConnectTimeout=120 """ + \
347 """-n -N -L %(port)s:localhost:18812 """ + \
348 """%(user)s@%(hostname)s"""
350 if self.verbose: print cmd
352 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
353 # TODO: the read() here may block indefinitely. Need a better
354 # approach therefore, that includes a timeout.
355 #ret = self.command.stdout.read(5)
356 ret = moncommands.read_t(self.command.stdout, 5)
360 # NOTE: There is still a slight race for machines that are slow...
361 self.timeout = 2*(t2-t1)
362 print "Sleeping for %s sec" % self.timeout
363 time.sleep(self.timeout)
366 if self.command.returncode is not None:
367 print "Failed to establish tunnel!"
368 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
370 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
374 if self.verbose: print "Killing SSH session %s" % self.port
375 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Return a list holding element `index` of every entry in `steps`.

    steps -- sequence of indexable entries, e.g. (id, pattern) tuples.
    index -- position to extract from each entry (default 1).

    A real list is always returned, so the result can be concatenated
    with another list (callers append pexpect.EOF to it).
    """
    return [step[index] for step in steps]
# Translate a match index into a step id: when `index` falls inside `steps`,
# the id is the first element of that step entry.
# NOTE(review): the out-of-range result is not visible in this listing.
382 def index_to_id(steps,index):
383 if index < len(steps):
384 return steps[index][0]
388 class DebugInterface:
389 def __init__(self, hostname):
390 self.hostname = hostname
393 def getConnection(self):
394 print "Creating session for %s" % self.hostname
395 # update known_hosts file (in case the node has rebooted since last run)
397 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
400 print traceback.print_exc()
403 msg = "ERROR setting up session for %s" % self.hostname
406 self.session = PlanetLabSession(self.hostname, False, True)
408 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
409 except ExceptionDoubleSSHError, e:
413 traceback.print_exc()
418 conn = self.session.get_connection(config)
420 # NOTE: sometimes the wait in setup_host() is not long enough.
421 # So, here we try to wait a little longer before giving up entirely.
423 time.sleep(self.session.timeout*5)
424 conn = self.session.get_connection(config)
426 # failed twice... no need to report this really, it's just in a
430 traceback.print_exc()
431 email_exception(self.hostname)
433 #print "trying to use conn before returning it."
434 #print conn.c.modules.sys.path
435 #print conn.c.modules.os.path.exists('/tmp/source')
438 #print "conn: %s" % conn
441 def getSequences(self):
443 # TODO: This can be replaced with a DB definition at a future time.
444 # This would make it possible for an admin to introduce new
445 # patterns without touching code.
448 # restart_bootmanager_boot
449 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
450 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
451 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
453 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
455 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
456 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
457 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-protoerror-debug-validate-done",
458 "bminit-cfg-auth-protoerror-exception-update-debug-validate-exception-done",
459 "bminit-cfg-auth-getplc-update-debug-done",
460 "bminit-cfg-auth-protoerror2-debug-done",
461 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
462 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
463 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
464 "bminit-cfg-auth-protoerror-exception-update-debug-done",
465 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
466 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
467 "bminit-cfg-auth-authfail2-protoerror2-debug-done",
469 sequences.update({n : "restart_bootmanager_boot"})
471 # conn.restart_bootmanager('reinstall')
472 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
473 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
474 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
475 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
476 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
477 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
478 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
479 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
480 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
481 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
482 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
483 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
484 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
485 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
486 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
487 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
488 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
489 # actual solution appears to involve removing the bad files, and
490 # continually trying to boot the node.
491 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
492 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
493 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
494 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
495 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
496 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
497 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
498 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-debug-validate-bmexceptvgscan-done",
500 sequences.update({n : "restart_bootmanager_rins"})
503 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
504 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
505 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
506 "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
507 "bminit-cfg-auth-authfail-debug-done",
508 "bminit-cfg-auth-authfail2-authfail-debug-done",
510 sequences.update({n: "repair_node_keys"})
512 # conn.restart_node('reinstall')
513 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
514 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
515 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
516 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
517 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
518 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
519 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
520 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
521 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
522 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
523 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
524 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
526 sequences.update({n : "restart_node_rins"})
529 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
530 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
531 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
532 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
533 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
534 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
535 "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
537 sequences.update({n: "restart_node_boot"})
540 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
541 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
542 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
543 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-noinstall-update-debug-validate-exception-fsckfail2-done",
544 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-done",
545 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
546 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail-done",
547 "bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-exception-debug-validate-fsckabort-exception-fsckfail-done",
548 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail2-exception-debug-validate-exception-fsckfail2-done",
549 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail2-done",
550 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
551 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-debug-validate-done",
552 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-debug-validate-exception-fsckfail-done",
553 "bminit-cfg-auth-getplc-update-debug-validate-exception-fsckfail-done",
555 sequences.update({n : "fsck_repair"})
558 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
559 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
560 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
561 "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
562 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
563 "bminit-cfg-exception-noconfig-nonode-debug-validate-exception-done",
564 "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
566 sequences.update({n : "nodeconfig_notice"})
568 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
569 "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
570 "bminit-cfg-update-exception-nodehostname-update-debug-done",
571 "bminit-cfg-exception-nodehostname-debug-validate-exception-done",
573 sequences.update({n : "nodenetwork_email"})
575 # noblockdevice_notice
576 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
577 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-validate-bmexceptvgscan-done",
578 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
579 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
580 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
581 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
582 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-debug-validate-bmexceptvgscan-done",
584 sequences.update({n : "noblockdevice_notice"})
586 # update_bootcd_email
587 for n in [ "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
589 sequences.update({n : "update_bootcd_email"})
591 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
593 sequences.update({n: "unknownsequence_notice"})
595 # minimalhardware_notice
596 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
597 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "minimalhardware_notice"})
600 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "baddisk_notice"})
604 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
605 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
607 sequences.update( { n : "baddns_notice"})
611 def getDiskSteps(self):
613 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
614 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
615 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
617 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
619 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
620 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
622 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
623 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
625 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
626 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
628 ('floppytimeout','floppy0: floppy timeout called'),
629 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
631 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
632 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
634 # floppy0: floppy timeout called
635 # end_request: I/O error, dev fd0, sector 0
637 # Buffer I/O error on device dm-2, logical block 8888896
638 # ata1: status=0x51 { DriveReady SeekComplete Error }
639 # ata1: error=0x40 { UncorrectableError }
640 # SCSI error : <0 0 0 0> return code = 0x8000002
641 # sda: Current: sense key: Medium Error
642 # Additional sense: Unrecovered read error - auto reallocate failed
644 # SCSI error : <0 2 0 0> return code = 0x40001
645 # end_request: I/O error, dev sda, sector 572489600
649 def getDiskSequence(self, steps, child):
652 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
659 def getBootManagerStepPatterns(self):
661 ('bminit' , 'Initializing the BootManager.'),
662 ('cfg' , 'Reading node configuration file.'),
663 ('auth' , 'Authenticating node with PLC.'),
664 ('getplc' , 'Retrieving details of node from PLC.'),
665 ('update' , 'Updating node boot state at PLC.'),
666 ('hardware' , 'Checking if hardware requirements met.'),
667 ('installinit' , 'Install: Initializing.'),
668 ('installdisk' , 'Install: partitioning disks.'),
669 ('installbootfs', 'Install: bootstrapfs tarball.'),
670 ('installcfg' , 'Install: Writing configuration files.'),
671 ('installstop' , 'Install: Shutting down installer.'),
672 ('update2' , 'Updating node boot state at PLC.'),
673 ('installinit2' , 'Install: Initializing.'),
674 ('validate' , 'Validating node installation.'),
675 ('rebuildinitrd', 'Rebuilding initrd'),
676 ('netcfg' , 'Install: Writing Network Configuration files.'),
677 ('update3' , 'Updating node configuration.'),
678 ('disk' , 'Checking for unused disks to add to LVM.'),
679 ('update4' , 'Sending hardware configuration to PLC.'),
680 ('debug' , 'Starting debug mode'),
681 ('bmexceptmount', 'BootManagerException during mount'),
682 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
683 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
684 ('exception' , 'Exception'),
685 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
686 ('protoerror2' , '500 Internal Server Error'),
687 ('protoerror' , 'XML RPC protocol error'),
688 ('nodehostname' , 'Configured node hostname does not resolve'),
689 ('implementerror', 'Implementation Error'),
690 ('fsckabort' , 'is mounted. e2fsck: Cannot continue, aborting'),
691 ('fsckfail' , 'Running e2fsck -v -p /dev/planetlab/root failed'),
692 ('fsckfail2' , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
693 ('readonlyfs' , '\[Errno 30\] Read-only file system'),
694 ('baddisk' , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
695 ('noinstall' , 'notinstalled'),
696 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
697 ('noblockdev' , "No block devices detected."),
698 ('dnserror' , 'Name or service not known'),
699 ('noconfig' , "Unable to find and read a node configuration file"),
700 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
701 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
702 ('hardwarerequirefail' , 'Hardware requirements not met'),
703 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
704 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
705 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
706 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
707 ('modulefail' , 'Unable to get list of system modules'),
708 ('writeerror' , 'write error: No space left on device'),
709 ('nospace' , "No space left on device"),
710 ('nonode' , 'Failed to authenticate call: No such node'),
711 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
712 ('authfail2' , 'Authentication Failed'),
713 ('bootcheckfail' , 'BootCheckAuthentication'),
714 ('bootupdatefail' , 'BootUpdateNode'),
718 def getBootManagerSequenceFromLog(self, steps, child):
722 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
723 id = index_to_id(steps,index)
726 if id == "exception":
727 print "...Found An Exception!!!"
728 elif id == "done": #index == len(steps_to_list(steps)):
# Entry point that forwards all four arguments unchanged to restore_basic
# and binds its result to `ret`.
# NOTE(review): the return statement is not visible in this listing.
734 def restore(sitehist, hostname, config=None, forced_action=None):
735 ret = restore_basic(sitehist, hostname, config, forced_action)
# restore_basic: examine a node that is in the "debug" boot state and apply
# the remedy associated with what its dmesg and BootManager logs show.
#
# Parameters:
#   sitehist      -- site history object; used here for getRecentActions()
#                    and sendMessage().
#   hostname      -- name of the node to examine.
#   config        -- optional options object; only its `quiet` and `collect`
#                    attributes are read in the visible code.
#   forced_action -- not referenced in the visible part of this function.
#
# Returns a short status string.  Values visible here: "connect_failed",
# "skipped", "skipping_baddisk", "collect", or the final `bootman_action`
# ("unknown" initially, "restart_bootmanager" for an unrecognised sequence,
# otherwise the action name looked up in debugnode.getSequences()).
#
# NOTE(review): several statements of this function are not visible in this
# excerpt (the embedded line numbers have gaps), so additional branches,
# assignments or returns may exist -- confirm against the full file.
739 def restore_basic(sitehist, hostname, config=None, forced_action=None):
741 # NOTE: Nothing works if the bootcd is REALLY old.
742 # So, this is the first step.
744 bootman_action = "unknown"
# Latest monitoring record for the node, plus the actions recently taken for
# it; the latter is used below to rate-limit notices.
746 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
747 recent_actions = sitehist.getRecentActions(hostname=hostname)
# Out-of-date boot CD: send 'newbootcd_notice' unless one is found within 3.5
# (unit defined by found_within -- presumably days, TODO confirm) and set the
# node's boot_state to 'disabled' through the PLC API.
749 if fbnode['observed_category'] == "OLDBOOTCD":
750 print "\t...Notify owner to update BootImage!!!"
752 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
753 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
755 print "\tDisabling %s due to out-of-date BootImage" % hostname
756 api.UpdateNode(hostname, {'boot_state' : 'disabled'})
758 # NOTE: nothing else is possible.
# Connect to the node.  A boolean result from getConnection() is treated as
# a failed connection.
# NOTE(review): isinstance(conn, bool) would be the idiomatic form of this test.
761 debugnode = DebugInterface(hostname)
762 conn = debugnode.getConnection()
763 if type(conn) == type(False): return "connect_failed"
# Only nodes reporting the "debug" boot state are processed.
765 boot_state = conn.get_boot_state()
766 if boot_state != "debug":
767 print "... %s in %s state: skipping..." % (hostname , boot_state)
768 return "skipped" #boot_state == "boot"
# Leave the node alone if BootManager is already running on it.
770 if conn.bootmanager_running():
771 print "...BootManager is currently running. Skipping host %s" %hostname
772 return "skipped" # True
774 # Read persistent flags, tagged on one week intervals.
# Fetch dmesg from the node and wrap the returned file object so it can be
# matched with expect().
776 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
777 dmesg = conn.get_dmesg()
778 child = fdpexpect.fdspawn(dmesg)
# Match dmesg against the patterns supplied by getDiskSteps().
780 steps = debugnode.getDiskSteps()
781 sequence = debugnode.getDiskSequence(steps, child)
# NOTE(review): the assignment of `s` and the condition guarding the drive
# error report below are not visible in this excerpt.
784 if config and not config.quiet: print "\tSET: ", s
787 print "...Potential drive errors on %s" % hostname
# A two-element result that includes 'floppyerror' is tolerated; the other
# path notifies the owners (unless a 'baddisk_notice' is found within 7) and
# gives up on the node with "skipping_baddisk".
788 if len(s) == 2 and 'floppyerror' in s:
789 print "...Should investigate. Continuing with node."
791 print "...Should investigate. Skipping node."
792 # TODO: send message related to these errors.
794 if not found_within(recent_actions, 'baddisk_notice', 7):
795 print "baddisk_notice not found recently"
# dmesg is fetched again so its full text can be attached to the notice.
797 log=conn.get_dmesg().read()
798 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
799 #conn.set_nodestate('disabled')
801 return "skipping_baddisk"
# Fetch the BootManager log and wrap it for expect()-style matching.
803 print "...Downloading bm.log from %s" %hostname
804 log = conn.get_bootmanager_log()
805 child = fdpexpect.fdspawn(log)
# With config.collect set, stop once the logs have been downloaded.
807 if hasattr(config, 'collect') and config.collect: return "collect"
809 if config and not config.quiet: print "...Scanning bm.log for errors"
# Reduce the log to a sequence of step ids and join them with "-" into a
# single sequence identifier.
813 steps = debugnode.getBootManagerStepPatterns()
814 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
816 s = "-".join(sequence)
817 print " FOUND SEQUENCE: ", s
819 # NOTE: We get or set the flag based on the current sequence identifier.
820 # By using the sequence identifier, we guarantee that there will be no
821 # frequent loops. I'm guessing there is a better way to track loops,
# Mapping from sequence identifier to the name of the action to take
# (indexed as sequences[s] below).
824 sequences = debugnode.getSequences()
# Unrecognised sequence: send 'unknownsequence_notice' with the BootManager
# log attached, then restart BootManager in 'boot' mode.
# NOTE(review): the creation of `args` is not visible in this excerpt.
827 if s not in sequences:
828 print " HOST %s" % hostname
829 print " UNKNOWN SEQUENCE: %s" % s
832 args['hostname'] = hostname
834 args['bmlog'] = conn.get_bootmanager_log().read()
835 args['viart'] = False
836 args['saveact'] = True
837 args['ccemail'] = True
839 sitehist.sendMessage('unknownsequence_notice', **args)
841 conn.restart_bootmanager('boot')
843 bootman_action = "restart_bootmanager"
845 # NOTE: Do not set the pflags value for this sequence if it's unknown.
846 # This way, we can check it again after we've fixed it.
# Recognised sequence: the action name becomes the return value and selects
# one of the branches below.
850 bootman_action = sequences[s]
# Restart BootManager, or the whole node, in 'boot' or 'reinstall' mode.
852 if sequences[s] == "restart_bootmanager_boot":
853 print "...Restarting BootManager.py on %s "%hostname
854 conn.restart_bootmanager('boot')
855 elif sequences[s] == "restart_bootmanager_rins":
856 print "...Restarting BootManager.py on %s "%hostname
857 conn.restart_bootmanager('reinstall')
858 elif sequences[s] == "restart_node_rins":
859 conn.restart_node('reinstall')
860 elif sequences[s] == "restart_node_boot":
861 conn.restart_node('boot')
862 elif sequences[s] == "fsck_repair":
863 conn.fsck_repair_node()
# Key repair: when compare_and_repair_nodekeys() reports success, BootManager
# is restarted with the node's current state; the failure path sends a
# rate-limited 'nodeconfig_notice' and calls dump_plconf_file().
864 elif sequences[s] == "repair_node_keys":
865 if conn.compare_and_repair_nodekeys():
866 # the keys either are in sync or were forced in sync.
867 # so try to start BM again.
868 conn.restart_bootmanager(conn.get_nodestate())
870 # there was some failure to synchronize the keys.
871 print "...Unable to repair node keys on %s" %hostname
872 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
874 args['hostname'] = hostname
875 sitehist.sendMessage('nodeconfig_notice', **args)
876 conn.dump_plconf_file()
878 # NOTE: do not add a new action record
# A sequence explicitly mapped to 'unknownsequence_notice' is handled like an
# unrecognised one: send the notice with the log, restart BootManager.
881 elif sequences[s] == "unknownsequence_notice":
883 args['hostname'] = hostname
885 args['bmlog'] = conn.get_bootmanager_log().read()
886 args['viart'] = False
887 args['saveact'] = True
888 args['ccemail'] = True
890 sitehist.sendMessage('unknownsequence_notice', **args)
891 conn.restart_bootmanager('boot')
# Node configuration problem: rate-limited notice plus dump_plconf_file().
893 elif sequences[s] == "nodeconfig_notice":
895 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
897 args['hostname'] = hostname
898 sitehist.sendMessage('nodeconfig_notice', **args)
899 conn.dump_plconf_file()
901 # NOTE: do not add a new action record
# NOTE(review): despite the action name, this branch checks for and sends
# 'nodeconfig_notice' (here with the BootManager log attached as 'bmlog').
904 elif sequences[s] == "nodenetwork_email":
906 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
908 args['hostname'] = hostname
909 args['bmlog'] = conn.get_bootmanager_log().read()
910 sitehist.sendMessage('nodeconfig_notice', **args)
911 conn.dump_plconf_file()
913 # NOTE: do not add a new action record
# No block device: rate-limited notice only.
916 elif sequences[s] == "noblockdevice_notice":
918 if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
920 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
921 args['hostname'] = hostname
923 sitehist.sendMessage('noblockdevice_notice', **args)
925 # NOTE: do not add a new action record
928 elif sequences[s] == "baddisk_notice":
929 # MAKE An ACTION record that this host has failed hardware. May
930 # require either an exception "/minhw" or other manual intervention.
931 # Definitely need to send out some more EMAIL.
932 # TODO: email notice of broken hardware
933 if not found_within(recent_actions, 'baddisk_notice', 7):
934 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
936 args['hostname'] = hostname
# The notice carries the full dmesg text under the 'log' key.
937 args['log'] = conn.get_dmesg().read()
939 sitehist.sendMessage('baddisk_notice', **args)
940 #conn.set_nodestate('disabled')
942 # NOTE: do not add a new action record
# Minimal hardware requirement failure: notice with the BootManager log.
945 elif sequences[s] == "minimalhardware_notice":
946 if not found_within(recent_actions, 'minimalhardware_notice', 7):
947 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
949 args['hostname'] = hostname
950 args['bmlog'] = conn.get_bootmanager_log().read()
951 sitehist.sendMessage('minimalhardware_notice', **args)
953 # NOTE: do not add a new action record
# DNS failure: the notice is rate-limited with a window of 1 and includes the
# node's network configuration.
956 elif sequences[s] == "baddns_notice":
957 if not found_within(recent_actions, 'baddns_notice', 1):
958 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
# Look up the node and take the first interface returned for its interface_ids.
961 node = plccache.GetNodeByName(hostname)
962 net = api.GetInterfaces(node['interface_ids'])[0]
# NOTE(review): traceback.print_exc() writes the traceback itself and returns
# None, so wrapping it in `print` emits an extra "None" line.
965 print traceback.print_exc()
966 # TODO: api error. skip email, b/c all info is not available,
967 # flag_set will not be recorded.
# Render the interface record as text for the message body.
969 nodenet_str = network_config_to_str(net)
971 args['hostname'] = hostname
972 args['network_config'] = nodenet_str
973 args['interface_id'] = net['interface_id']
975 sitehist.sendMessage('baddns_notice', **args)
977 # NOTE: do not add a new action record
# Report which action was selected (or "unknown" if none was).
980 return bootman_action
983 # MAIN -------------------------------------------------------------------
# Command-line driver: build the option parser, parse the arguments, then run
# restore() for each selected node.
# NOTE(review): the enclosing definition line and the loop/condition headers
# of this section are not visible in this excerpt (the embedded line numbers
# have gaps) -- confirm the exact structure against the full file.
986 from monitor import parser as parsermodule
987 parser = parsermodule.getParser()
# Defaults for the options registered below.
# NOTE(review): 'nonet' is registered below but is not listed among these
# defaults.
989 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
990 force=None, quiet=False)
991 parser.add_option("", "--child", dest="child", action="store_true",
992 help="This is the child mode of this process.")
993 parser.add_option("", "--force", dest="force", metavar="boot_state",
994 help="Force a boot state passed to BootManager.py.")
995 parser.add_option("", "--quiet", dest="quiet", action="store_true",
996 help="Extra quiet output messages.")
997 parser.add_option("", "--verbose", dest="verbose", action="store_true",
998 help="Extra debug output messages.")
999 parser.add_option("", "--nonet", dest="nonet", action="store_true",
1000 help="Do not setup the network, use existing log files to re-run a test pass.")
1001 parser.add_option("", "--collect", dest="collect", action="store_true",
1002 help="No action, just collect dmesg, and bm.log")
# NOTE(review): "orginary" in the help text below looks like a typo for
# "ordinary"; it is a runtime string and is left untouched here.
1003 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
1004 help="Do not perform the orginary setup phase.")
# Extend the parser with the shared 'nodesets' and 'defaults' option groups,
# then parse the command line.
1006 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
1007 config = parsermodule.parse_args(parser)
# The node list comes either from the file named by config.nodelist or from
# the single config.node (the selecting condition is not visible here).
1010 nodes = config.getListFromFile(config.nodelist)
1012 nodes = [ config.node ]
# Map the node's hostname to its site login base and obtain the site's
# history object.
1019 lb = plccache.plcdb_hn2lb[node]
1020 sitehist = SiteInterface.get_or_make(loginbase=lb)
1021 #reboot(node, config)
# NOTE(review): restore() is called with config=None and forced_action=None,
# so the options parsed above (--quiet, --collect, --force, ...) are not
# forwarded to it.
1022 restore(sitehist, node, config=None, forced_action=None)
1024 if __name__ == "__main__":