3 # Attempt to reboot a node in debug state.
16 from monitor.getsshkeys import SSHKnownHosts
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
41 api = plc.getAuthAPI()
class ExceptionDoubleSSHError(Exception):
    """Raised after two consecutive SSH login failures -- i.e. the retry made
    after refreshing the node's known_hosts entry also failed."""
    pass
48 def __init__(self, connection, node, config):
53 def get_boot_state(self):
55 if self.c.modules.os.path.exists('/tmp/source'):
57 elif self.c.modules.os.path.exists('/vservers'):
63 print self.c.modules.sys.path
71 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
72 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
73 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
74 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
75 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
78 def get_bootmanager_log(self):
79 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
80 download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
81 os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
82 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
86 # def get_dmesg(self):
87 # self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
88 # download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
89 # log = open("log/dmesg.%s.log" % self.node, 'r')
92 # def get_bootmanager_log(self):
93 # download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
94 # #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
95 # os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
96 # log = open("log/bm.%s.log" % self.node, 'r')
99 def dump_plconf_file(self):
101 self.c.modules.sys.path.append("/tmp/source/")
102 self.c.modules.os.chdir('/tmp/source')
104 log = c.modules.BootManager.log('/tmp/new.log')
105 bm = c.modules.BootManager.BootManager(log,'boot')
107 BootManagerException = c.modules.Exceptions.BootManagerException
108 InitializeBootManager = c.modules.BootManager.InitializeBootManager
109 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
112 InitializeBootManager.Run(bm.VARS, bm.LOG)
113 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
117 print " Possibly, unable to find valid configuration file"
120 for key in bm.VARS.keys():
121 print key, " == ", bm.VARS[key]
123 print " Unable to read Node Configuration"
125 def fsck_repair_node(self):
127 self.c.modules.sys.path.append("/tmp/source/")
128 self.c.modules.os.chdir('/tmp/source')
# TODO: set boot state to node's actual boot state.
131 # could be 'boot' or 'safeboot'
132 self.c.modules.os.chdir('/tmp/source')
133 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
134 print "Running MANUAL FSCK already... try again soon."
136 print "Running MANUAL fsck on %s" % self.node
137 cmd = "( touch /tmp/BM_RUNNING ; " + \
138 " fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
139 " fsck -v -f -y /dev/planetlab/vserver >> out.fsck 2>&1 ; " + \
140 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
141 " rm -f /tmp/BM_RUNNING " + \
143 cmd = cmd % self.get_nodestate()
144 self.c.modules.os.system(cmd)
145 #self.restart_bootmanager('boot')
148 def compare_and_repair_nodekeys(self):
150 self.c.modules.sys.path.append("/tmp/source/")
151 self.c.modules.os.chdir('/tmp/source')
153 log = c.modules.BootManager.log('/tmp/new.log')
154 bm = c.modules.BootManager.BootManager(log,'boot')
156 BootManagerException = c.modules.Exceptions.BootManagerException
157 InitializeBootManager = c.modules.BootManager.InitializeBootManager
158 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
161 plcnode = plccache.GetNodeByName(self.node)
163 InitializeBootManager.Run(bm.VARS, bm.LOG)
164 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
169 print " Possibly, unable to find valid configuration file"
172 print " NODE: %s" % bm.VARS['NODE_KEY']
173 print " PLC : %s" % plcnode['key']
175 if bm.VARS['NODE_KEY'] == plcnode['key']:
178 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
179 print " Successfully updated NODE_KEY with PLC"
184 #for key in bm.VARS.keys():
185 # print key, " == ", bm.VARS[key]
187 print " Unable to retrieve NODE_KEY"
189 def bootmanager_running(self):
190 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
195 def set_nodestate(self, state='boot'):
196 return api.UpdateNode(self.node, {'boot_state' : state})
198 def get_nodestate(self):
200 return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
202 traceback.print_exc()
203 # NOTE: use last cached value from plc
204 fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
205 return fbnode['plc_node_stats']['boot_state']
208 def restart_node(self, state='boot'):
209 api.UpdateNode(self.node, {'boot_state' : state})
211 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
212 if not pflags.getRecentFlag('gentlekill'):
213 print " Killing all slice processes... : %s" % self.node
214 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
215 self.c.modules.os.system(cmd_slicekill)
216 cmd = """ shutdown -r +1 & """
217 print " Restarting %s : %s" % ( self.node, cmd)
218 self.c.modules.os.system(cmd)
220 pflags.setRecentFlag('gentlekill')
223 print " Restarting with sysrq 'sub' %s" % self.node
224 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
225 self.c.modules.os.system(cmd)
229 def restart_bootmanager(self, forceState):
231 self.c.modules.os.chdir('/tmp/source')
232 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
233 print " BootManager is already running: try again soon..."
235 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
236 cmd = "( touch /tmp/BM_RUNNING ; " + \
237 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
238 " rm -f /tmp/BM_RUNNING " + \
240 cmd = cmd % forceState
241 self.c.modules.os.system(cmd)
246 class PlanetLabSession:
247 globalport = 22000 + int(random.random()*1000)
249 def __init__(self, node, nosetup, verbose):
250 self.verbose = verbose
253 self.nosetup = nosetup
257 def get_connection(self, config):
258 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
261 # print i, conn.c.modules.sys.path
262 # print conn.c.modules.os.path.exists('/tmp/source')
267 def setup_host(self):
268 self.port = PlanetLabSession.globalport
269 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
272 args['port'] = self.port
273 args['user'] = 'root'
274 args['hostname'] = self.node
275 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
279 print "Skipping setup"
282 # COPY Rpyc files to host
283 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
284 cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
285 if self.verbose: print cmd
289 localos = moncommands.CMD()
291 ret = localos.system(cmd, timeout)
294 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
295 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
296 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
297 print "trying: ", cmd
298 print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
299 ret = localos.system(cmd, timeout)
302 print "\tFAILED TWICE"
304 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
307 # KILL any already running servers.
308 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
309 (ov,ev) = ssh.run_noexcept2("""<<\EOF
311 echo "kill server" >> out.log
312 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
313 echo "export" >> out.log
314 export PYTHONPATH=$HOME ;
315 echo "start server" >> out.log
316 python Rpyc/Servers/forking_server.py &> server.log &
317 echo "done" >> out.log
319 #cmd = """ssh %(user)s@%(hostname)s """ + \
320 # """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
322 #if self.verbose: print cmd
324 #print localos.system(cmd,timeout)
326 ## START a new rpyc server.
327 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
328 # """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
330 #if self.verbose: print cmd
331 #print localos.system(cmd,timeout)
332 print "setup rpyc server over ssh"
336 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
337 # and the following options seems to work well.
338 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
339 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
340 """-o ConnectTimeout=120 """ + \
341 """-n -N -L %(port)s:localhost:18812 """ + \
342 """%(user)s@%(hostname)s"""
344 if self.verbose: print cmd
346 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
347 # TODO: the read() here may block indefinitely. Need a better
348 # approach therefore, that includes a timeout.
349 #ret = self.command.stdout.read(5)
350 ret = moncommands.read_t(self.command.stdout, 5)
354 # NOTE: There is still a slight race for machines that are slow...
355 self.timeout = 2*(t2-t1)
356 print "Sleeping for %s sec" % self.timeout
357 time.sleep(self.timeout)
360 if self.command.returncode is not None:
361 print "Failed to establish tunnel!"
362 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
364 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
368 if self.verbose: print "Killing SSH session %s" % self.port
369 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Project one column out of a list of step tuples.

    steps is a sequence of (id, pattern) tuples; with the default index=1
    this returns the list of patterns, which callers pass to pexpect's
    expect() after concatenating with [pexpect.EOF].

    Returns an explicit list.  The original used map(), which is a list in
    Python 2 but a lazy iterator in Python 3; the comprehension keeps the
    Python 2 behavior (and the list + list concatenation done by callers)
    working either way.
    """
    return [step[index] for step in steps]
def index_to_id(steps,index):
    # Map an expect() match index back to the step identifier (element 0 of
    # the (id, pattern) tuple at that position).
    # NOTE(review): source lines appear elided here -- the branch taken when
    # index >= len(steps) (the pexpect.EOF case appended by callers) is not
    # visible in this view; presumably it returns a sentinel id such as
    # "done".  Confirm against the full source.
    if index < len(steps):
        return steps[index][0]
382 class DebugInterface:
def __init__(self, hostname):
    # Hostname of the node to debug; used by getConnection() when building
    # the PlanetLabSession and when updating known_hosts.
    # NOTE(review): intervening source lines are elided in this view --
    # __init__ may initialize additional attributes (e.g. self.session,
    # which getConnection() assigns later).  Confirm against full source.
    self.hostname = hostname
387 def getConnection(self):
388 print "Creating session for %s" % self.hostname
389 # update known_hosts file (in case the node has rebooted since last run)
391 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
394 print traceback.print_exc()
397 msg = "ERROR setting up session for %s" % self.hostname
400 self.session = PlanetLabSession(self.hostname, False, True)
402 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
403 except ExceptionDoubleSSHError, e:
407 traceback.print_exc()
412 conn = self.session.get_connection(config)
414 # NOTE: sometimes the wait in setup_host() is not long enough.
415 # So, here we try to wait a little longer before giving up entirely.
417 time.sleep(self.session.timeout*5)
418 conn = self.session.get_connection(config)
420 # failed twice... no need to report this really, it's just in a
424 traceback.print_exc()
425 email_exception(self.hostname)
427 #print "trying to use conn before returning it."
428 #print conn.c.modules.sys.path
429 #print conn.c.modules.os.path.exists('/tmp/source')
432 #print "conn: %s" % conn
435 def getSequences(self):
437 # TODO: This can be replaced with a DB definition at a future time.
438 # This would make it possible for an admin to introduce new
439 # patterns without touching code.
442 # restart_bootmanager_boot
443 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
444 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
445 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
447 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
449 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
450 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
451 "bminit-cfg-auth-getplc-update-debug-done",
452 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
453 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
454 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
455 "bminit-cfg-auth-protoerror-exception-update-debug-done",
456 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
457 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
459 sequences.update({n : "restart_bootmanager_boot"})
461 # conn.restart_bootmanager('reinstall')
462 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
463 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
464 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
465 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
466 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
467 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
468 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
469 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
470 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
471 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
472 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
473 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
474 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
475 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
476 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
477 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
478 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
479 # actual solution appears to involve removing the bad files, and
480 # continually trying to boot the node.
481 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
482 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
483 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
484 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
485 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
486 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
488 sequences.update({n : "restart_bootmanager_rins"})
491 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
492 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
493 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
495 sequences.update({n: "repair_node_keys"})
497 # conn.restart_node('reinstall')
498 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
499 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
500 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
501 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
502 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
503 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
504 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
505 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
506 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
507 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
508 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
509 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
511 sequences.update({n : "restart_node_rins"})
514 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
515 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
516 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
517 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
518 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
519 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
520 "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
522 sequences.update({n: "restart_node_boot"})
525 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
526 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
527 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done"
529 sequences.update({n : "fsck_repair"})
531 # update_node_config_email
532 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
533 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
534 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
535 "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
536 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
537 "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
539 sequences.update({n : "update_node_config_email"})
541 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
542 "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
543 "bminit-cfg-update-exception-nodehostname-update-debug-done",
545 sequences.update({n : "nodenetwork_email"})
547 # update_bootcd_email
548 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
549 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
550 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
551 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
552 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
554 sequences.update({n : "update_bootcd_email"})
556 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
558 sequences.update({n: "suspect_error_email"})
560 # update_hardware_email
561 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
562 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
564 # broken_hardware_email
565 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
569 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
570 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
572 sequences.update( { n : "bad_dns_email"})
576 def getDiskSteps(self):
578 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
579 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
580 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
582 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
584 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
585 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
587 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
588 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
590 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
591 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
593 ('floppytimeout','floppy0: floppy timeout called'),
594 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
596 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
597 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
599 # floppy0: floppy timeout called
600 # end_request: I/O error, dev fd0, sector 0
602 # Buffer I/O error on device dm-2, logical block 8888896
603 # ata1: status=0x51 { DriveReady SeekComplete Error }
604 # ata1: error=0x40 { UncorrectableError }
605 # SCSI error : <0 0 0 0> return code = 0x8000002
606 # sda: Current: sense key: Medium Error
607 # Additional sense: Unrecovered read error - auto reallocate failed
609 # SCSI error : <0 2 0 0> return code = 0x40001
610 # end_request: I/O error, dev sda, sector 572489600
614 def getDiskSequence(self, steps, child):
617 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
624 def getBootManagerStepPatterns(self):
626 ('bminit' , 'Initializing the BootManager.'),
627 ('cfg' , 'Reading node configuration file.'),
628 ('auth' , 'Authenticating node with PLC.'),
629 ('getplc' , 'Retrieving details of node from PLC.'),
630 ('update' , 'Updating node boot state at PLC.'),
631 ('hardware' , 'Checking if hardware requirements met.'),
632 ('installinit' , 'Install: Initializing.'),
633 ('installdisk' , 'Install: partitioning disks.'),
634 ('installbootfs', 'Install: bootstrapfs tarball.'),
635 ('installcfg' , 'Install: Writing configuration files.'),
636 ('installstop' , 'Install: Shutting down installer.'),
637 ('update2' , 'Updating node boot state at PLC.'),
638 ('installinit2' , 'Install: Initializing.'),
639 ('validate' , 'Validating node installation.'),
640 ('rebuildinitrd', 'Rebuilding initrd'),
641 ('netcfg' , 'Install: Writing Network Configuration files.'),
642 ('update3' , 'Updating node configuration.'),
643 ('disk' , 'Checking for unused disks to add to LVM.'),
644 ('update4' , 'Sending hardware configuration to PLC.'),
645 ('debug' , 'Starting debug mode'),
646 ('bmexceptmount', 'BootManagerException during mount'),
647 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
648 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
649 ('exception' , 'Exception'),
650 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
651 ('protoerror' , 'XML RPC protocol error'),
652 ('nodehostname' , 'Configured node hostname does not resolve'),
653 ('implementerror', 'Implementation Error'),
654 ('fsckabort' , 'is mounted. e2fsck: Cannot continue, aborting'),
655 ('fsckfail' , 'Running e2fsck -v -p /dev/planetlab/root failed'),
656 ('readonlyfs' , '\[Errno 30\] Read-only file system'),
657 ('baddisk' , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
658 ('noinstall' , 'notinstalled'),
659 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
660 ('noblockdev' , "No block devices detected."),
661 ('dnserror' , 'Name or service not known'),
662 ('noconfig' , "Unable to find and read a node configuration file"),
663 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
664 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
665 ('hardwarerequirefail' , 'Hardware requirements not met'),
666 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
667 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
668 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
669 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
670 ('modulefail' , 'Unable to get list of system modules'),
671 ('writeerror' , 'write error: No space left on device'),
672 ('nospace' , "No space left on device"),
673 ('nonode' , 'Failed to authenticate call: No such node'),
674 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
675 ('bootcheckfail' , 'BootCheckAuthentication'),
676 ('bootupdatefail' , 'BootUpdateNode'),
680 def getBootManagerSequenceFromLog(self, steps, child):
684 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
685 id = index_to_id(steps,index)
688 if id == "exception":
689 print "...Found An Exception!!!"
690 elif id == "done": #index == len(steps_to_list(steps)):
696 def restore(sitehist, hostname, config=None, forced_action=None):
697 ret = restore_basic(sitehist, hostname, config, forced_action)
701 def restore_basic(sitehist, hostname, config=None, forced_action=None):
703 # NOTE: Nothing works if the bootcd is REALLY old.
704 # So, this is the first step.
706 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
707 recent_actions = sitehist.getRecentActions(hostname=hostname)
709 if fbnode['observed_category'] == "OLDBOOTCD":
710 print "\t...Notify owner to update BootImage!!!"
712 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
713 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
715 print "\tDisabling %s due to out-of-date BootImage" % hostname
716 api.UpdateNode(hostname, {'boot_state' : 'disabled'})
718 # NOTE: nothing else is possible.
721 debugnode = DebugInterface(hostname)
722 conn = debugnode.getConnection()
723 if type(conn) == type(False): return False
725 boot_state = conn.get_boot_state()
726 if boot_state != "debug":
727 print "... %s in %s state: skipping..." % (hostname , boot_state)
728 return boot_state == "boot"
730 if conn.bootmanager_running():
731 print "...BootManager is currently running. Skipping host %s" %hostname
734 # Read persistent flags, tagged on one week intervals.
736 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
737 dmesg = conn.get_dmesg()
738 child = fdpexpect.fdspawn(dmesg)
740 steps = debugnode.getDiskSteps()
741 sequence = debugnode.getDiskSequence(steps, child)
744 if config and not config.quiet: print "\tSET: ", s
747 print "...Potential drive errors on %s" % hostname
748 if len(s) == 2 and 'floppyerror' in s:
749 print "...Should investigate. Continuing with node."
751 print "...Should investigate. Skipping node."
752 # TODO: send message related to these errors.
754 if not found_within(recent_actions, 'baddisk_notice', 7):
755 print "baddisk_notice not found recently"
757 log=conn.get_dmesg().read()
758 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
759 conn.set_nodestate('disabled')
763 print "...Downloading bm.log from %s" %hostname
764 log = conn.get_bootmanager_log()
765 child = fdpexpect.fdspawn(log)
767 if hasattr(config, 'collect') and config.collect: return True
769 if config and not config.quiet: print "...Scanning bm.log for errors"
773 steps = debugnode.getBootManagerStepPatterns()
# Remediation dispatch for a node stuck in debug state (tail of the enclosing
# handler; its def and earlier setup are above this extract).  A "sequence"
# identifier is derived from the BootManager log, then mapped to a recovery
# action: restart BootManager, reinstall, repair keys/fsck, or notify owners.
774 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
# Join the observed step names into a single dash-separated id used as the key
# into the known-sequence table below.
776 s = "-".join(sequence)
777 print " FOUND SEQUENCE: ", s
779 # NOTE: We get or set the flag based on the current sequence identifier.
780 # By using the sequence identifier, we guarantee that there will be no
781 # frequent loops. I'm guessing there is a better way to track loops,
784 sequences = debugnode.getSequences()
# Unknown sequence: email the support/owner list with the BootManager log and
# optimistically restart BootManager in 'boot' mode.
787 if s not in sequences:
788 print " HOST %s" % hostname
789 print " UNKNOWN SEQUENCE: %s" % s
792 args['hostname'] = hostname
794 args['bmlog'] = conn.get_bootmanager_log().read()
795 args['viart'] = False
796 args['saveact'] = True
797 args['ccemail'] = True
799 sitehist.sendMessage('unknownsequence_notice', **args)
801 conn.restart_bootmanager('boot')
803 # NOTE: Do not set the pflags value for this sequence if it's unknown.
804 # This way, we can check it again after we've fixed it.
# Known sequence: dispatch on the action name registered for this sequence id.
809 if sequences[s] == "restart_bootmanager_boot":
810 print "...Restarting BootManager.py on %s "%hostname
811 conn.restart_bootmanager('boot')
812 elif sequences[s] == "restart_bootmanager_rins":
813 print "...Restarting BootManager.py on %s "%hostname
814 conn.restart_bootmanager('reinstall')
815 elif sequences[s] == "restart_node_rins":
816 conn.restart_node('reinstall')
817 elif sequences[s] == "restart_node_boot":
818 conn.restart_node('boot')
819 elif sequences[s] == "fsck_repair":
820 conn.fsck_repair_node()
# Node identity/key mismatch: try to re-sync keys, then restart BootManager in
# whatever boot state PLC currently records for the node.
821 elif sequences[s] == "repair_node_keys":
822 if conn.compare_and_repair_nodekeys():
823 # the keys either are in sync or were forced in sync.
824 # so try to start BM again.
825 conn.restart_bootmanager(conn.get_nodestate())
828 # there was some failure to synchronize the keys.
829 print "...Unable to repair node keys on %s" %hostname
# NOTE(review): this branch reuses the 'unknownsequence_notice' template and
# duplicates the unknown-sequence handling above — confirm a dedicated
# suspect-error message was not intended here.
831 elif sequences[s] == "suspect_error_email":
833 args['hostname'] = hostname
835 args['bmlog'] = conn.get_bootmanager_log().read()
836 args['viart'] = False
837 args['saveact'] = True
838 args['ccemail'] = True
840 sitehist.sendMessage('unknownsequence_notice', **args)
841 conn.restart_bootmanager('boot')
843 # TODO: differentiate this and the 'nodenetwork_email' actions.
# Config problems: mail the owners (rate-limited via found_within over recent
# actions, window in days) and dump the node's plnode.txt config for the record.
844 elif sequences[s] == "update_node_config_email":
846 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
848 args['hostname'] = hostname
849 sitehist.sendMessage('nodeconfig_notice', **args)
850 conn.dump_plconf_file()
852 elif sequences[s] == "nodenetwork_email":
854 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
856 args['hostname'] = hostname
857 args['bmlog'] = conn.get_bootmanager_log().read()
858 sitehist.sendMessage('nodeconfig_notice', **args)
859 conn.dump_plconf_file()
# Out-of-date BootCD: generate fresh boot images for the user and notify.
861 elif sequences[s] == "update_bootcd_email":
863 if not found_within(recent_actions, 'newalphacd_notice', 3.5):
865 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
866 args['hostname'] = hostname
868 sitehist.sendMessage('newalphacd_notice', **args)
870 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
872 elif sequences[s] == "broken_hardware_email":
873 # MAKE An ACTION record that this host has failed hardware. May
874 # require either an exception "/minhw" or other manual intervention.
875 # Definitely need to send out some more EMAIL.
876 # TODO: email notice of broken hardware
877 if not found_within(recent_actions, 'baddisk_notice', 7):
878 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
880 args['hostname'] = hostname
# Attach the node's dmesg output as evidence of the disk/hardware failure.
881 args['log'] = conn.get_dmesg().read()
883 sitehist.sendMessage('baddisk_notice', **args)
884 conn.set_nodestate('disabled')
886 elif sequences[s] == "update_hardware_email":
887 if not found_within(recent_actions, 'minimalhardware_notice', 7):
888 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
890 args['hostname'] = hostname
891 args['bmlog'] = conn.get_bootmanager_log().read()
892 sitehist.sendMessage('minimalhardware_notice', **args)
# DNS failure: look up the node's primary interface via the PLC API and mail
# the owners its expected network configuration.
894 elif sequences[s] == "bad_dns_email":
895 if not found_within(recent_actions, 'baddns_notice', 1):
896 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
899 node = plccache.GetNodeByName(hostname)
900 net = api.GetInterfaces(node['interface_ids'])[0]
# NOTE(review): traceback.print_exc() writes the traceback itself and returns
# None, so this statement also prints a stray "None" line; plain
# `traceback.print_exc()` would suffice.  (The enclosing try/except lines are
# not visible in this extract.)
903 print traceback.print_exc()
904 # TODO: api error. skip email, b/c all info is not available,
905 # flag_set will not be recorded.
907 nodenet_str = network_config_to_str(net)
909 args['hostname'] = hostname
910 args['network_config'] = nodenet_str
911 args['interface_id'] = net['interface_id']
913 sitehist.sendMessage('baddns_notice', **args)
918 # MAIN -------------------------------------------------------------------
# Entry-point body: build the command-line option parser, derive the node
# list, and run restore() on each node.  (The enclosing def / __main__ framing
# and the if/elif/for header lines fall outside this extract.)
921 from monitor import parser as parsermodule
922 parser = parsermodule.getParser()
# Defaults cover child/collect/nosetup/verbose/force/quiet.
# NOTE(review): --nonet is declared below but gets no default here — confirm
# parse_args supplies one, otherwise `config.nonet` is unset when the flag is
# not passed.
924 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
925 force=None, quiet=False)
926 parser.add_option("", "--child", dest="child", action="store_true",
927 help="This is the child mode of this process.")
928 parser.add_option("", "--force", dest="force", metavar="boot_state",
929 help="Force a boot state passed to BootManager.py.")
930 parser.add_option("", "--quiet", dest="quiet", action="store_true",
931 help="Extra quiet output messages.")
932 parser.add_option("", "--verbose", dest="verbose", action="store_true",
933 help="Extra debug output messages.")
934 parser.add_option("", "--nonet", dest="nonet", action="store_true",
935 help="Do not setup the network, use existing log files to re-run a test pass.")
936 parser.add_option("", "--collect", dest="collect", action="store_true",
937 help="No action, just collect dmesg, and bm.log")
# NOTE(review): "orginary" in the help string below is a typo for "ordinary"
# (user-visible text; left untouched in this documentation-only pass).
938 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
939 help="Do not perform the orginary setup phase.")
# Layer the shared 'nodesets'/'defaults' option groups on top of this parser.
941 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
# NOTE(review): this local `config` shadows the `monitor.config` module
# imported at the top of the file for the rest of this scope.
942 config = parsermodule.parse_args(parser)
# Node list comes either from a --nodelist file or a single --node argument
# (the selecting if/elif lines are not visible in this extract).
945 nodes = config.getListFromFile(config.nodelist)
947 nodes = [ config.node ]
# Per-node body of the processing loop (its `for node in nodes:` header is not
# visible here): resolve the site by loginbase and attempt a restore.
954 lb = plccache.plcdb_hn2lb[node]
955 sitehist = SiteInterface.get_or_make(loginbase=lb)
956 #reboot(node, config)
# NOTE(review): the parsed options are deliberately not forwarded —
# restore() is called with config=None; confirm this is intentional.
957 restore(sitehist, node, config=None, forced_action=None)
959 if __name__ == "__main__":