3 # Attempt to reboot a node in debug state.
16 from monitor.getsshkeys import SSHKnownHosts
18 from monitor.Rpyc import SocketConnection, Async
19 from monitor.Rpyc.Utils import *
21 from monitor import getconf
22 from monitor import config
23 from monitor import const
24 from monitor.model import *
25 from monitor.common import email_exception, found_within
26 from monitor.database.info.model import *
27 from monitor.database.info.interface import *
28 from monitor.wrapper import plc
29 from monitor.wrapper import plccache
30 from monitor.wrapper.emailTxt import mailtxt
31 from monitor.nodeconfig import network_config_to_str
33 from pcucontrol.util import command as moncommands
34 from pcucontrol.util.command import Sopen
35 from pcucontrol.transports.ssh import pxssh as pxssh
36 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
37 from pcucontrol.transports.ssh import pexpect as pexpect
# Shared, authenticated PLC XML-RPC handle used by every helper in this module.
41 api = plc.getAuthAPI()
class ExceptionDoubleSSHError(Exception):
    """Raised when an ssh/rsync attempt to a node fails twice in a row,
    even after refreshing the node's known_hosts entry."""
# Bind the Rpyc connection, node hostname, and monitor config onto the wrapper.
# NOTE(review): the attribute assignments are elided in this view of the file;
# attribute names (self.c, self.node, self.config) are presumed from later use.
48 def __init__(self, connection, node, config):
# Classify the node's boot state by probing its filesystem over Rpyc:
# /tmp/source implies the BootManager (debug/boot) environment, /vservers
# implies a fully booted node.  The return statements for each branch are
# elided in this view -- presumed 'debug'/'boot'/other; confirm in full file.
53 def get_boot_state(self):
55 if self.c.modules.os.path.exists('/tmp/source'):
57 elif self.c.modules.os.path.exists('/vservers'):
63 print self.c.modules.sys.path
# Fragment of get_dmesg() (its `def` line is elided in this view): capture the
# remote dmesg into a file, download it, archive a timestamped history copy
# locally, and reopen the current copy for scanning.
71 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
72 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
73 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
74 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
75 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
# Download /tmp/bm.log from the node, keep a timestamped history copy, and
# reopen the local "current" copy for scanning.  The trailing `return log`
# is elided in this view -- presumed, since callers do .read() on the result.
78 def get_bootmanager_log(self):
79 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
80 download(self.c, "/tmp/bm.log", "%s/history/%s-bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
81 os.system("cp %s/history/%s-bm.%s.log %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
82 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
86 # def get_dmesg(self):
87 # self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
88 # download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
89 # log = open("log/dmesg.%s.log" % self.node, 'r')
92 # def get_bootmanager_log(self):
93 # download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
94 # #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
95 # os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
96 # log = open("log/bm.%s.log" % self.node, 'r')
# Run the remote BootManager's initialization + config-reading steps and dump
# the resulting node configuration variables (bm.VARS) for inspection.
# NOTE(review): `c` is referenced bare below (c.modules...) while the rest of
# the class uses `self.c`; unless an elided line aliases `c = self.c`, this
# is a NameError -- confirm against the full file before relying on it.
99 def dump_plconf_file(self):
101 self.c.modules.sys.path.append("/tmp/source/")
102 self.c.modules.os.chdir('/tmp/source')
104 log = c.modules.BootManager.log('/tmp/new.log')
105 bm = c.modules.BootManager.BootManager(log,'boot')
107 BootManagerException = c.modules.Exceptions.BootManagerException
108 InitializeBootManager = c.modules.BootManager.InitializeBootManager
109 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
112 InitializeBootManager.Run(bm.VARS, bm.LOG)
113 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
117 print " Possibly, unable to find valid configuration file"
120 for key in bm.VARS.keys():
121 print key, " == ", bm.VARS[key]
123 print " Unable to read Node Configuration"
# Manually fsck the node's root and vserver LVM volumes, then re-run
# BootManager with the node's current PLC boot state.  The /tmp/BM_RUNNING
# sentinel prevents two overlapping repair runs.  The closing ") &" line of
# the shell command (original line 142) is elided in this view.
125 def fsck_repair_node(self):
127 self.c.modules.sys.path.append("/tmp/source/")
128 self.c.modules.os.chdir('/tmp/source')
130 # TODO: set boot state to node's actually boot state.
131 # could be 'boot' or 'safeboot'
132 self.c.modules.os.chdir('/tmp/source')
133 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
134 print "Running MANUAL FSCK already... try again soon."
136 print "Running MANUAL fsck on %s" % self.node
137 cmd = "( touch /tmp/BM_RUNNING ; " + \
138 " fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
139 " fsck -v -f -y /dev/planetlab/vserver >> out.fsck 2>&1 ; " + \
140 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
141 " rm -f /tmp/BM_RUNNING " + \
143 cmd = cmd % self.get_nodestate()
144 self.c.modules.os.system(cmd)
145 #self.restart_bootmanager('boot')
# Compare the node's on-disk NODE_KEY against PLC's record; when they differ,
# push the node's key up to PLC via api.UpdateNode.  Return value lines are
# elided in this view -- presumably truthy when keys end up in sync.
# NOTE(review): same bare `c` vs `self.c` inconsistency as dump_plconf_file;
# confirm an elided line aliases `c` before trusting this method.
148 def compare_and_repair_nodekeys(self):
150 self.c.modules.sys.path.append("/tmp/source/")
151 self.c.modules.os.chdir('/tmp/source')
153 log = c.modules.BootManager.log('/tmp/new.log')
154 bm = c.modules.BootManager.BootManager(log,'boot')
156 BootManagerException = c.modules.Exceptions.BootManagerException
157 InitializeBootManager = c.modules.BootManager.InitializeBootManager
158 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
161 plcnode = plccache.GetNodeByName(self.node)
163 InitializeBootManager.Run(bm.VARS, bm.LOG)
164 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
169 print " Possibly, unable to find valid configuration file"
172 print " NODE: %s" % bm.VARS['NODE_KEY']
173 print " PLC : %s" % plcnode['key']
175 if bm.VARS['NODE_KEY'] == plcnode['key']:
178 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
179 print " Successfully updated NODE_KEY with PLC"
184 #for key in bm.VARS.keys():
185 # print key, " == ", bm.VARS[key]
187 print " Unable to retrieve NODE_KEY"
# Report whether a BootManager run is in progress on the node, using the
# /tmp/BM_RUNNING sentinel file (the True/False returns are elided here).
189 def bootmanager_running(self):
190 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
195 def set_nodestate(self, state='boot'):
196 return api.UpdateNode(self.node, {'boot_state' : state})
# Fetch the node's current boot_state from PLC; on an API failure (the
# try:/except lines are elided in this view) log the traceback and fall back
# to the most recent cached FindbadNodeRecord value.
198 def get_nodestate(self):
200 return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
202 traceback.print_exc()
203 # NOTE: use last cached value from plc
204 fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
205 return fbnode['plc_node_stats']['boot_state']
# Set the PLC boot_state, then reboot the node.  First attempt within the
# persist-flag window is "gentle": kill all slice processes and schedule a
# normal shutdown -r.  A repeat attempt (gentlekill flag already set) falls
# through to a forced sysrq s/u/b (sync, unmount, reboot).  Several control
# flow lines are elided in this view.
208 def restart_node(self, state='boot'):
209 api.UpdateNode(self.node, {'boot_state' : state})
211 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
212 if not pflags.getRecentFlag('gentlekill'):
213 print " Killing all slice processes... : %s" % self.node
214 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
215 self.c.modules.os.system(cmd_slicekill)
216 cmd = """ shutdown -r +1 & """
217 print " Restarting %s : %s" % ( self.node, cmd)
218 self.c.modules.os.system(cmd)
220 pflags.setRecentFlag('gentlekill')
223 print " Restarting with sysrq 'sub' %s" % self.node
224 cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
225 self.c.modules.os.system(cmd)
# Launch BootManager.py on the node with the given forced state, unless a run
# is already in progress (BM_RUNNING sentinel).  The shell command backgrounds
# the run and clears the sentinel when done; its closing ") &" line (original
# line 239) is elided in this view.
229 def restart_bootmanager(self, forceState):
231 self.c.modules.os.chdir('/tmp/source')
232 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
233 print " BootManager is already running: try again soon..."
235 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
236 cmd = "( touch /tmp/BM_RUNNING ; " + \
237 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
238 " rm -f /tmp/BM_RUNNING " + \
240 cmd = cmd % forceState
241 self.c.modules.os.system(cmd)
# Wraps an ssh tunnel to a node plus a remote Rpyc forking server, so monitor
# code can execute on the node.  Many interior lines are elided in this view;
# comments below are hedged accordingly.
246 class PlanetLabSession:
# Class-level counter: each session takes the next local tunnel port.
247 globalport = 22000 + int(random.random()*1000)
249 def __init__(self, node, nosetup, verbose):
250 self.verbose = verbose
253 self.nosetup = nosetup
# Build a NodeConnection over the locally forwarded Rpyc socket (self.port).
257 def get_connection(self, config):
258 conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
261 # print i, conn.c.modules.sys.path
262 # print conn.c.modules.os.path.exists('/tmp/source')
# Copy the Rpyc package to the node, (re)start the remote forking server, and
# open a local ssh port-forward to it.  The rsync is retried once after
# refreshing the node's ssh host key; a second failure raises
# ExceptionDoubleSSHError.
267 def setup_host(self):
268 self.port = PlanetLabSession.globalport
269 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
272 args['port'] = self.port
273 args['user'] = 'root'
274 args['hostname'] = self.node
275 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
279 print "Skipping setup"
282 # COPY Rpyc files to host
283 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
284 cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
285 if self.verbose: print cmd
289 localos = moncommands.CMD()
291 ret = localos.system(cmd, timeout)
294 print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
295 #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
296 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
297 print "trying: ", cmd
298 print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
299 ret = localos.system(cmd, timeout)
302 print "\tFAILED TWICE"
303 email_exception("%s rsync failed twice" % self.node)
304 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
307 # KILL any already running servers.
308 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
309 (ov,ev) = ssh.run_noexcept2("""<<\EOF
311 echo "kill server" >> out.log
312 ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
313 echo "export" >> out.log
314 export PYTHONPATH=$HOME ;
315 echo "start server" >> out.log
316 python Rpyc/Servers/forking_server.py &> server.log &
317 echo "done" >> out.log
319 print "setup rpyc server over ssh"
323 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
324 # and the following options seems to work well.
325 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
326 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
327 """-o ConnectTimeout=120 """ + \
328 """-n -N -L %(port)s:localhost:18812 """ + \
329 """%(user)s@%(hostname)s"""
331 if self.verbose: print cmd
333 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
334 # TODO: the read() here may block indefinitely. Need a better
335 # approach therefore, that includes a timeout.
336 #ret = self.command.stdout.read(5)
337 ret = moncommands.read_t(self.command.stdout, 5)
341 # NOTE: There is still a slight race for machines that are slow...
342 self.timeout = 2*(t2-t1)
343 print "Sleeping for %s sec" % self.timeout
344 time.sleep(self.timeout)
347 if self.command.returncode is not None:
348 print "Failed to establish tunnel!"
349 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
351 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
# Teardown method (its `def` line is elided in this view): kill the ssh tunnel.
355 if self.verbose: print "Killing SSH session %s" % self.port
356 print "Killing SSH session %s" % self.port
def steps_to_list(steps, index=1):
    """Project one column from a list of (id, pattern) step tuples.

    By default returns the pattern column (index 1) as a list.  A list
    comprehension is used instead of map(): callers concatenate the result
    with another list (e.g. ``steps_to_list(steps) + [pexpect.EOF]``), which
    requires a real list -- map() only returns one on Python 2.
    """
    return [step[index] for step in steps]
# Translate a pexpect match index back into the step identifier steps[index][0].
# The out-of-range branch (presumably the EOF sentinel case, likely returning
# "done") is elided in this view -- confirm against the full file.
363 def index_to_id(steps,index):
364 if index < len(steps):
365 return steps[index][0]
# Drives the diagnose/repair workflow for a single node stuck in debug state:
# session setup, log scanning, and sequence-to-action dispatch.
369 class DebugInterface:
370 def __init__(self, hostname):
371 self.hostname = hostname
# Establish a PlanetLabSession to the host and return a NodeConnection over
# it.  Refreshes the host's known_hosts entry first, retries the connection
# once after a longer sleep, and reports failures by email.  Several
# try/except and return lines are elided in this view; failure paths
# presumably return False (callers test `type(conn) == type(False)`).
374 def getConnection(self):
375 print "Creating session for %s" % self.hostname
376 # update known_hosts file (in case the node has rebooted since last run)
378 k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
381 print traceback.print_exc()
384 msg = "ERROR setting up session for %s" % self.hostname
387 self.session = PlanetLabSession(self.hostname, False, True)
389 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
390 except ExceptionDoubleSSHError, e:
394 traceback.print_exc()
399 conn = self.session.get_connection(config)
401 # NOTE: sometimes the wait in setup_host() is not long enough.
402 # So, here we try to wait a little longer before giving up entirely.
404 time.sleep(self.session.timeout*5)
405 conn = self.session.get_connection(config)
407 # failed twice... no need to report this really, it's just in a
411 traceback.print_exc()
412 email_exception(self.hostname)
414 #print "trying to use conn before returning it."
415 #print conn.c.modules.sys.path
416 #print conn.c.modules.os.path.exists('/tmp/source')
419 #print "conn: %s" % conn
# Build and return the table mapping known BootManager step-sequence
# signatures (dash-joined step ids, as produced by
# getBootManagerSequenceFromLog) to repair-action names dispatched in
# restore_basic().  The `sequences = {}` initialization and the closing
# brackets of several literal lists are elided in this view.
422 def getSequences(self):
424 # TODO: This can be replaced with a DB definition at a future time.
425 # This would make it possible for an admin to introduce new
426 # patterns without touching code.
429 # restart_bootmanager_boot
430 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
431 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
432 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
434 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
436 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
437 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
438 "bminit-cfg-auth-getplc-update-debug-done",
439 "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
440 "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
441 "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
442 "bminit-cfg-auth-protoerror-exception-update-debug-done",
443 "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
444 "bminit-cfg-auth-getplc-implementerror-update-debug-done",
446 sequences.update({n : "restart_bootmanager_boot"})
448 # conn.restart_bootmanager('reinstall')
449 for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
450 "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
451 "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
452 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
453 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
454 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
455 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
456 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
457 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
458 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
459 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
460 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
461 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
462 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
463 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
464 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
465 "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
466 # actual solution appears to involve removing the bad files, and
467 # continually trying to boot the node.
468 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
469 "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
470 "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
471 "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
472 "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
473 "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
475 sequences.update({n : "restart_bootmanager_rins"})
478 for n in ["bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-validate-exception-done",
479 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done",
480 "bminit-cfg-auth-bootcheckfail-authfail-exception-update-debug-validate-exception-done",
481 "bminit-cfg-auth-bootcheckfail-authfail-exception-authfail-debug-validate-exception-done",
483 sequences.update({n: "repair_node_keys"})
485 # conn.restart_node('reinstall')
486 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
487 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
488 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
489 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
490 "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
491 "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
492 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
493 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
494 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
495 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
496 "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
497 "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
499 sequences.update({n : "restart_node_rins"})
502 for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
503 "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
504 "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
505 "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
506 "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
507 "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
508 "bminit-cfg-auth-getplc-exception-update-bootupdatefail-debug-done",
510 sequences.update({n: "restart_node_boot"})
513 for n in ["bminit-cfg-auth-getplc-update-installinit-validate-fsckabort-exception-fsckfail-bmexceptmount-exception-noinstall-update-debug-validate-fsckabort-exception-fsckfail-bmexceptmount-done",
514 "bminit-cfg-auth-getplc-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done",
515 "bminit-cfg-auth-getplc-update-installinit-validate-exception-fsckfail-exception-noinstall-update-debug-validate-exception-fsckfail-done"
517 sequences.update({n : "fsck_repair"})
519 # update_node_config_email
520 for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
521 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
522 "bminit-cfg-exception-update-bootupdatefail-nonode-debug-validate-exception-done",
523 "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-validate-exception-done",
524 "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
525 "bminit-cfg-exception-noconfig-update-debug-validate-exception-done",
527 sequences.update({n : "update_node_config_email"})
529 for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
530 "bminit-cfg-update-exception-nodehostname-update-debug-validate-exception-done",
531 "bminit-cfg-update-exception-nodehostname-update-debug-done",
533 sequences.update({n : "nodenetwork_email"})
535 # update_bootcd_email
536 for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
537 "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
538 "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
539 "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
540 "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
542 sequences.update({n : "update_bootcd_email"})
544 for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
546 sequences.update({n: "suspect_error_email"})
548 # update_hardware_email
549 sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
550 sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
552 # broken_hardware_email
553 sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
557 "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
558 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
560 sequences.update( { n : "bad_dns_email"})
# Return the (id, regex) pairs used to recognize disk/hardware error lines in
# a node's dmesg output.  The opening `steps = [` and closing `]`/`return`
# lines are elided in this view; the trailing comments are sample matching
# log lines kept for reference.
564 def getDiskSteps(self):
566 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
567 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
568 ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION byte \w+ = \w+'),
570 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
572 ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
573 ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
575 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
576 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
578 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
579 ('ext3error' , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
581 ('floppytimeout','floppy0: floppy timeout called'),
582 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
584 # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
585 # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
587 # floppy0: floppy timeout called
588 # end_request: I/O error, dev fd0, sector 0
590 # Buffer I/O error on device dm-2, logical block 8888896
591 # ata1: status=0x51 { DriveReady SeekComplete Error }
592 # ata1: error=0x40 { UncorrectableError }
593 # SCSI error : <0 0 0 0> return code = 0x8000002
594 # sda: Current: sense key: Medium Error
595 # Additional sense: Unrecovered read error - auto reallocate failed
597 # SCSI error : <0 2 0 0> return code = 0x40001
598 # end_request: I/O error, dev sda, sector 572489600
# Scan a dmesg stream (wrapped in a pexpect child) for the disk-error
# patterns, translating each match index into its step id.  The surrounding
# loop and result accumulation are elided in this view.
602 def getDiskSequence(self, steps, child):
605 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
# Return the (id, pattern) pairs used to tokenize a BootManager log into the
# step ids that make up a sequence signature (see getSequences).  The opening
# `steps = [` and closing `]`/`return` lines are elided in this view.
612 def getBootManagerStepPatterns(self):
614 ('bminit' , 'Initializing the BootManager.'),
615 ('cfg' , 'Reading node configuration file.'),
616 ('auth' , 'Authenticating node with PLC.'),
617 ('getplc' , 'Retrieving details of node from PLC.'),
618 ('update' , 'Updating node boot state at PLC.'),
619 ('hardware' , 'Checking if hardware requirements met.'),
620 ('installinit' , 'Install: Initializing.'),
621 ('installdisk' , 'Install: partitioning disks.'),
622 ('installbootfs', 'Install: bootstrapfs tarball.'),
623 ('installcfg' , 'Install: Writing configuration files.'),
624 ('installstop' , 'Install: Shutting down installer.'),
625 ('update2' , 'Updating node boot state at PLC.'),
626 ('installinit2' , 'Install: Initializing.'),
627 ('validate' , 'Validating node installation.'),
628 ('rebuildinitrd', 'Rebuilding initrd'),
629 ('netcfg' , 'Install: Writing Network Configuration files.'),
630 ('update3' , 'Updating node configuration.'),
631 ('disk' , 'Checking for unused disks to add to LVM.'),
632 ('update4' , 'Sending hardware configuration to PLC.'),
633 ('debug' , 'Starting debug mode'),
634 ('bmexceptmount', 'BootManagerException during mount'),
635 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
636 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
637 ('exception' , 'Exception'),
638 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
639 ('protoerror' , 'XML RPC protocol error'),
640 ('nodehostname' , 'Configured node hostname does not resolve'),
641 ('implementerror', 'Implementation Error'),
642 ('fsckabort' , 'is mounted. e2fsck: Cannot continue, aborting'),
643 ('fsckfail' , 'Running e2fsck -v -p /dev/planetlab/root failed'),
644 ('readonlyfs' , '\[Errno 30\] Read-only file system'),
645 ('baddisk' , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
646 ('noinstall' , 'notinstalled'),
647 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
648 ('noblockdev' , "No block devices detected."),
649 ('dnserror' , 'Name or service not known'),
650 ('noconfig' , "Unable to find and read a node configuration file"),
651 ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
652 ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
653 ('hardwarerequirefail' , 'Hardware requirements not met'),
654 ('mkfsfail' , 'while running: Running mkfs.ext2 -q -m 0 -j /dev/planetlab/vservers failed'),
655 ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
656 ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
657 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
658 ('modulefail' , 'Unable to get list of system modules'),
659 ('writeerror' , 'write error: No space left on device'),
660 ('nospace' , "No space left on device"),
661 ('nonode' , 'Failed to authenticate call: No such node'),
662 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
663 ('bootcheckfail' , 'BootCheckAuthentication'),
664 ('bootupdatefail' , 'BootUpdateNode'),
# Walk a bm.log stream with pexpect, mapping each matched step pattern to its
# id and accumulating the ids into the sequence list that getSequences keys
# on.  The loop header, accumulator, and termination/return lines are elided
# in this view.
668 def getBootManagerSequenceFromLog(self, steps, child):
672 index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
673 id = index_to_id(steps,index)
676 if id == "exception":
677 print "...Found An Exception!!!"
678 elif id == "done": #index == len(steps_to_list(steps)):
# Public entry point: delegate the diagnose/repair pass to restore_basic().
# Post-processing and the return statement are elided in this view.
684 def restore(sitehist, hostname, config=None, forced_action=None):
685 ret = restore_basic(sitehist, hostname, config, forced_action)
# Core repair pass for one debug-state node: bail out on old BootCDs, connect
# over the debug session, scan dmesg for disk errors, scan bm.log into a step
# sequence, then dispatch the matching repair action from getSequences()
# (restart BootManager/node, fsck, key repair, or an owner notification).
# Many control-flow lines (returns, else branches, args initializations) are
# elided in this view of the file.
689 def restore_basic(sitehist, hostname, config=None, forced_action=None):
691 # NOTE: Nothing works if the bootcd is REALLY old.
692 # So, this is the first step.
694 fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
695 recent_actions = sitehist.getRecentActions(hostname=hostname)
697 if fbnode['observed_category'] == "OLDBOOTCD":
698 print "\t...Notify owner to update BootImage!!!"
700 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
701 sitehist.sendMessage('newbootcd_notice', hostname=hostname)
703 print "\tDisabling %s due to out-of-date BootImage" % hostname
704 api.UpdateNode(hostname, {'boot_state' : 'disabled'})
706 # NOTE: nothing else is possible.
709 debugnode = DebugInterface(hostname)
710 conn = debugnode.getConnection()
# getConnection() returns False on failure (hence the type check).
711 if type(conn) == type(False): return False
713 boot_state = conn.get_boot_state()
714 if boot_state != "debug":
715 print "... %s in %s state: skipping..." % (hostname , boot_state)
716 return boot_state == "boot"
718 if conn.bootmanager_running():
719 print "...BootManager is currently running. Skipping host %s" %hostname
722 # Read persistent flags, tagged on one week intervals.
724 if config and not config.quiet: print "...downloading dmesg from %s" %hostname
725 dmesg = conn.get_dmesg()
726 child = fdpexpect.fdspawn(dmesg)
728 steps = debugnode.getDiskSteps()
729 sequence = debugnode.getDiskSequence(steps, child)
732 if config and not config.quiet: print "\tSET: ", s
735 print "...Potential drive errors on %s" % hostname
# A lone floppy error (plus one other id) is tolerated; anything more
# serious disables the node and notifies the owners.
736 if len(s) == 2 and 'floppyerror' in s:
737 print "...Should investigate. Continuing with node."
739 print "...Should investigate. Skipping node."
740 # TODO: send message related to these errors.
742 if not found_within(recent_actions, 'baddisk_notice', 7):
743 print "baddisk_notice not found recently"
745 log=conn.get_dmesg().read()
746 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
747 conn.set_nodestate('disabled')
751 print "...Downloading bm.log from %s" %hostname
752 log = conn.get_bootmanager_log()
753 child = fdpexpect.fdspawn(log)
755 if hasattr(config, 'collect') and config.collect: return True
757 if config and not config.quiet: print "...Scanning bm.log for errors"
761 steps = debugnode.getBootManagerStepPatterns()
762 sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
764 s = "-".join(sequence)
765 print " FOUND SEQUENCE: ", s
767 # NOTE: We get or set the flag based on the current sequence identifier.
768 # By using the sequence identifier, we guarantee that there will be no
769 # frequent loops. I'm guessing there is a better way to track loops,
772 sequences = debugnode.getSequences()
775 if s not in sequences:
776 print " HOST %s" % hostname
777 print " UNKNOWN SEQUENCE: %s" % s
780 args['hostname'] = hostname
782 args['bmlog'] = conn.get_bootmanager_log().read()
783 args['viart'] = False
784 args['saveact'] = True
785 args['ccemail'] = True
787 sitehist.sendMessage('unknownsequence_notice', **args)
789 conn.restart_bootmanager('boot')
791 # NOTE: Do not set the pflags value for this sequence if it's unknown.
792 # This way, we can check it again after we've fixed it.
# Dispatch on the action name that getSequences() mapped this signature to.
797 if sequences[s] == "restart_bootmanager_boot":
798 print "...Restarting BootManager.py on %s "%hostname
799 conn.restart_bootmanager('boot')
800 elif sequences[s] == "restart_bootmanager_rins":
801 print "...Restarting BootManager.py on %s "%hostname
802 conn.restart_bootmanager('reinstall')
803 elif sequences[s] == "restart_node_rins":
804 conn.restart_node('reinstall')
805 elif sequences[s] == "restart_node_boot":
806 conn.restart_node('boot')
807 elif sequences[s] == "fsck_repair":
808 conn.fsck_repair_node()
809 elif sequences[s] == "repair_node_keys":
810 if conn.compare_and_repair_nodekeys():
811 # the keys either are in sync or were forced in sync.
812 # so try to start BM again.
813 conn.restart_bootmanager(conn.get_nodestate())
816 # there was some failure to synchronize the keys.
817 print "...Unable to repair node keys on %s" %hostname
819 elif sequences[s] == "suspect_error_email":
821 args['hostname'] = hostname
823 args['bmlog'] = conn.get_bootmanager_log().read()
824 args['viart'] = False
825 args['saveact'] = True
826 args['ccemail'] = True
828 sitehist.sendMessage('unknownsequence_notice', **args)
829 conn.restart_bootmanager('boot')
831 # TODO: differentiate this and the 'nodenetwork_email' actions.
832 elif sequences[s] == "update_node_config_email":
834 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
836 args['hostname'] = hostname
837 sitehist.sendMessage('nodeconfig_notice', **args)
838 conn.dump_plconf_file()
840 elif sequences[s] == "nodenetwork_email":
842 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
844 args['hostname'] = hostname
845 args['bmlog'] = conn.get_bootmanager_log().read()
846 sitehist.sendMessage('nodeconfig_notice', **args)
847 conn.dump_plconf_file()
849 elif sequences[s] == "update_bootcd_email":
851 if not found_within(recent_actions, 'newalphacd_notice', 3.5):
853 args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
854 args['hostname'] = hostname
856 sitehist.sendMessage('newalphacd_notice', **args)
858 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
860 elif sequences[s] == "broken_hardware_email":
861 # MAKE An ACTION record that this host has failed hardware. May
862 # require either an exception "/minhw" or other manual intervention.
863 # Definitely need to send out some more EMAIL.
864 # TODO: email notice of broken hardware
865 if not found_within(recent_actions, 'baddisk_notice', 7):
866 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
868 args['hostname'] = hostname
869 args['log'] = conn.get_dmesg().read()
871 sitehist.sendMessage('baddisk_notice', **args)
872 conn.set_nodestate('disabled')
874 elif sequences[s] == "update_hardware_email":
875 if not found_within(recent_actions, 'minimalhardware_notice', 7):
876 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
878 args['hostname'] = hostname
879 args['bmlog'] = conn.get_bootmanager_log().read()
880 sitehist.sendMessage('minimalhardware_notice', **args)
882 elif sequences[s] == "bad_dns_email":
883 if not found_within(recent_actions, 'baddns_notice', 1):
884 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
887 node = plccache.GetNodeByName(hostname)
888 net = api.GetInterfaces(node['interface_ids'])[0]
891 print traceback.print_exc()
892 # TODO: api error. skip email, b/c all info is not available,
893 # flag_set will not be recorded.
895 nodenet_str = network_config_to_str(net)
897 args['hostname'] = hostname
898 args['network_config'] = nodenet_str
899 args['interface_id'] = net['interface_id']
901 sitehist.sendMessage('baddns_notice', **args)
906 # MAIN -------------------------------------------------------------------
# Command-line entry point: build the option parser, resolve the node list,
# and run restore() for each node under its site's SiteInterface.  Several
# lines (the per-node loop header, try/except, and the __main__ body) are
# elided in this view of the file.
909 from monitor import parser as parsermodule
910 parser = parsermodule.getParser()
912 parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
913 force=None, quiet=False)
914 parser.add_option("", "--child", dest="child", action="store_true",
915 help="This is the child mode of this process.")
916 parser.add_option("", "--force", dest="force", metavar="boot_state",
917 help="Force a boot state passed to BootManager.py.")
918 parser.add_option("", "--quiet", dest="quiet", action="store_true",
919 help="Extra quiet output messages.")
920 parser.add_option("", "--verbose", dest="verbose", action="store_true",
921 help="Extra debug output messages.")
922 parser.add_option("", "--nonet", dest="nonet", action="store_true",
923 help="Do not setup the network, use existing log files to re-run a test pass.")
924 parser.add_option("", "--collect", dest="collect", action="store_true",
925 help="No action, just collect dmesg, and bm.log")
926 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
927 help="Do not perform the orginary setup phase.")
929 parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
930 config = parsermodule.parse_args(parser)
933 nodes = config.getListFromFile(config.nodelist)
935 nodes = [ config.node ]
942 lb = plccache.plcdb_hn2lb[node]
943 sitehist = SiteInterface.get_or_make(loginbase=lb)
944 #reboot(node, config)
# NOTE(review): config=None discards the parsed options (quiet/collect/etc.);
# passing config=config looks intended -- confirm before changing.
945 restore(sitehist, node, config=None, forced_action=None)
947 if __name__ == "__main__":