3 # Attempt to reboot a node in debug state.
8 api = plc.PLC(auth.auth, auth.plc)
13 from getsshkeys import SSHKnownHosts
20 import ssh.pxssh as pxssh
21 import ssh.fdpexpect as fdpexpect
22 import ssh.pexpect as pexpect
class Sopen(subprocess.Popen):
    """A ``subprocess.Popen`` whose ``kill`` delivers a caller-chosen signal.

    The default signal is ``SIGTERM``, so the child gets a chance to shut
    down cleanly.
    """

    def kill(self, signal = signal.SIGTERM):
        """Send ``signal`` to the child process if it is still running.

        Args:
            signal: Signal number to deliver.  The parameter keeps its
                original name (it shadows the ``signal`` module inside this
                method only) so keyword callers continue to work.

        Returns:
            None.  The call is a no-op when the child has already exited.
        """
        # poll() reaps the child if it has finished and records returncode.
        # Once the child is reaped its PID may be reused by an unrelated
        # process, so signalling it blindly is unsafe; it would also raise
        # OSError (ESRCH) on a second kill() of the same child.
        if self.poll() is not None:
            return
        os.kill(self.pid, signal)
31 #from Rpyc import SocketConnection, Async
32 from Rpyc import SocketConnection, Async
33 from Rpyc.Utils import *
37 def __init__(self, connection, node, config):
# Classify the node's current state by probing marker paths through the
# remote `os` module reachable as self.c.modules.os (self.c is presumably
# the connection handed to __init__ -- the assignment is not visible here).
# NOTE(review): the value returned by each branch is not visible in this
# view; reboot() compares the result against "boot" and "unknown", so
# confirm the remaining values against the full file.
42 def get_boot_state(self):
# First probe: does /tmp/source exist on the node?
43 if self.c.modules.os.path.exists('/tmp/source'):
# Second probe: does /vservers exist on the node?
45 elif self.c.modules.os.path.exists('/vservers'):
51 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
52 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
53 log = open("log/dmesg.%s.log" % self.node, 'r')
# Fetch /tmp/bm.log via download() and open a decompressed local copy for
# reading.  reboot() hands the result of this method to
# fdpexpect.fdspawn(); the return statement itself is not visible here.
56 def get_bootmanager_log(self):
# Store the fetched file as log/bm.<node>.log.gz (presumably remote -> local;
# download() comes from Rpyc.Utils -- confirm the argument order there).
57 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
# Decompress with a local shell command.
# NOTE(review): self.node is interpolated into the shell string unquoted and
# the exit status of os.system() is discarded, so a failed zcat is not
# detected at this point.
58 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
59 log = open("log/bm.%s.log" % self.node, 'r')
# Run the first BootManager steps (InitializeBootManager, then
# ReadNodeConfiguration) through the remote modules and, unless quiet,
# print every entry of the resulting bm.VARS mapping.
# NOTE(review): `c` and `bm_continue` are used below but are bound on lines
# that are not visible in this view (presumably `c = self.c` and a flag set
# around the try/except) -- confirm against the full file.
62 def dump_plconf_file(self):
# Make the BootManager sources under /tmp/source importable and current.
64 c.modules.sys.path.append("/tmp/source/")
65 c.modules.os.chdir('/tmp/source')
# BootManager instance created in 'boot' mode, logging to /tmp/new.log.
67 log = c.modules.BootManager.log('/tmp/new.log')
68 bm = c.modules.BootManager.BootManager(log,'boot')
# Local aliases for the remote exception class and step modules.
70 BootManagerException = c.modules.Exceptions.BootManagerException
71 InitializeBootManager = c.modules.BootManager.InitializeBootManager
72 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
75 InitializeBootManager.Run(bm.VARS, bm.LOG)
# The except clause paired with this try is not visible in this view.
76 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
80 print " Possibly, unable to find valid configuration file"
# Dump the variables only when a config object exists and is not quiet.
82 if bm_continue and self.config and not self.config.quiet:
83 for key in bm.VARS.keys():
84 print key, " == ", bm.VARS[key]
86 if self.config and not self.config.quiet: print " Unable to read Node Configuration"
# Compare the NODE_KEY that BootManager reads on the node with the 'key'
# field PLC holds for the same hostname; when they differ, push the node's
# key to PLC with api.UpdateNode().  reboot() uses the truthiness of the
# result to decide whether to restart BootManager; the return statements
# are not visible in this view.
# NOTE(review): `c` is used below but bound on a line not visible here.
89 def compare_and_repair_nodekeys(self):
# Make the BootManager sources under /tmp/source importable and current.
91 c.modules.sys.path.append("/tmp/source/")
92 c.modules.os.chdir('/tmp/source')
94 log = c.modules.BootManager.log('/tmp/new.log')
95 bm = c.modules.BootManager.BootManager(log,'boot')
97 BootManagerException = c.modules.Exceptions.BootManagerException
98 InitializeBootManager = c.modules.BootManager.InitializeBootManager
99 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
# PLC's record for this node; [0] raises IndexError if PLC returns no match.
102 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
104 InitializeBootManager.Run(bm.VARS, bm.LOG)
# The except clause paired with this try is not visible in this view.
105 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
# NOTE(review): these two lines read a bare `config`, while the sibling
# dump_plconf_file uses `self.config` with a None guard.  No `config` is
# bound in the visible part of this method, so unless a module-level
# `config` exists this path raises NameError -- confirm and align.
108 if not config.quiet: print "exception"
109 if not config.quiet: print x
110 print " Possibly, unable to find valid configuration file"
113 print " NODE: %s" % bm.VARS['NODE_KEY']
114 print " PLC : %s" % plcnode['key']
# Keys already agree.
116 if bm.VARS['NODE_KEY'] == plcnode['key']:
# Keys differ: overwrite PLC's copy with the key found on the node.
119 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
120 print " Successfully updated NODE_KEY with PLC"
125 #for key in bm.VARS.keys():
126 # print key, " == ", bm.VARS[key]
128 print " Unable to retrieve NODE_KEY"
# Report whether the marker file /tmp/BM_RUNNING exists on the node; the
# same file is created and removed by the command built in
# restart_bootmanager().  The return statements are not visible in this
# view; reboot() uses the result as a boolean.
130 def bootmanager_running(self):
131 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
def restart_node(self, state='boot'):
    """Set the node's boot_state at PLC, kill its slices and reboot it.

    Args:
        state: Value stored in the node's ``boot_state`` field through
            ``api.UpdateNode`` before the reboot is scheduled.

    Both shell commands are executed on the node through the remote ``os``
    module (``self.c.modules.os.system``).
    """
    api.UpdateNode(self.node, {'boot_state': state})

    # vkill every context listed under /proc/virtual with signal 9.
    print(" Killing all slice processes... : %s" % self.node)
    kill_slices = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
    self.c.modules.os.system(kill_slices)

    # Backgrounded so the remote call returns before the one-minute
    # shutdown timer fires.
    reboot_cmd = """ shutdown -r +1 & """
    print(" Restarting %s : %s" % (self.node, reboot_cmd))
    self.c.modules.os.system(reboot_cmd)
# Start BootManager.py on the node with `forceState` as its argument,
# unless the marker file /tmp/BM_RUNNING shows a run is already in
# progress.  The started command creates the marker before BootManager
# runs and removes it afterwards.
148 def restart_bootmanager(self, forceState):
150 self.c.modules.os.chdir('/tmp/source')
# Guard against launching a second BootManager concurrently (the statement
# that leaves this branch is not visible in this view).
151 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
152 print " BootManager is already running: try again soon..."
154 print " Starting 'BootManager.py %s' on %s " % (forceState, self.node)
# Shell command assembled from string pieces; the final piece that closes
# the "( ..." group is on a line not visible in this view.
155 cmd = "( touch /tmp/BM_RUNNING ; " + \
156 " python ./BootManager.py %s &> server.log < /dev/null ; " + \
157 " rm -f /tmp/BM_RUNNING " + \
# NOTE(review): forceState is substituted into the shell string unquoted.
159 cmd = cmd % forceState
160 self.c.modules.os.system(cmd)
165 class PlanetLabSession:
# Store the session options.  `verbose` gates the echoing of each shell
# command in setup_host(); the use of `nosetup` and the assignment of
# self.node happen on lines not visible in this view.
168 def __init__(self, node, nosetup, verbose):
169 self.verbose = verbose
172 self.nosetup = nosetup
def get_connection(self, config):
    """Return a NodeConnection for this session's node.

    Args:
        config: Passed through unchanged as the NodeConnection's config.

    The underlying SocketConnection is opened to ``localhost`` on
    ``self.port``, the local port this session was assigned in
    ``setup_host``.
    """
    channel = SocketConnection("localhost", self.port)
    return NodeConnection(channel, self.node, config)
# Prepare the node for an Rpyc session: reserve a local port, build the
# rsync/ssh command lines, and open an ssh tunnel from that local port to
# port 18812 on the node.  Raises Exception when the tunnel does not report
# "READY".
# NOTE(review): the statements that actually execute the rsync / kill /
# start commands, and the creation of `args`, are on lines not visible in
# this view.
179 def setup_host(self):
# Each session takes the next value of the class-wide port counter.
180 self.port = PlanetLabSession.globalport
181 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
# Substitution values for the %(name)s placeholders in the commands below.
184 args['port'] = self.port
185 args['user'] = 'root'
186 args['hostname'] = self.node
# NOTE(review): hard-coded, user-specific checkout location.
187 args['monitordir'] = "/home/soltesz/monitor"
190 print "Skipping setup"
193 # COPY Rpyc files to host
194 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
195 if self.verbose: print cmd
198 print "UNKNOWN SSH KEY FOR %s" % self.node
199 print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
# Record the node's host key directly in known_hosts and write it out.
200 k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
206 #cmd = "rsync -qv -az -e ssh %(monitordir)s/BootManager.py
207 # %(monitordir)s/ChainBoot.py %(user)s@%(hostname)s:/tmp/source" % args
208 #print cmd; os.system(cmd)
210 # KILL any already running servers.
211 cmd = """ssh %(user)s@%(hostname)s """ + \
212 """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
214 if self.verbose: print cmd
217 # START a new rpyc server.
218 cmd = """ssh %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
219 """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
221 if self.verbose: print cmd
224 # This was tricky to make synchronous. The combination of ssh-clients-4.7p1,
225 # and the following options seems to work well.
# LocalCommand makes ssh print READY on its stdout; the read(5) below blocks
# on that output, which is how this method waits for the tunnel.
226 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
227 """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
228 """-o ConnectTimeout=120 """ + \
229 """-n -N -L %(port)s:localhost:18812 """ + \
230 """%(user)s@%(hostname)s"""
232 if self.verbose: print cmd
# The ssh process is kept on self.command so it can be killed later.
233 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
# Exactly five bytes: the length of "READY".
234 ret = self.command.stdout.read(5)
236 # We can return without delay.
# returncode set: ssh has already exited, so the tunnel never came up.
240 if self.command.returncode is not None:
241 print "Failed to establish tunnel!"
242 raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
244 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
248 if self.verbose: print "Killing SSH session %s" % self.port
def steps_to_list(steps):
    """Return the label of every step, in order.

    Args:
        steps: Iterable of ``(id, label)`` pairs.

    Returns:
        A new list holding the second element of each pair.  Callers
        concatenate it with ``[pexpect.EOF]`` and pass it to ``expect()``,
        so the result must be a real list.
    """
    # Built in a single expression so the result is always bound and
    # returned; the throwaway name avoids shadowing the builtin `id`.
    return [label for (_step_id, label) in steps]
# Map the index returned by expect() back to the id of the matching
# (id, label) pair in `steps`.
# NOTE(review): the branch taken when index >= len(steps) is not visible in
# this view.  Callers append pexpect.EOF after the step labels, and every
# sequence string matched in reboot() ends in "done", so that branch
# presumably yields "done" -- confirm against the full file.
258 def index_to_id(steps,index):
259 if index < len(steps):
260 return steps[index][0]
# Try to bring one node out of debug state.
#
# Visible flow: refresh the node's known_hosts entry, open a
# PlanetLabSession and a NodeConnection, optionally force a reboot, inspect
# the boot state, scan dmesg for drive errors, then scan bm.log for known
# step markers, join the ids of the markers found into one "-"-separated
# sequence string and choose an action by exact match on that string.
#
# Args:
#   hostname:      node to work on.
#   config:        options object (quiet, nosetup, verbose, force); may be
#                  None, in which case the session is created with both
#                  flags False.
#   forced_action: "reboot" requests restart_node('rins').
#
# NOTE(review): `node` is used throughout but bound on a line not visible in
# this view (presumably `node = hostname`).  Return statements and several
# if/else/try lines are likewise not visible.
264 def reboot(hostname, config=None, forced_action=None):
267 print "Creating session for %s" % node
268 # update known_hosts file (in case the node has rebooted since last run)
269 if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
270 k = SSHKnownHosts(); k.update(node); k.write(); del k
# Two alternative constructions; the condition selecting between them is
# not visible in this view.
273 session = PlanetLabSession(node, False, False)
275 session = PlanetLabSession(node, config.nosetup, config.verbose)
276 conn = session.get_connection(config)
278 if forced_action == "reboot":
279 conn.restart_node('rins')
282 boot_state = conn.get_boot_state()
283 if boot_state == "boot":
284 print "...Boot state of %s already completed : skipping..." % node
286 elif boot_state == "unknown":
287 print "...Unknown bootstate for %s : skipping..."% node
292 if conn.bootmanager_running():
293 print "...BootManager is currently running. Skipping host %s" % node
# NOTE(review): config.force is read without the `config and ...` guard used
# elsewhere in this function; the enclosing condition is not visible here.
298 conn.restart_bootmanager(config.force)
301 if config and not config.quiet: print "...downloading dmesg from %s" % node
302 dmesg = conn.get_dmesg()
303 child = fdpexpect.fdspawn(dmesg)
# (id, pattern) pairs for drive/controller errors in dmesg.  The patterns
# contain regex escapes (\d, \w) written in non-raw string literals.
308 ('scsierror' , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
309 ('ioerror' , 'end_request: I/O error, dev sd\w+, sector \d+'),
310 ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
311 ('atareadyerror' , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
312 ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
313 ('sdXerror' , 'sd\w: Current: sense key: Medium Error'),
314 ('floppytimeout','floppy0: floppy timeout called'),
315 ('floppyerror', 'end_request: I/O error, dev fd\w+, sector \d+'),
317 # floppy0: floppy timeout called
318 # end_request: I/O error, dev fd0, sector 0
320 #Buffer I/O error on device dm-2, logical block 8888896
321 #ata1: status=0x51 { DriveReady SeekComplete Error }
322 #ata1: error=0x40 { UncorrectableError }
323 #SCSI error : <0 0 0 0> return code = 0x8000002
324 #sda: Current: sense key: Medium Error
325 # Additional sense: Unrecovered read error - auto reallocate failed
327 #SCSI error : <0 2 0 0> return code = 0x40001
328 #end_request: I/O error, dev sda, sector 572489600
# pexpect.EOF is appended last, so an index equal to len(steps) means the
# end of the log was reached without another match.  `id` shadows the
# builtin of the same name.
330 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
337 if config and not config.quiet: print "SET: ", s
340 print "...Potential drive errors on %s" % node
# A floppy error is tolerated when `s` has exactly two entries; what `s`
# holds is established on lines not visible in this view.
341 if len(s) == 2 and 'floppyerror' in s:
342 print "...Should investigate. Continuing with node."
344 print "...Should investigate. Skipping node."
347 print "...Downloading bm.log from %s" % node
348 log = conn.get_bootmanager_log()
349 child = fdpexpect.fdspawn(log)
353 if config and not config.quiet: print "...Scanning bm.log for errors"
# (id, pattern) pairs for BootManager progress and failure messages.  Some
# labels occur under several ids (update/update2, installinit/installinit2).
# NOTE(review): these strings are handed to child.expect(); if expect()
# compiles them as regular expressions, '.' matches any character and
# '[Errno 30]' is a character class rather than literal text -- confirm
# against the pexpect documentation.
359 ('bminit' , 'Initializing the BootManager.'),
360 ('cfg' , 'Reading node configuration file.'),
361 ('auth' , 'Authenticating node with PLC.'),
362 ('getplc' , 'Retrieving details of node from PLC.'),
363 ('update' , 'Updating node boot state at PLC.'),
364 ('hardware' , 'Checking if hardware requirements met.'),
365 ('installinit' , 'Install: Initializing.'),
366 ('installdisk' , 'Install: partitioning disks.'),
367 ('installbootfs', 'Install: bootstrapfs tarball.'),
368 ('installcfg' , 'Install: Writing configuration files.'),
369 ('installstop' , 'Install: Shutting down installer.'),
370 ('update2' , 'Updating node boot state at PLC.'),
371 ('installinit2' , 'Install: Initializing.'),
372 ('validate' , 'Validating node installation.'),
373 ('rebuildinitrd', 'Rebuilding initrd'),
374 ('netcfg' , 'Install: Writing Network Configuration files.'),
375 ('update3' , 'Updating node configuration.'),
376 ('disk' , 'Checking for unused disks to add to LVM.'),
377 ('update4' , 'Sending hardware configuration to PLC.'),
378 ('debug' , 'Starting debug mode'),
379 ('bmexceptmount', 'BootManagerException during mount'),
380 ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
381 ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
382 ('exception' , 'Exception'),
383 ('nocfg' , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
384 ('protoerror' , 'XML RPC protocol error'),
385 ('implementerror', 'Implementation Error'),
386 ('readonlyfs' , '[Errno 30] Read-only file system'),
387 ('noinstall' , 'notinstalled'),
388 ('bziperror' , 'bzip2: Data integrity error when decompressing.'),
389 ('noblockdev' , "No block devices detected."),
390 ('hardwarefail' , 'Hardware requirements not met'),
391 ('chrootfail' , 'Running chroot /tmp/mnt/sysimg'),
392 ('modulefail' , 'Unable to get list of system modules'),
393 ('writeerror' , 'write error: No space left on device'),
394 ('nonode' , 'Failed to authenticate call: No such node'),
395 ('authfail' , 'Failed to authenticate call: Call could not be authenticated'),
396 ('bootcheckfail' , 'BootCheckAuthentication'),
397 ('bootupdatefail' , 'BootUpdateNode'),
# `list` and `id` shadow the builtins of the same names within this function.
399 list = steps_to_list(steps)
400 index = child.expect( list + [ pexpect.EOF ])
401 id = index_to_id(steps,index)
404 if id == "exception":
405 if config and not config.quiet: print "...Found An Exception!!!"
# index == len(list) is the position of pexpect.EOF: end of log reached.
406 elif index == len(list):
# The ids collected in `sequence` become one "-"-separated key.
410 s = "-".join(sequence)
411 print " FOUND SEQUENCE: ", s
# Dispatch on the exact sequence string.  Actions used below:
# restart_bootmanager('boot'), restart_bootmanager('rins'),
# restart_node('rins'), key repair, a configuration dump, or a printed
# notice only.
413 if s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done":
414 if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
415 conn.restart_bootmanager('boot')
416 elif s == "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done":
417 if conn.compare_and_repair_nodekeys():
418 # the keys either are in sync or were forced in sync.
419 # so try to reboot the node again.
420 conn.restart_bootmanager('boot')
422 # there was some failure to synchronize the keys.
423 print "...Unable to repair node keys on %s" % node
424 elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done" or \
425 s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done":
426 conn.restart_bootmanager('boot')
427 elif s == "bminit-cfg-auth-getplc-update-debug-done":
428 conn.restart_bootmanager('boot')
429 elif s == "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done" or \
430 s == "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done":
431 conn.restart_bootmanager('rins')
432 elif s == "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done":
433 conn.restart_bootmanager('boot')
434 elif s == "bminit-cfg-auth-protoerror-exception-update-debug-done":
435 conn.restart_bootmanager('boot')
436 elif s == "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done" or \
437 s == "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done":
438 # reinstall b/c it is not installed.
439 conn.restart_bootmanager('rins')
440 elif s == "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done" or \
441 s == "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done":
443 conn.restart_bootmanager('rins')
444 elif s == "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done" or \
445 s == "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done":
446 conn.restart_node('rins')
447 elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done":
448 conn.restart_node('rins')
449 elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done":
450 conn.restart_node('rins')
451 elif s == "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done":
452 conn.restart_bootmanager('rins')
453 elif s == "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done":
454 conn.restart_bootmanager('rins')
455 elif s == "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done" or \
456 s == "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done":
457 conn.dump_plconf_file()
458 elif s == "bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarefail-update-debug-done" or \
459 s == "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarefail-update-debug-done" or \
460 s == "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarefail-update-debug-done":
461 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
464 elif s == "bminit-cfg-auth-getplc-update-hardware-exception-hardwarefail-update-debug-done":
465 # MAKE An ACTION record that this host has failed hardware. May
466 # require either an exception "/minhw" or other manual intervention.
467 # Definitely need to send out some more EMAIL.
468 print "...NOTIFY OWNER OF BROKEN HARDWARE!!!"
471 elif s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done" or \
472 s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done" or \
473 s == "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done" or \
474 s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done":
475 conn.restart_node('rins')
476 #conn.restart_bootmanager('rins')
477 print "...Need to follow up on this one."
479 ## If the disk is full, just start over.
480 #conn.restart_bootmanager('rins')
# No branch matched: report the sequence so a new case can be added.
485 print " HOST %s" % hostname
486 print " UNKNOWN SEQUENCE: %s" % s
492 # MAIN -------------------------------------------------------------------
495 from config import config
496 from optparse import OptionParser
497 parser = OptionParser()
498 parser.set_defaults(node=None, nodelist=None, child=False, nosetup=False, verbose=False, force=None, quiet=False)
499 parser.add_option("", "--child", dest="child", action="store_true",
500 help="This is the child mode of this process.")
501 parser.add_option("", "--force", dest="force", metavar="boot_state",
502 help="Force a boot state passed to BootManager.py.")
503 parser.add_option("", "--quiet", dest="quiet", action="store_true",
504 help="Extra quiet output messages.")
505 parser.add_option("", "--verbose", dest="verbose", action="store_true",
506 help="Extra debug output messages.")
507 parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
508 help="Do not perform the orginary setup phase.")
509 parser.add_option("", "--node", dest="node", metavar="nodename.edu",
510 help="A single node name to try to bring out of debug mode.")
511 parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt",
512 help="A list of nodes to bring out of debug mode.")
513 config = config(parser)
517 nodes = config.getListFromFile(config.nodelist)
519 nodes = [ config.node ]
527 if __name__ == "__main__":