48e99ee5132af75d5d510e05910d5651d2d069c5
[monitor.git] / nodereboot.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6 import plc
7 import auth
# Module-level PLC API handle, created at import time from the credentials
# exposed by the local `auth` module.  NodeConnection uses it for its
# GetNodes/UpdateNode calls.
api = plc.PLC(auth.auth, auth.plc)
9
10 import sys
11 import os
12
13 from getsshkeys import SSHKnownHosts
14
15 import subprocess
16 import time
17 import soltesz
18 from sets import Set
19
20 import ssh.pxssh as pxssh
21 import ssh.fdpexpect as fdpexpect
22 import ssh.pexpect as pexpect
23
24
25
26 import signal
class Sopen(subprocess.Popen):
        """subprocess.Popen with a kill() method that signals the child by pid."""

        def kill(self, signal=signal.SIGTERM):
                """Send `signal` (SIGTERM unless overridden) to the child process."""
                # The parameter shadows the `signal` module inside this method;
                # the default value was already bound when the method was defined.
                child_pid = self.pid
                os.kill(child_pid, signal)
30
31 #from Rpyc import SocketConnection, Async
32 from Rpyc import SocketConnection, Async
33 from Rpyc.Utils import *
34
35
36 class NodeConnection:
37         def __init__(self, connection, node, config):
38                 self.node = node
39                 self.c = connection
40                 self.config = config
41
42         def get_boot_state(self):
43                 if self.c.modules.os.path.exists('/tmp/source'):
44                         return "dbg"
45                 elif self.c.modules.os.path.exists('/vservers'): 
46                         return "boot"
47                 else:
48                         return "unknown"
49
50         def get_dmesg(self):
51                 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
52                 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
53                 log = open("log/dmesg.%s.log" % self.node, 'r')
54                 return log
55
56         def get_bootmanager_log(self):
57                 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
58                 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
59                 log = open("log/bm.%s.log" % self.node, 'r')
60                 return log
61
62         def dump_plconf_file(self):
63                 c = self.c
64                 c.modules.sys.path.append("/tmp/source/")
65                 c.modules.os.chdir('/tmp/source')
66
67                 log = c.modules.BootManager.log('/tmp/new.log')
68                 bm = c.modules.BootManager.BootManager(log,'boot')
69
70                 BootManagerException = c.modules.Exceptions.BootManagerException
71                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
72                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
73                 bm_continue = True
74
75                 InitializeBootManager.Run(bm.VARS, bm.LOG)
76                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
77                 except Exception, x:
78                         bm_continue = False
79                         print "   ERROR:", x
80                         print "   Possibly, unable to find valid configuration file"
81
82                 if bm_continue and self.config and not self.config.quiet:
83                         for key in bm.VARS.keys():
84                                 print key, " == ", bm.VARS[key]
85                 else:
86                         if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
87                 
88
89         def compare_and_repair_nodekeys(self):
90                 c = self.c
91                 c.modules.sys.path.append("/tmp/source/")
92                 c.modules.os.chdir('/tmp/source')
93
94                 log = c.modules.BootManager.log('/tmp/new.log')
95                 bm = c.modules.BootManager.BootManager(log,'boot')
96
97                 BootManagerException = c.modules.Exceptions.BootManagerException
98                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
99                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
100                 bm_continue = True
101
102                 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
103
104                 InitializeBootManager.Run(bm.VARS, bm.LOG)
105                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
106                 except Exception, x:
107                         bm_continue = False
108                         if not config.quiet: print "exception"
109                         if not config.quiet: print x
110                         print "   Possibly, unable to find valid configuration file"
111
112                 if bm_continue:
113                         print "   NODE: %s" % bm.VARS['NODE_KEY']
114                         print "   PLC : %s" % plcnode['key']
115
116                         if bm.VARS['NODE_KEY'] == plcnode['key']:
117                                 return True
118                         else:
119                                 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
120                                         print "   Successfully updated NODE_KEY with PLC"
121                                         return True
122                                 else:
123                                         return False
124                                 
125                         #for key in bm.VARS.keys():
126                         #       print key, " == ", bm.VARS[key]
127                 else:
128                         print "   Unable to retrieve NODE_KEY"
129
130         def bootmanager_running(self):
131                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
132                         return True
133                 else:
134                         return False
135
136         def restart_node(self, state='boot'):
137                 api.UpdateNode(self.node, {'boot_state' : state})
138
139                 print "   Killing all slice processes... : %s" %  self.node
140                 cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
141                 self.c.modules.os.system(cmd_slicekill)
142
143                 cmd = """ shutdown -r +1 & """
144                 print "   Restarting %s : %s" % ( self.node, cmd)
145                 self.c.modules.os.system(cmd)
146                 return
147
148         def restart_bootmanager(self, forceState):
149
150                 self.c.modules.os.chdir('/tmp/source')
151                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
152                         print "   BootManager is already running: try again soon..."
153                 else:
154                         print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
155                         cmd = "( touch /tmp/BM_RUNNING ;  " + \
156                               "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
157                                   "  rm -f /tmp/BM_RUNNING " + \
158                                   ") &" 
159                         cmd = cmd % forceState
160                         self.c.modules.os.system(cmd)
161
162                 return 
163
164
165 class PlanetLabSession:
166         globalport = 22222
167
168         def __init__(self, node, nosetup, verbose):
169                 self.verbose = verbose
170                 self.node = node
171                 self.port = None
172                 self.nosetup = nosetup
173                 self.command = None
174                 self.setup_host()
175
176         def get_connection(self, config):
177                 return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
178         
179         def setup_host(self):
180                 self.port = PlanetLabSession.globalport
181                 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
182
183                 args = {}
184                 args['port'] = self.port
185                 args['user'] = 'root'
186                 args['hostname'] = self.node
187                 args['monitordir'] = "/home/soltesz/monitor"
188
189                 if self.nosetup:
190                         print "Skipping setup"
191                         return 
192
193                 # COPY Rpyc files to host
194                 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
195                 if self.verbose: print cmd
196                 ret = os.system(cmd)
197                 if ret != 0:
198                         print "UNKNOWN SSH KEY FOR %s" % self.node
199                         print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
200                         k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
201                         ret = os.system(cmd)
202                         if ret != 0:
203                                 print "FAILED TWICE"
204                                 sys.exit(1)
205
206                 #cmd = "rsync -qv -az -e ssh %(monitordir)s/BootManager.py 
207                 # %(monitordir)s/ChainBoot.py %(user)s@%(hostname)s:/tmp/source" % args
208                 #print cmd; os.system(cmd)
209
210                 # KILL any already running servers.
211                 cmd = """ssh %(user)s@%(hostname)s """ + \
212                      """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
213                 cmd = cmd % args
214                 if self.verbose: print cmd
215                 os.system(cmd)
216
217                 # START a new rpyc server.
218                 cmd = """ssh %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
219                          """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
220                 cmd = cmd % args
221                 if self.verbose: print cmd
222                 os.system(cmd)
223
224                 # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
225                 # and the following options seems to work well.
226                 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
227                       """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
228                       """-o ConnectTimeout=120 """ + \
229                       """-n -N -L %(port)s:localhost:18812 """ + \
230                       """%(user)s@%(hostname)s"""
231                 cmd = cmd % args
232                 if self.verbose: print cmd
233                 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
234                 ret = self.command.stdout.read(5)
235                 if 'READY' in ret:
236                         # We can return without delay.
237                         time.sleep(1)
238                         return
239
240                 if self.command.returncode is not None:
241                         print "Failed to establish tunnel!"
242                         raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
243
244                 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
245
246         def __del__(self):
247                 if self.command:
248                         if self.verbose: print "Killing SSH session %s" % self.port
249                         self.command.kill()
250
251
def steps_to_list(steps):
        """Return the second item (the pattern) of every (id, pattern) pair, in order."""
        return [pattern for (_step_id, pattern) in steps]
257
def index_to_id(steps,index):
        """Map an index into `steps` to its step id; indexes past the end map to "done"."""
        if index >= len(steps):
                return "done"
        return steps[index][0]
263
264 def reboot(hostname, config=None, forced_action=None):
265
266         node = hostname
267         print "Creating session for %s" % node
268         # update known_hosts file (in case the node has rebooted since last run)
269         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
270         k = SSHKnownHosts(); k.update(node); k.write(); del k
271
272         if config == None:
273                 session = PlanetLabSession(node, False, False)
274         else:
275                 session = PlanetLabSession(node, config.nosetup, config.verbose)
276         conn = session.get_connection(config)
277
278         if forced_action == "reboot":
279                 conn.restart_node('rins')
280                 return True
281
282         boot_state = conn.get_boot_state()
283         if boot_state == "boot":
284                 print "...Boot state of %s already completed : skipping..." % node
285                 return False
286         elif boot_state == "unknown":
287                 print "...Unknown bootstate for %s : skipping..."% node
288                 return False
289         else:
290                 pass
291
292         if conn.bootmanager_running():
293                 print "...BootManager is currently running.  Skipping host %s" % node
294                 return False
295
296         if config != None:
297                 if config.force:
298                         conn.restart_bootmanager(config.force)
299                         return True
300
301         if config and not config.quiet: print "...downloading dmesg from %s" % node
302         dmesg = conn.get_dmesg()
303         child = fdpexpect.fdspawn(dmesg)
304
305         sequence = []
306         while True:
307                 steps = [
308                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
309                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
310                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
311                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
312                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
313                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
314                         ('floppytimeout','floppy0: floppy timeout called'),
315                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
316
317                         # floppy0: floppy timeout called
318                         # end_request: I/O error, dev fd0, sector 0
319
320                         #Buffer I/O error on device dm-2, logical block 8888896
321                         #ata1: status=0x51 { DriveReady SeekComplete Error }
322                         #ata1: error=0x40 { UncorrectableError }
323                         #SCSI error : <0 0 0 0> return code = 0x8000002
324                         #sda: Current: sense key: Medium Error
325                         #       Additional sense: Unrecovered read error - auto reallocate failed
326
327                         #SCSI error : <0 2 0 0> return code = 0x40001
328                         #end_request: I/O error, dev sda, sector 572489600
329                 ]
330                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
331                 sequence.append(id)
332
333                 if id == "done":
334                         break
335
336         s = Set(sequence)
337         if config and not config.quiet: print "SET: ", s
338
339         if len(s) > 1:
340                 print "...Potential drive errors on %s" % node
341                 if len(s) == 2 and 'floppyerror' in s:
342                         print "...Should investigate.  Continuing with node."
343                 else:
344                         print "...Should investigate.  Skipping node."
345                         return False
346
347         print "...Downloading bm.log from %s" % node
348         log = conn.get_bootmanager_log()
349         child = fdpexpect.fdspawn(log)
350
351         time.sleep(1)
352
353         if config and not config.quiet: print "...Scanning bm.log for errors"
354         action_id = "dbg"
355         sequence = []
356         while True:
357
358                 steps = [
359                         ('bminit'               , 'Initializing the BootManager.'),
360                         ('cfg'                  , 'Reading node configuration file.'),
361                         ('auth'                 , 'Authenticating node with PLC.'),
362                         ('getplc'               , 'Retrieving details of node from PLC.'),
363                         ('update'               , 'Updating node boot state at PLC.'),
364                         ('hardware'             , 'Checking if hardware requirements met.'),
365                         ('installinit'  , 'Install: Initializing.'),
366                         ('installdisk'  , 'Install: partitioning disks.'),
367                         ('installbootfs', 'Install: bootstrapfs tarball.'),
368                         ('installcfg'   , 'Install: Writing configuration files.'),
369                         ('installstop'  , 'Install: Shutting down installer.'),
370                         ('update2'              , 'Updating node boot state at PLC.'),
371                         ('installinit2' , 'Install: Initializing.'),
372                         ('validate'             , 'Validating node installation.'),
373                         ('rebuildinitrd', 'Rebuilding initrd'),
374                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
375                         ('update3'              , 'Updating node configuration.'),
376                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
377                         ('update4'              , 'Sending hardware configuration to PLC.'),
378                         ('debug'                , 'Starting debug mode'),
379                         ('bmexceptmount', 'BootManagerException during mount'),
380                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
381                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
382                         ('exception'    , 'Exception'),
383                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
384                         ('protoerror'   , 'XML RPC protocol error'),
385                         ('implementerror', 'Implementation Error'),
386                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
387                         ('noinstall'    , 'notinstalled'),
388                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
389                         ('noblockdev'   , "No block devices detected."),
390                         ('hardwarefail' , 'Hardware requirements not met'),
391                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
392                         ('modulefail'   , 'Unable to get list of system modules'),
393                         ('writeerror'   , 'write error: No space left on device'),
394                         ('nonode'       , 'Failed to authenticate call: No such node'),
395                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
396                         ('bootcheckfail'     , 'BootCheckAuthentication'),
397                         ('bootupdatefail'   , 'BootUpdateNode'),
398                 ]
399                 list = steps_to_list(steps)
400                 index = child.expect( list + [ pexpect.EOF ])
401                 id = index_to_id(steps,index)
402                 sequence.append(id)
403
404                 if id == "exception":
405                         if config and not config.quiet: print "...Found An Exception!!!"
406                 elif index == len(list):
407                         #print "Reached EOF"
408                         break
409                 
410         s = "-".join(sequence)
411         print "   FOUND SEQUENCE: ", s
412
413         if s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done":
414                 if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
415                 conn.restart_bootmanager('boot')
416         elif s == "bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done":
417                 if conn.compare_and_repair_nodekeys():
418                         # the keys either are in sync or were forced in sync.
419                         # so try to reboot the node again.
420                         conn.restart_bootmanager('boot')
421                 else:
422                         # there was some failure to synchronize the keys.
423                         print "...Unable to repair node keys on %s" % node
424         elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done" or \
425                  s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done":
426                 conn.restart_bootmanager('boot')
427         elif s == "bminit-cfg-auth-getplc-update-debug-done":
428                 conn.restart_bootmanager('boot')
429         elif s == "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done" or \
430                  s == "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done":
431                 conn.restart_bootmanager('rins')
432         elif s == "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done":
433                 conn.restart_bootmanager('boot')
434         elif s == "bminit-cfg-auth-protoerror-exception-update-debug-done":
435                 conn.restart_bootmanager('boot')
436         elif s == "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done" or \
437                  s == "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done":
438                 # reinstall b/c it is not installed.
439                 conn.restart_bootmanager('rins')
440         elif s == "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done" or \
441                  s == "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done":
442
443                 conn.restart_bootmanager('rins')
444         elif s == "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done" or \
445                  s == "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done":
446                 conn.restart_node('rins')
447         elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done":
448                 conn.restart_node('rins')
449         elif s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done":
450                 conn.restart_node('rins')
451         elif s == "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done":
452                 conn.restart_bootmanager('rins')
453         elif s == "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done":
454                 conn.restart_bootmanager('rins')
455         elif s == "bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done" or \
456                  s == "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done":
457                 conn.dump_plconf_file()
458         elif s == "bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarefail-update-debug-done" or \
459              s == "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarefail-update-debug-done" or \
460                  s == "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarefail-update-debug-done":
461                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
462                 pass
463
464         elif s == "bminit-cfg-auth-getplc-update-hardware-exception-hardwarefail-update-debug-done":
465                 # MAKE An ACTION record that this host has failed hardware.  May
466                 # require either an exception "/minhw" or other manual intervention.
467                 # Definitely need to send out some more EMAIL.
468                 print "...NOTIFY OWNER OF BROKEN HARDWARE!!!"
469                 pass
470
471         elif s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done" or \
472              s == "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done" or \
473              s == "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done" or \
474                  s == "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done":
475                 conn.restart_node('rins')
476                 #conn.restart_bootmanager('rins')
477                 print "...Need to follow up on this one."
478
479                 ## If the disk is full, just start over.
480                 #conn.restart_bootmanager('rins')
481         elif s == "":
482                 pass
483
484         else:
485                 print "   HOST %s" % hostname
486                 print "   UNKNOWN SEQUENCE: %s" % s
487                 pass
488
489         return True
490         
491
492 # MAIN -------------------------------------------------------------------
493
def main():
        """Parse the command line and call reboot() for each selected node.

        Nodes come from --nodelist (a file) or --node (a single name); with
        neither, the help text is printed and the process exits with status 1.
        """
        from config import config
        from optparse import OptionParser

        parser = OptionParser()
        parser.set_defaults(node=None, nodelist=None, child=False, nosetup=False, verbose=False, force=None, quiet=False)

        # One entry per option, in the order they should appear in --help.
        option_table = [
                ("--child",    dict(dest="child", action="store_true",
                                    help="This is the child mode of this process.")),
                ("--force",    dict(dest="force", metavar="boot_state",
                                    help="Force a boot state passed to BootManager.py.")),
                ("--quiet",    dict(dest="quiet", action="store_true",
                                    help="Extra quiet output messages.")),
                ("--verbose",  dict(dest="verbose", action="store_true",
                                    help="Extra debug output messages.")),
                ("--nosetup",  dict(dest="nosetup", action="store_true",
                                    help="Do not perform the orginary setup phase.")),
                ("--node",     dict(dest="node", metavar="nodename.edu",
                                    help="A single node name to try to bring out of debug mode.")),
                ("--nodelist", dict(dest="nodelist", metavar="nodelist.txt",
                                    help="A list of nodes to bring out of debug mode.")),
        ]
        for flag, keywords in option_table:
                parser.add_option("", flag, **keywords)

        options = config(parser)
        options.parse_args()

        if options.nodelist:
                targets = options.getListFromFile(options.nodelist)
        elif options.node:
                targets = [ options.node ]
        else:
                parser.print_help()
                sys.exit(1)

        for target in targets:
                reboot(target, options)
526
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
        main()