# [monitor.git] / nodereboot.py -- added for the first time
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import plc
6 import auth
7 api = plc.PLC(auth.auth, auth.plc)
8
9 import sys
10 import os
11 import policy
12
13 from getsshkeys import SSHKnownHosts
14
15 import subprocess
16 import time
17 import soltesz
18 from sets import Set
19
20 import ssh.pxssh as pxssh
21 import ssh.fdpexpect as fdpexpect
22 import ssh.pexpect as pexpect
23 from unified_model import *
24 from emailTxt import mailtxt
25
26 import signal
class Sopen(subprocess.Popen):
        # Popen subclass whose kill() delivers a configurable signal (default
        # SIGTERM) directly to the child pid; used to tear down the background
        # ssh tunnel process started in PlanetLabSession.setup_host().
        def kill(self, signal = signal.SIGTERM):
                # NOTE(review): the parameter shadows the 'signal' module, but the
                # default is evaluated at def time, so signal.SIGTERM still
                # resolves to the module-level constant.
                os.kill(self.pid, signal)
30
31 #from Rpyc import SocketConnection, Async
32 from Rpyc import SocketConnection, Async
33 from Rpyc.Utils import *
34
def get_fbnode(node):
        """Return the cached 'findbad' record values for the given node hostname."""
        findbad_db = soltesz.dbLoad("findbad")
        return findbad_db['nodes'][node]['values']
39
class NodeConnection:
        """Proxy for inspecting and manipulating a node over an Rpyc connection.

        Remote operations go through self.c.modules.<mod>, which executes the
        named module calls (os, sys, BootManager, ...) on the node itself.
        """

        def __init__(self, connection, node, config):
                # connection: Rpyc SocketConnection (see PlanetLabSession.get_connection)
                # node:       hostname of the node
                # config:     options object or None; only .quiet is consulted here
                self.node = node
                self.c = connection
                self.config = config

        def get_boot_state(self):
                """Classify the node's current state from filesystem markers.

                Returns 'dbg' if the BootManager source tree is unpacked in
                /tmp/source, 'boot' if /vservers exists, else 'unknown'.
                """
                if self.c.modules.os.path.exists('/tmp/source'):
                        return "dbg"
                elif self.c.modules.os.path.exists('/vservers'): 
                        return "boot"
                else:
                        return "unknown"

        def get_dmesg(self):
                """Dump the remote kernel log, download it, and return an open
                file object on the local copy (log/dmesg.<node>.log)."""
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
                download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
                log = open("log/dmesg.%s.log" % self.node, 'r')
                return log

        def get_bootmanager_log(self):
                """Download the remote /tmp/bm.log (gzipped), decompress it
                locally, and return an open file object on the result."""
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
                os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log

        def dump_plconf_file(self):
                """Run the BootManager initialize/read-configuration steps on the
                node and print the resulting VARS (the parsed node config),
                unless config.quiet is set."""
                c = self.c
                c.modules.sys.path.append("/tmp/source/")
                c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                # BootManager step classes, resolved remotely on the node.
                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "   ERROR:", x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue and self.config and not self.config.quiet:
                        for key in bm.VARS.keys():
                                print key, " == ", bm.VARS[key]
                else:
                        if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
                

        def compare_and_repair_nodekeys(self):
                """Compare the node's on-disk NODE_KEY with PLC's record.

                Returns True if they match or PLC was successfully updated to
                the node's key, False if the update failed; falls through
                (implicitly returning None) when the node configuration cannot
                be read.
                """
                c = self.c
                c.modules.sys.path.append("/tmp/source/")
                c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                plcnode = api.GetNodes({'hostname': self.node}, None)[0]

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "exception"
                        print x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue:
                        print "   NODE: %s" % bm.VARS['NODE_KEY']
                        print "   PLC : %s" % plcnode['key']

                        if bm.VARS['NODE_KEY'] == plcnode['key']:
                                return True
                        else:
                                # Push the node's key into PLC so future boot auth succeeds.
                                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                                        print "   Successfully updated NODE_KEY with PLC"
                                        return True
                                else:
                                        return False
                                
                        #for key in bm.VARS.keys():
                        #       print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to retrieve NODE_KEY"

        def bootmanager_running(self):
                """Return True if the BootManager lock file exists on the node."""
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        return True
                else:
                        return False

        def set_nodestate(self, state='boot'):
                """Set the node's boot_state at PLC and return the API result."""
                return api.UpdateNode(self.node, {'boot_state' : state})

        def restart_node(self, state='boot'):
                """Set boot_state at PLC and reboot the node.

                First attempt within the 24h flag window: kill all slice
                processes with vkill and schedule a clean 'shutdown -r +1'.
                If the 'gentlekill' flag is still recent (a gentle restart was
                already tried), force a reboot via the sysrq-trigger s/u/b
                sequence instead.
                """
                api.UpdateNode(self.node, {'boot_state' : state})

                pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
                if not pflags.getRecentFlag('gentlekill'):
                        print "   Killing all slice processes... : %s" %  self.node
                        cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
                        self.c.modules.os.system(cmd_slicekill)
                        cmd = """ shutdown -r +1 & """
                        print "   Restarting %s : %s" % ( self.node, cmd)
                        self.c.modules.os.system(cmd)

                        # Remember that the gentle path was tried, so the next
                        # call within the window escalates to sysrq.
                        pflags.setRecentFlag('gentlekill')
                        pflags.save()
                else:
                        print "   Restarting with sysrq 'sub' %s" % self.node
                        cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
                        self.c.modules.os.system(cmd)

                return

        def restart_bootmanager(self, forceState):
                """Launch 'BootManager.py <forceState>' on the node in the
                background, guarded by the /tmp/BM_RUNNING lock file; no-op
                if BootManager is already running."""
                self.c.modules.os.chdir('/tmp/source')
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        print "   BootManager is already running: try again soon..."
                else:
                        print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                              "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                                  "  rm -f /tmp/BM_RUNNING " + \
                                  ") &" 
                        cmd = cmd % forceState
                        self.c.modules.os.system(cmd)

                return 
179
180
181 import random
class PlanetLabSession:
        """One ssh session to a node: an Rpyc forking server started on the
        node plus a local port-forward (localhost:port -> node:18812) used to
        reach it."""

        # Base local port for tunnels, randomized to reduce collisions between
        # concurrent runs; bumped once per session in setup_host().
        globalport = 22000 + int(random.random()*1000)

        def __init__(self, node, nosetup, verbose):
                # node:    hostname to connect to
                # nosetup: skip rsync/server/tunnel setup (assume already running)
                # verbose: echo the commands being executed
                self.verbose = verbose
                self.node = node
                self.port = None
                self.nosetup = nosetup
                self.command = None
                self.setup_host()

        def get_connection(self, config):
                """Return a NodeConnection speaking Rpyc through the local tunnel."""
                return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        
        def setup_host(self):
                """Prepare the node and open the tunnel.

                Steps: pick a local port; rsync the Rpyc tree to the node,
                refreshing its known_hosts entry and retrying once on failure;
                kill any stale Rpyc server and start a fresh forking server via
                ssh; finally open a background ssh port-forward and wait for
                its LocalCommand to print 'READY'.

                Raises Exception if login fails twice or the tunnel does not
                come up.
                """
                self.port = PlanetLabSession.globalport
                PlanetLabSession.globalport = PlanetLabSession.globalport + 1

                args = {}
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
                args['monitordir'] = "/home/soltesz/monitor"
                ssh_port = 22

                if self.nosetup:
                        print "Skipping setup"
                        return 

                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
                # TODO: Add timeout
                timeout = 120
                localos = soltesz.CMD()

                ret = localos.system(cmd, timeout)
                print ret
                if ret != 0:
                        # Failure here is treated as a stale/unknown host key:
                        # refresh it straight from the node and retry once.
                        print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
                        #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
                        ret = localos.system(cmd, timeout)
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
                                raise Exception("Failed twice trying to login with updated ssh host key")

                t1 = time.time()
                # KILL any already running servers.
                ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port)
                (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
                #cmd = """ssh %(user)s@%(hostname)s """ + \
                #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
                #cmd = cmd % args
                #if self.verbose: print cmd
                ## TODO: Add timeout
                #print localos.system(cmd,timeout)

                ## START a new rpyc server.
                #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
                #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
                #cmd = cmd % args
                #if self.verbose: print cmd
                #print localos.system(cmd,timeout)
                print ssh.ret

                # TODO: Add timeout
                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
                # and the following options seems to work well.
                cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
                          """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
                          """-o ConnectTimeout=120 """ + \
                          """-n -N -L %(port)s:localhost:18812 """ + \
                          """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # TODO: the read() here may block indefinitely.  Need a better
                # approach therefore, that includes a timeout.
                #ret = self.command.stdout.read(5)
                ret = soltesz.read_t(self.command.stdout, 5)

                t2 = time.time()
                if 'READY' in ret:
                        # NOTE: There is still a slight race for machines that are slow...
                        self.timeout = 2*(t2-t1)
                        print "Sleeping for %s sec" % self.timeout
                        time.sleep(self.timeout)
                        return

                if self.command.returncode is not None:
                        print "Failed to establish tunnel!"
                        raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

                raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        def __del__(self):
                """Kill the background ssh tunnel process, if one was started."""
                if self.command:
                        if self.verbose: print "Killing SSH session %s" % self.port
                        self.command.kill()
293
294
def steps_to_list(steps):
        """Return the pattern strings from a list of (id, pattern) steps.

        Arguments:
                steps -- sequence of (id, pattern) tuples

        Returns the patterns, in order, suitable for passing to pexpect's
        expect().
        """
        # List comprehension instead of a manual append loop; the '_id' name
        # also avoids shadowing the builtin 'id'.
        return [pattern for (_id, pattern) in steps]
300
def index_to_id(steps, index):
        """Map a pexpect match index back to its step id.

        'index' is the position returned by expect(); any index at or past
        len(steps) (i.e. the appended EOF pattern) maps to "done".
        """
        return steps[index][0] if index < len(steps) else "done"
306
307 def reboot(hostname, config=None, forced_action=None):
308
309         # NOTE: Nothing works if the bootcd is REALLY old.
310         #       So, this is the first step.
311         fbnode = get_fbnode(hostname)
312         if fbnode['category'] == "OLDBOOTCD":
313                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
314                 args = {}
315                 args['hostname_list'] = "    %s" % hostname
316
317                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
318                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
319
320                 loginbase = plc.siteId(hostname)
321                 m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
322
323                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
324                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
325                 return True
326
327         node = hostname
328         print "Creating session for %s" % node
329         # update known_hosts file (in case the node has rebooted since last run)
330         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
331         try:
332                 k = SSHKnownHosts(); k.update(node); k.write(); del k
333         except:
334                 import traceback; print traceback.print_exc()
335                 return False
336
337         try:
338                 if config == None:
339                         session = PlanetLabSession(node, False, True)
340                 else:
341                         session = PlanetLabSession(node, config.nosetup, config.verbose)
342         except Exception, e:
343                 print "ERROR setting up session for %s" % hostname
344                 import traceback; print traceback.print_exc()
345                 print e
346                 return False
347
348         try:
349                 conn = session.get_connection(config)
350         except EOFError:
351                 # NOTE: sometimes the wait in setup_host() is not long enough.  
352                 # So, here we try to wait a little longer before giving up entirely.
353                 try:
354                         time.sleep(session.timeout*4)
355                         conn = session.get_connection(config)
356                 except:
357                         import traceback; print traceback.print_exc()
358                         return False
359                         
360
361         if forced_action == "reboot":
362                 conn.restart_node('rins')
363                 return True
364
365         boot_state = conn.get_boot_state()
366         if boot_state == "boot":
367                 print "...Boot state of %s already completed : skipping..." % node
368                 return True
369         elif boot_state == "unknown":
370                 print "...Unknown bootstate for %s : skipping..."% node
371                 return False
372         else:
373                 pass
374
375         if conn.bootmanager_running():
376                 print "...BootManager is currently running.  Skipping host %s" % node
377                 return True
378
379         #if config != None:
380         #       if config.force:
381         #               conn.restart_bootmanager(config.force)
382         #               return True
383
384         # Read persistent flags, tagged on one week intervals.
385         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
386                 
387
388         if config and not config.quiet: print "...downloading dmesg from %s" % node
389         dmesg = conn.get_dmesg()
390         child = fdpexpect.fdspawn(dmesg)
391
392         sequence = []
393         while True:
394                 steps = [
395                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
396                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
397                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
398
399                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
400                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
401                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
402                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
403                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
404                         ('floppytimeout','floppy0: floppy timeout called'),
405                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
406
407                         # floppy0: floppy timeout called
408                         # end_request: I/O error, dev fd0, sector 0
409
410                         #Buffer I/O error on device dm-2, logical block 8888896
411                         #ata1: status=0x51 { DriveReady SeekComplete Error }
412                         #ata1: error=0x40 { UncorrectableError }
413                         #SCSI error : <0 0 0 0> return code = 0x8000002
414                         #sda: Current: sense key: Medium Error
415                         #       Additional sense: Unrecovered read error - auto reallocate failed
416
417                         #SCSI error : <0 2 0 0> return code = 0x40001
418                         #end_request: I/O error, dev sda, sector 572489600
419                 ]
420                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
421                 sequence.append(id)
422
423                 if id == "done":
424                         break
425
426         s = Set(sequence)
427         if config and not config.quiet: print "\tSET: ", s
428
429         if len(s) > 1:
430                 print "...Potential drive errors on %s" % node
431                 if len(s) == 2 and 'floppyerror' in s:
432                         print "...Should investigate.  Continuing with node."
433                 else:
434                         print "...Should investigate.  Skipping node."
435                         # TODO: send message related to these errors.
436                         args = {}
437                         args['hostname'] = hostname
438                         args['log'] = conn.get_dmesg().read()
439
440                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
441                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
442
443                         loginbase = plc.siteId(hostname)
444                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
445                         conn.set_nodestate('diag')
446                         return False
447
448         print "...Downloading bm.log from %s" % node
449         log = conn.get_bootmanager_log()
450         child = fdpexpect.fdspawn(log)
451
452         try:
453                 if config.collect: return True
454         except:
455                 pass
456
457         time.sleep(1)
458
459         if config and not config.quiet: print "...Scanning bm.log for errors"
460         action_id = "dbg"
461         sequence = []
462         while True:
463
464                 steps = [
465                         ('bminit'               , 'Initializing the BootManager.'),
466                         ('cfg'                  , 'Reading node configuration file.'),
467                         ('auth'                 , 'Authenticating node with PLC.'),
468                         ('getplc'               , 'Retrieving details of node from PLC.'),
469                         ('update'               , 'Updating node boot state at PLC.'),
470                         ('hardware'             , 'Checking if hardware requirements met.'),
471                         ('installinit'  , 'Install: Initializing.'),
472                         ('installdisk'  , 'Install: partitioning disks.'),
473                         ('installbootfs', 'Install: bootstrapfs tarball.'),
474                         ('installcfg'   , 'Install: Writing configuration files.'),
475                         ('installstop'  , 'Install: Shutting down installer.'),
476                         ('update2'              , 'Updating node boot state at PLC.'),
477                         ('installinit2' , 'Install: Initializing.'),
478                         ('validate'             , 'Validating node installation.'),
479                         ('rebuildinitrd', 'Rebuilding initrd'),
480                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
481                         ('update3'              , 'Updating node configuration.'),
482                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
483                         ('update4'              , 'Sending hardware configuration to PLC.'),
484                         ('debug'                , 'Starting debug mode'),
485                         ('bmexceptmount', 'BootManagerException during mount'),
486                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
487                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
488                         ('exception'    , 'Exception'),
489                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
490                         ('protoerror'   , 'XML RPC protocol error'),
491                         ('nodehostname' , 'Configured node hostname does not resolve'),
492                         ('implementerror', 'Implementation Error'),
493                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
494                         ('noinstall'    , 'notinstalled'),
495                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
496                         ('noblockdev'   , "No block devices detected."),
497                         ('downloadfail' , 'Unable to download main tarball /boot-alpha/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
498                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
499                         ('hardwarerequirefail' , 'Hardware requirements not met'),
500                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
501                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
502                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
503                         ('modulefail'   , 'Unable to get list of system modules'),
504                         ('writeerror'   , 'write error: No space left on device'),
505                         ('nospace'      , "No space left on device"),
506                         ('nonode'       , 'Failed to authenticate call: No such node'),
507                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
508                         ('bootcheckfail'     , 'BootCheckAuthentication'),
509                         ('bootupdatefail'   , 'BootUpdateNode'),
510                 ]
511                 list = steps_to_list(steps)
512                 index = child.expect( list + [ pexpect.EOF ])
513                 id = index_to_id(steps,index)
514                 sequence.append(id)
515
516                 if id == "exception":
517                         if config and not config.quiet: print "...Found An Exception!!!"
518                 elif index == len(list):
519                         #print "Reached EOF"
520                         break
521                 
522         s = "-".join(sequence)
523         print "   FOUND SEQUENCE: ", s
524
525         # NOTE: We get or set the flag based on the current sequence identifier.
526         #  By using the sequence identifier, we guarantee that there will be no
527         #  frequent loops.  I'm guessing there is a better way to track loops,
528         #  though.
529         if not config.force and pflags.getRecentFlag(s):
530                 pflags.setRecentFlag(s)
531                 pflags.save() 
532                 print "... flag is set or it has already run recently. Skipping %s" % node
533                 return True
534
535         sequences = {}
536
537
538         # restart_bootmanager_boot
539         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
540                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
541                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
542                         "bminit-cfg-auth-getplc-update-debug-done",
543                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
544                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
545                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
546                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
547                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
548                         ]:
549                 sequences.update({n : "restart_bootmanager_boot"})
550
551         #       conn.restart_bootmanager('rins')
552         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
553                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
554                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
555                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
556                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
557                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
558                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
559                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
560                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
561                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
562                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
563                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
564                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
565                         ]:
566                 sequences.update({n : "restart_bootmanager_rins"})
567
568         # repair_node_keys
569         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
570
571         #   conn.restart_node('rins')
572         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
573                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
574                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
575                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
576                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
577                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
578                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
579                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
580                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
581                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
582                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
583                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
584                         ]:
585                 sequences.update({n : "restart_node_rins"})
586
587         #       restart_node_boot
588         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
589                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
590                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
591                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
592                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
593                          ]:
594                 sequences.update({n: "restart_node_boot"})
595
596         # update_node_config_email
597         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
598                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
599                         ]:
600                 sequences.update({n : "update_node_config_email"})
601
602         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
603                 sequences.update({n : "nodenetwork_email"})
604
605         # update_bootcd_email
606         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
607                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
608                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
609                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
610                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
611                         ]:
612                 sequences.update({n : "update_bootcd_email"})
613
614         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
615                         ]:
616                 sequences.update({n: "suspect_error_email"})
617
618         # update_hardware_email
619         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
620         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
621
622         # broken_hardware_email
623         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
624
625         flag_set = True
626
627         
628         if s not in sequences:
629                 print "   HOST %s" % hostname
630                 print "   UNKNOWN SEQUENCE: %s" % s
631
632                 args = {}
633                 args['hostname'] = hostname
634                 args['sequence'] = s
635                 args['bmlog'] = conn.get_bootmanager_log().read()
636                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
637                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
638                 m.reset()
639                 m.send(['monitor-list@lists.planet-lab.org'])
640
641                 conn.restart_bootmanager('boot')
642
643                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
644                 # This way, we can check it again after we've fixed it.
645                 flag_set = False
646
647         else:
648
649                 if   sequences[s] == "restart_bootmanager_boot":
650                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
651                         conn.restart_bootmanager('boot')
652                 elif sequences[s] == "restart_bootmanager_rins":
653                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
654                         conn.restart_bootmanager('rins')
655                 elif sequences[s] == "restart_node_rins":
656                         conn.restart_node('rins')
657                 elif sequences[s] == "restart_node_boot":
658                         conn.restart_node('boot')
659                 elif sequences[s] == "repair_node_keys":
660                         if conn.compare_and_repair_nodekeys():
661                                 # the keys either are in sync or were forced in sync.
662                                 # so try to reboot the node again.
663                                 conn.restart_bootmanager('boot')
664                         else:
665                                 # there was some failure to synchronize the keys.
666                                 print "...Unable to repair node keys on %s" % node
667
668                 elif sequences[s] == "suspect_error_email":
669                         args = {}
670                         args['hostname'] = hostname
671                         args['sequence'] = s
672                         args['bmlog'] = conn.get_bootmanager_log().read()
673                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
674                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
675                         m.reset()
676                         m.send(['monitor-list@lists.planet-lab.org'])
677
678                         conn.restart_bootmanager('boot')
679
680                 elif sequences[s] == "update_node_config_email":
681                         print "...Sending message to UPDATE NODE CONFIG"
682                         args = {}
683                         args['hostname'] = hostname
684                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
685                                                                 True, db='nodeid_persistmessages')
686                         loginbase = plc.siteId(hostname)
687                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
688                         conn.dump_plconf_file()
689                         conn.set_nodestate('diag')
690
691                 elif sequences[s] == "nodenetwork_email":
692                         print "...Sending message to LOOK AT NODE NETWORK"
693                         args = {}
694                         args['hostname'] = hostname
695                         args['bmlog'] = conn.get_bootmanager_log().read()
696                         m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
697                                                                 True, db='nodenet_persistmessages')
698                         loginbase = plc.siteId(hostname)
699                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
700                         conn.dump_plconf_file()
701                         conn.set_nodestate('diag')
702
703                 elif sequences[s] == "update_bootcd_email":
704                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
705                         import getconf
706                         args = {}
707                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
708                         args['hostname_list'] = "%s" % hostname
709
710                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
711                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
712
713                         loginbase = plc.siteId(hostname)
714                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
715
716                         #print "\tDisabling %s due to out-of-date BOOTCD" % hostname
717                         #conn.set_nodestate('disable')
718
719                 elif sequences[s] == "broken_hardware_email":
720                         # MAKE An ACTION record that this host has failed hardware.  May
721                         # require either an exception "/minhw" or other manual intervention.
722                         # Definitely need to send out some more EMAIL.
723                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
724                         # TODO: email notice of broken hardware
725                         args = {}
726                         args['hostname'] = hostname
727                         args['log'] = conn.get_dmesg().read()
728                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
729                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
730
731                         loginbase = plc.siteId(hostname)
732                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
733                         conn.set_nodestate('disable')
734
735                 elif sequences[s] == "update_hardware_email":
736                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
737                         args = {}
738                         args['hostname'] = hostname
739                         args['bmlog'] = conn.get_bootmanager_log().read()
740                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
741                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
742
743                         loginbase = plc.siteId(hostname)
744                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
745                         conn.set_nodestate('disable')
746
747         if flag_set:
748                 pflags.setRecentFlag(s)
749                 pflags.save() 
750
751         return True
752         
753
754 # MAIN -------------------------------------------------------------------
755
def main():
	"""Command-line entry point.

	Parses options, builds the list of target nodes (either a single
	--node or a --nodelist file), and calls reboot() on each one to try
	to bring it out of debug mode.  Exits with status 1 if no node was
	specified.
	"""
	from config import config
	from optparse import OptionParser
	parser = OptionParser()
	parser.set_defaults(node=None, nodelist=None, child=False, collect=False, nosetup=False, verbose=False, force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	# typo fix: help string previously read "orginary"
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")
	parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
						help="A single node name to try to bring out of debug mode.")
	parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
						help="A list of nodes to bring out of debug mode.")
	config = config(parser)
	config.parse_args()

	# --nodelist takes precedence over --node; with neither there is
	# nothing to do, so show usage and exit non-zero.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	# Process nodes sequentially; reboot() performs the per-node work.
	for node in nodes:
		reboot(node, config)

if __name__ == "__main__":
	main()