a7a99d9811194d235f6bc8fb6948e03bad543bed
[monitor.git] / nodereboot.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6 import plc
7 import auth
8 api = plc.PLC(auth.auth, auth.plc)
9
10 import sys
11 import os
12 import policy
13
14 from getsshkeys import SSHKnownHosts
15
16 import subprocess
17 import time
18 import soltesz
19 from sets import Set
20
21 import ssh.pxssh as pxssh
22 import ssh.fdpexpect as fdpexpect
23 import ssh.pexpect as pexpect
24 from unified_model import *
25 from emailTxt import mailtxt
26
27 import signal
class Sopen(subprocess.Popen):
	"""Popen variant whose kill() delivers a caller-chosen signal.

	The stock Popen of this era had no kill(); this sends SIGTERM by
	default but lets callers pick any signal number.
	"""

	def kill(self, signal = signal.SIGTERM):
		# NOTE: the parameter deliberately shadows the `signal` module;
		# the SIGTERM default was captured at definition time, so the
		# shadowing is harmless inside the body.
		target = self.pid
		os.kill(target, signal)
31
32 #from Rpyc import SocketConnection, Async
33 from Rpyc import SocketConnection, Async
34 from Rpyc.Utils import *
35
36
37 class NodeConnection:
38         def __init__(self, connection, node, config):
39                 self.node = node
40                 self.c = connection
41                 self.config = config
42
43         def get_boot_state(self):
44                 if self.c.modules.os.path.exists('/tmp/source'):
45                         return "dbg"
46                 elif self.c.modules.os.path.exists('/vservers'): 
47                         return "boot"
48                 else:
49                         return "unknown"
50
51         def get_dmesg(self):
52                 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
53                 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
54                 log = open("log/dmesg.%s.log" % self.node, 'r')
55                 return log
56
57         def get_bootmanager_log(self):
58                 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
59                 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
60                 log = open("log/bm.%s.log" % self.node, 'r')
61                 return log
62
63         def dump_plconf_file(self):
64                 c = self.c
65                 c.modules.sys.path.append("/tmp/source/")
66                 c.modules.os.chdir('/tmp/source')
67
68                 log = c.modules.BootManager.log('/tmp/new.log')
69                 bm = c.modules.BootManager.BootManager(log,'boot')
70
71                 BootManagerException = c.modules.Exceptions.BootManagerException
72                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
73                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
74                 bm_continue = True
75
76                 InitializeBootManager.Run(bm.VARS, bm.LOG)
77                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
78                 except Exception, x:
79                         bm_continue = False
80                         print "   ERROR:", x
81                         print "   Possibly, unable to find valid configuration file"
82
83                 if bm_continue and self.config and not self.config.quiet:
84                         for key in bm.VARS.keys():
85                                 print key, " == ", bm.VARS[key]
86                 else:
87                         if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
88                 
89
90         def compare_and_repair_nodekeys(self):
91                 c = self.c
92                 c.modules.sys.path.append("/tmp/source/")
93                 c.modules.os.chdir('/tmp/source')
94
95                 log = c.modules.BootManager.log('/tmp/new.log')
96                 bm = c.modules.BootManager.BootManager(log,'boot')
97
98                 BootManagerException = c.modules.Exceptions.BootManagerException
99                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
100                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
101                 bm_continue = True
102
103                 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
104
105                 InitializeBootManager.Run(bm.VARS, bm.LOG)
106                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
107                 except Exception, x:
108                         bm_continue = False
109                         if not config.quiet: print "exception"
110                         if not config.quiet: print x
111                         print "   Possibly, unable to find valid configuration file"
112
113                 if bm_continue:
114                         print "   NODE: %s" % bm.VARS['NODE_KEY']
115                         print "   PLC : %s" % plcnode['key']
116
117                         if bm.VARS['NODE_KEY'] == plcnode['key']:
118                                 return True
119                         else:
120                                 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
121                                         print "   Successfully updated NODE_KEY with PLC"
122                                         return True
123                                 else:
124                                         return False
125                                 
126                         #for key in bm.VARS.keys():
127                         #       print key, " == ", bm.VARS[key]
128                 else:
129                         print "   Unable to retrieve NODE_KEY"
130
131         def bootmanager_running(self):
132                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
133                         return True
134                 else:
135                         return False
136
137         def restart_node(self, state='boot'):
138                 api.UpdateNode(self.node, {'boot_state' : state})
139
140                 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
141                 if not pflags.getRecentFlag('gentlekill'):
142                         print "   Killing all slice processes... : %s" %  self.node
143                         cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
144                         self.c.modules.os.system(cmd_slicekill)
145                         cmd = """ shutdown -r +1 & """
146                         print "   Restarting %s : %s" % ( self.node, cmd)
147                         self.c.modules.os.system(cmd)
148
149                         pflags.setRecentFlag('gentlekill')
150                         pflags.save()
151                 else:
152                         print "   Restarting with sysrq 'sub' %s" % self.node
153                         cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
154                         self.c.modules.os.system(cmd)
155
156                 return
157
158         def restart_bootmanager(self, forceState):
159
160                 self.c.modules.os.chdir('/tmp/source')
161                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
162                         print "   BootManager is already running: try again soon..."
163                 else:
164                         print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
165                         cmd = "( touch /tmp/BM_RUNNING ;  " + \
166                               "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
167                                   "  rm -f /tmp/BM_RUNNING " + \
168                                   ") &" 
169                         cmd = cmd % forceState
170                         self.c.modules.os.system(cmd)
171
172                 return 
173
174
175 import random
class PlanetLabSession:
	"""Establishes an Rpyc server on a node plus a local ssh tunnel to it.

	setup_host() copies the Rpyc tree to the node, (re)starts a forking
	Rpyc server there, and opens an ssh port-forward from a local port to
	the node's Rpyc port (18812).  get_connection() then hands back a
	NodeConnection over that tunnel.
	"""

	# Base local port for tunnels; randomized so concurrent monitor runs
	# are unlikely to collide, then incremented per session.
	globalport = 22000 + int(random.random()*1000)

	def __init__(self, node, nosetup, verbose):
		# node    : hostname to reach
		# nosetup : skip the rsync/server/tunnel setup (assume it exists)
		# verbose : echo each shell command before running it
		self.verbose = verbose
		self.node = node
		self.port = None
		self.nosetup = nosetup
		self.command = None   # Sopen handle on the ssh tunnel process
		self.setup_host()

	def get_connection(self, config):
		"""Return a NodeConnection speaking Rpyc through the local tunnel."""
		return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
	
	def setup_host(self):
		"""Copy Rpyc to the node, restart its Rpyc server, and open the tunnel.

		Raises Exception when ssh login fails twice (even after refreshing
		the known_hosts entry) or when the tunnel cannot be established.
		"""
		# Claim a unique local port for this session's tunnel.
		self.port = PlanetLabSession.globalport
		PlanetLabSession.globalport = PlanetLabSession.globalport + 1

		args = {}
		args['port'] = self.port
		args['user'] = 'root'
		args['hostname'] = self.node
		args['monitordir'] = "/home/soltesz/monitor"

		if self.nosetup:
			print "Skipping setup"
			return 

		# COPY Rpyc files to host
		cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
		if self.verbose: print cmd
		# TODO: Add timeout
		timeout = 120
		localos = soltesz.CMD()

		ret = localos.system(cmd, timeout)
		print ret
		if ret != 0:
			# Likely a stale/unknown host key (node reinstalled): refresh
			# the known_hosts entry and retry once.
			print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
			#print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
			k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
			ret = localos.system(cmd, timeout)
			print ret
			if ret != 0:
				print "\tFAILED TWICE"
				#sys.exit(1)
				raise Exception("Failed twice trying to login with updated ssh host key")

		t1 = time.time()
		# KILL any already running servers.
		cmd = """ssh %(user)s@%(hostname)s """ + \
			 """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
		cmd = cmd % args
		if self.verbose: print cmd
		# TODO: Add timeout
		print localos.system(cmd,timeout)

		# START a new rpyc server.
		cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
			 """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
		cmd = cmd % args
		if self.verbose: print cmd
		print localos.system(cmd,timeout)

		# TODO: Add timeout
		# This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
		# and the following options seems to work well.
		# LocalCommand prints READY on our side once the forward is up;
		# ExitOnForwardFailure makes a failed forward kill the process
		# instead of hanging silently.
		cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
			  """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
			  """-o ConnectTimeout=120 """ + \
			  """-n -N -L %(port)s:localhost:18812 """ + \
			  """%(user)s@%(hostname)s"""
		cmd = cmd % args
		if self.verbose: print cmd
		self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
		# TODO: the read() here may block indefinitely.  Need a better
		# approach therefore, that includes a timeout.
		ret = self.command.stdout.read(5)

		t2 = time.time()
		if 'READY' in ret:
			# NOTE: There is still a slight race for machines that are slow...
			# Give the remote Rpyc server twice the observed setup time
			# before declaring the session usable.
			self.timeout = 2*(t2-t1)
			print "Sleeping for %s sec" % self.timeout
			time.sleep(self.timeout)
			return

		if self.command.returncode is not None:
			print "Failed to establish tunnel!"
			raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

		raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

	def __del__(self):
		# Tear down the ssh tunnel process when the session is collected.
		if self.command:
			if self.verbose: print "Killing SSH session %s" % self.port
			self.command.kill()
273
274
def steps_to_list(steps):
	"""Return just the labels from a list of (id, label) step pairs,
	preserving order (used to feed pexpect's pattern list)."""
	return [label for (_step_id, label) in steps]
280
def index_to_id(steps, index):
	"""Map a pexpect match index back to its step id; an index past the
	end of steps means the EOF sentinel matched, reported as "done"."""
	return steps[index][0] if index < len(steps) else "done"
286
287 def reboot(hostname, config=None, forced_action=None):
288
289         node = hostname
290         print "Creating session for %s" % node
291         # update known_hosts file (in case the node has rebooted since last run)
292         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
293         try:
294                 k = SSHKnownHosts(); k.update(node); k.write(); del k
295         except:
296                 import traceback; print traceback.print_exc()
297                 return False
298
299         try:
300                 if config == None:
301                         session = PlanetLabSession(node, False, True)
302                 else:
303                         session = PlanetLabSession(node, config.nosetup, config.verbose)
304         except Exception, e:
305                 print "ERROR setting up session for %s" % hostname
306                 import traceback; print traceback.print_exc()
307                 print e
308                 return False
309
310         try:
311                 conn = session.get_connection(config)
312         except EOFError:
313                 # NOTE: sometimes the wait in setup_host() is not long enough.  
314                 # So, here we try to wait a little longer before giving up entirely.
315                 try:
316                         time.sleep(session.timeout*4)
317                         conn = session.get_connection(config)
318                 except:
319                         import traceback; print traceback.print_exc()
320                         return False
321                         
322
323         if forced_action == "reboot":
324                 conn.restart_node('rins')
325                 return True
326
327         boot_state = conn.get_boot_state()
328         if boot_state == "boot":
329                 print "...Boot state of %s already completed : skipping..." % node
330                 return True
331         elif boot_state == "unknown":
332                 print "...Unknown bootstate for %s : skipping..."% node
333                 return False
334         else:
335                 pass
336
337         if conn.bootmanager_running():
338                 print "...BootManager is currently running.  Skipping host %s" % node
339                 return True
340
341         #if config != None:
342         #       if config.force:
343         #               conn.restart_bootmanager(config.force)
344         #               return True
345
346         # Read persistent flags, tagged on one week intervals.
347         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
348                 
349
350         if config and not config.quiet: print "...downloading dmesg from %s" % node
351         dmesg = conn.get_dmesg()
352         child = fdpexpect.fdspawn(dmesg)
353
354         sequence = []
355         while True:
356                 steps = [
357                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
358                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
359                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
360                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
361                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
362                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
363                         ('floppytimeout','floppy0: floppy timeout called'),
364                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
365
366                         # floppy0: floppy timeout called
367                         # end_request: I/O error, dev fd0, sector 0
368
369                         #Buffer I/O error on device dm-2, logical block 8888896
370                         #ata1: status=0x51 { DriveReady SeekComplete Error }
371                         #ata1: error=0x40 { UncorrectableError }
372                         #SCSI error : <0 0 0 0> return code = 0x8000002
373                         #sda: Current: sense key: Medium Error
374                         #       Additional sense: Unrecovered read error - auto reallocate failed
375
376                         #SCSI error : <0 2 0 0> return code = 0x40001
377                         #end_request: I/O error, dev sda, sector 572489600
378                 ]
379                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
380                 sequence.append(id)
381
382                 if id == "done":
383                         break
384
385         s = Set(sequence)
386         if config and not config.quiet: print "\tSET: ", s
387
388         if len(s) > 1:
389                 print "...Potential drive errors on %s" % node
390                 if len(s) == 2 and 'floppyerror' in s:
391                         print "...Should investigate.  Continuing with node."
392                 else:
393                         print "...Should investigate.  Skipping node."
394                         # TODO: send message related to these errors.
395                         args = {}
396                         args['hostname'] = hostname
397                         args['log'] = conn.get_dmesg().read()
398
399                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
400                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
401
402                         loginbase = plc.siteId(hostname)
403                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
404                         return False
405
406         print "...Downloading bm.log from %s" % node
407         log = conn.get_bootmanager_log()
408         child = fdpexpect.fdspawn(log)
409
410         time.sleep(1)
411
412         if config and not config.quiet: print "...Scanning bm.log for errors"
413         action_id = "dbg"
414         sequence = []
415         while True:
416
417                 steps = [
418                         ('bminit'               , 'Initializing the BootManager.'),
419                         ('cfg'                  , 'Reading node configuration file.'),
420                         ('auth'                 , 'Authenticating node with PLC.'),
421                         ('getplc'               , 'Retrieving details of node from PLC.'),
422                         ('update'               , 'Updating node boot state at PLC.'),
423                         ('hardware'             , 'Checking if hardware requirements met.'),
424                         ('installinit'  , 'Install: Initializing.'),
425                         ('installdisk'  , 'Install: partitioning disks.'),
426                         ('installbootfs', 'Install: bootstrapfs tarball.'),
427                         ('installcfg'   , 'Install: Writing configuration files.'),
428                         ('installstop'  , 'Install: Shutting down installer.'),
429                         ('update2'              , 'Updating node boot state at PLC.'),
430                         ('installinit2' , 'Install: Initializing.'),
431                         ('validate'             , 'Validating node installation.'),
432                         ('rebuildinitrd', 'Rebuilding initrd'),
433                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
434                         ('update3'              , 'Updating node configuration.'),
435                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
436                         ('update4'              , 'Sending hardware configuration to PLC.'),
437                         ('debug'                , 'Starting debug mode'),
438                         ('bmexceptmount', 'BootManagerException during mount'),
439                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
440                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
441                         ('exception'    , 'Exception'),
442                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
443                         ('protoerror'   , 'XML RPC protocol error'),
444                         ('nodehostname' , 'Configured node hostname does not resolve'),
445                         ('implementerror', 'Implementation Error'),
446                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
447                         ('noinstall'    , 'notinstalled'),
448                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
449                         ('noblockdev'   , "No block devices detected."),
450                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
451                         ('hardwarefail' , 'Hardware requirements not met'),
452                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
453                         ('modulefail'   , 'Unable to get list of system modules'),
454                         ('writeerror'   , 'write error: No space left on device'),
455                         ('nospace'      , "No space left on device"),
456                         ('nonode'       , 'Failed to authenticate call: No such node'),
457                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
458                         ('bootcheckfail'     , 'BootCheckAuthentication'),
459                         ('bootupdatefail'   , 'BootUpdateNode'),
460                 ]
461                 list = steps_to_list(steps)
462                 index = child.expect( list + [ pexpect.EOF ])
463                 id = index_to_id(steps,index)
464                 sequence.append(id)
465
466                 if id == "exception":
467                         if config and not config.quiet: print "...Found An Exception!!!"
468                 elif index == len(list):
469                         #print "Reached EOF"
470                         break
471                 
472         s = "-".join(sequence)
473         print "   FOUND SEQUENCE: ", s
474
475         # NOTE: We get or set the flag based on the current sequence identifier.
476         #  By using the sequence identifier, we guarantee that there will be no
477         #  frequent loops.  I'm guessing there is a better way to track loops,
478         #  though.
479         if not config.force and ( pflags.getFlag(s) or pflags.isRecent() ):
480                 pflags.resetFlag(s)
481                 pflags.setRecent()
482                 pflags.save() 
483                 print "... flag is set or it has already run recently. Skipping %s" % node
484                 return True
485
486         sequences = {}
487
488
489         # restart_bootmanager_boot
490         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
491                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
492                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
493                         "bminit-cfg-auth-getplc-update-debug-done",
494                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
495                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
496                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
497                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
498                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
499                         ]:
500                 sequences.update({n : "restart_bootmanager_boot"})
501
502         #       conn.restart_bootmanager('rins')
503         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
504                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
505                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
506                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
507                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
508                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
509                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
510                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
511                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
512                         ]:
513                 sequences.update({n : "restart_bootmanager_rins"})
514
515         # repair_node_keys
516         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
517
518         #   conn.restart_node('rins')
519         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
520                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
521                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
522                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
523                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
524                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
525                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
526                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
527                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
528                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
529                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
530                         ]:
531                 sequences.update({n : "restart_node_rins"})
532
533         #       restart_node_boot
534         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
535                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
536                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
537                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
538                          ]:
539                 sequences.update({n: "restart_node_boot"})
540
541         # update_node_config_email
542         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
543                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
544                         "bminit-cfg-exception-nodehostname-update-debug-done",
545                         ]:
546                 sequences.update({n : "update_node_config_email"})
547
548         # update_bootcd_email
549         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarefail-update-debug-done",
550                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarefail-update-debug-done",
551                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarefail-update-debug-done",
552                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarefail-update-debug-done",
553                         "bminit-cfg-auth-getplc-hardware-exception-hardwarefail-update-debug-done",
554                         ]:
555                 sequences.update({n : "update_bootcd_email"})
556
557         # update_hardware_email
558         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarefail-update-debug-done" : "update_hardware_email"})
559
560         # broken_hardware_email
561         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarefail-update-debug-done" : "broken_hardware_email"})
562
563         
564         if s not in sequences:
565                 print "   HOST %s" % hostname
566                 print "   UNKNOWN SEQUENCE: %s" % s
567
568                 args = {}
569                 args['hostname'] = hostname
570                 args['sequence'] = s
571                 args['bmlog'] = conn.get_bootmanager_log().read()
572                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
573                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
574                 m.reset()
575                 m.send(['monitor-list@lists.planet-lab.org'])
576
577                 conn.restart_bootmanager('boot')
578
579         else:
580
581                 if   sequences[s] == "restart_bootmanager_boot":
582                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
583                         conn.restart_bootmanager('boot')
584                 elif sequences[s] == "restart_bootmanager_rins":
585                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
586                         conn.restart_bootmanager('rins')
587                 elif sequences[s] == "restart_node_rins":
588                         conn.restart_node('rins')
589                 elif sequences[s] == "restart_node_boot":
590                         conn.restart_node('boot')
591                 elif sequences[s] == "repair_node_keys":
592                         if conn.compare_and_repair_nodekeys():
593                                 # the keys either are in sync or were forced in sync.
594                                 # so try to reboot the node again.
595                                 conn.restart_bootmanager('boot')
596                         else:
597                                 # there was some failure to synchronize the keys.
598                                 print "...Unable to repair node keys on %s" % node
599                 elif sequences[s] == "update_node_config_email":
600                         print "...Sending message to UPDATE NODE CONFIG"
601                         args = {}
602                         args['hostname'] = hostname
603                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
604                                                                 True, db='nodeid_persistmessages')
605                         loginbase = plc.siteId(hostname)
606                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
607                         conn.dump_plconf_file()
608
609                 elif sequences[s] == "update_bootcd_email":
610                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
611                         import getconf
612                         args = {}
613                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
614                         args['hostname_list'] = "%s" % hostname
615
616                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
617                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
618
619                         loginbase = plc.siteId(hostname)
620                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
621
622                 elif sequences[s] == "broken_hardware_email":
623                         # MAKE An ACTION record that this host has failed hardware.  May
624                         # require either an exception "/minhw" or other manual intervention.
625                         # Definitely need to send out some more EMAIL.
626                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
627                         # TODO: email notice of broken hardware
628                         args = {}
629                         args['hostname'] = hostname
630                         args['log'] = conn.get_dmesg().read()
631                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
632                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
633
634                         loginbase = plc.siteId(hostname)
635                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
636
637                 elif sequences[s] == "update_hardware_email":
638                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
639                         args = {}
640                         args['hostname'] = hostname
641                         args['bmlog'] = conn.get_bootmanager_log().read()
642                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
643                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
644
645                         loginbase = plc.siteId(hostname)
646                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
647
648         pflags.setFlag(s)
649         pflags.save() 
650
651         return True
652         
653
654 # MAIN -------------------------------------------------------------------
655
def main():
	"""Command-line entry point.

	Parses options, resolves the target node set (either a single
	--node or a --nodelist file), and calls reboot() on each node.
	Exits with status 1 (after printing usage) when no node source
	is given.
	"""
	from config import config
	from optparse import OptionParser
	parser = OptionParser()
	# Defaults must cover every dest below so config() sees all keys.
	parser.set_defaults(node=None, nodelist=None, child=False, nosetup=False, verbose=False, force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")
	parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
						help="A single node name to try to bring out of debug mode.")
	parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
						help="A list of nodes to bring out of debug mode.")
	# Wrap optparse in the project's config object; parse_args() fills it in.
	config = config(parser)
	config.parse_args()

	# --nodelist takes precedence over --node when both are supplied.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		# No node source given: show usage and fail.
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		reboot(node, config)
688
689 if __name__ == "__main__":
690         main()