e8dc7b89dae9d55d71115a520c8587e7a980ec5c
[monitor.git] / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import plc
6 api = plc.getAuthAPI()
7
8 import sys
9 import os
10 import const
11
12 from getsshkeys import SSHKnownHosts
13
14 import subprocess
15 import time
16 import database
17 import moncommands
18 from sets import Set
19
20 import ssh.pxssh as pxssh
21 import ssh.fdpexpect as fdpexpect
22 import ssh.pexpect as pexpect
23 from unified_model import *
24 from emailTxt import mailtxt
25 from nodeconfig import network_config_to_str
26 import traceback
27 import config
28
29 import signal
class Sopen(subprocess.Popen):
        """subprocess.Popen whose kill() sends a caller-chosen signal.

        The default signal is SIGTERM.
        """
        def kill(self, signal = signal.SIGTERM):
                """Send `signal` to the child process.

                Nothing is sent once the child has been reaped (returncode is
                set): the PID no longer belongs to us, so os.kill() would
                either raise OSError(ESRCH) or hit an unrelated process that
                happens to have reused the PID.
                """
                # NOTE: the parameter name shadows the `signal` module inside
                # this method; it is kept for backward compatibility with
                # keyword callers.
                if self.returncode is not None:
                        return
                os.kill(self.pid, signal)
33
34 #from Rpyc import SocketConnection, Async
35 from Rpyc import SocketConnection, Async
36 from Rpyc.Utils import *
37 fb = None
38
class NodeConnection:
        """Operations performed on one node through a remote connection.

        `connection` must expose the remote interpreter's modules as
        `connection.modules.<name>` (the callers in this file pass an Rpyc
        SocketConnection).  `node` is the node's hostname and `config` is an
        options object (or None) whose `quiet` attribute silences the
        informational output.
        """
        def __init__(self, connection, node, config):
                self.node = node
                self.c = connection
                self.config = config

        def get_boot_state(self):
                """Classify the node by which marker directory exists on it.

                Returns "dbg" if /tmp/source exists, "boot" if /vservers
                exists, and "unknown" otherwise.
                """
                remote_exists = self.c.modules.os.path.exists
                if remote_exists('/tmp/source'):
                        return "dbg"
                if remote_exists('/vservers'):
                        return "boot"
                return "unknown"

        def get_dmesg(self):
                """Fetch the node's dmesg output and return it as an open file.

                The output is written to /var/log/dmesg.bm.log on the node,
                copied to log/dmesg.<node>.log locally and opened for reading.
                The caller owns (and should close) the returned file object.
                """
                local_path = "log/dmesg.%s.log" % self.node
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
                # `download` is not defined in this file; presumably it comes
                # from the `from Rpyc.Utils import *` at the top -- verify.
                download(self.c, "/var/log/dmesg.bm.log", local_path)
                return open(local_path, 'r')

        def get_bootmanager_log(self):
                """Fetch the node's /tmp/bm.log and return it as an open file.

                The remote file is stored as log/bm.<node>.log.gz, expanded
                with zcat into log/bm.<node>.log, and that file is opened for
                reading.  The caller owns (and should close) the file object.
                """
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
                os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
                return open("log/bm.%s.log" % self.node, 'r')

        def _read_node_config(self):
                """Run BootManager's init and config-read steps on the node.

                Returns a (bm, error) pair: `bm` is the remote BootManager
                object and `error` is the exception raised by
                ReadNodeConfiguration.Run, or None when it succeeded.
                """
                remote = self.c.modules
                remote.sys.path.append("/tmp/source/")
                remote.os.chdir('/tmp/source')

                log = remote.BootManager.log('/tmp/new.log')
                bm = remote.BootManager.BootManager(log, 'boot')

                # Resolve both steps before running anything so that a missing
                # remote attribute still propagates instead of being reported
                # as a configuration problem.
                initialize = remote.BootManager.InitializeBootManager
                read_config = remote.BootManager.ReadNodeConfiguration

                initialize.Run(bm.VARS, bm.LOG)
                try:
                        read_config.Run(bm.VARS, bm.LOG)
                except Exception:
                        return (bm, sys.exc_info()[1])
                return (bm, None)

        def dump_plconf_file(self):
                """Print every BootManager variable read from the node config.

                The variables are printed only when a config object was given
                and it is not quiet.  A failure to read the configuration is
                always reported.  Returns None.
                """
                (bm, error) = self._read_node_config()
                verbose = self.config and not self.config.quiet

                if error is not None:
                        print("   ERROR: %s" % (error,))
                        print("   Possibly, unable to find valid configuration file")
                        if verbose: print("   Unable to read Node Configuration")
                        return

                if verbose:
                        for key in bm.VARS.keys():
                                print("%s  ==  %s" % (key, bm.VARS[key]))

        def compare_and_repair_nodekeys(self):
                """Make PLC's key for this node match the node's own NODE_KEY.

                Returns True when the keys already match or PLC was updated
                successfully, and False when the update failed or the node
                configuration (and therefore NODE_KEY) could not be read.
                """
                (bm, error) = self._read_node_config()
                if error is not None:
                        print("exception")
                        print(error)
                        print("   Possibly, unable to find valid configuration file")
                        print("   Unable to retrieve NODE_KEY")
                        # Explicit failure value; this path used to fall off
                        # the end of the method and return None.
                        return False

                # PLC is only queried once there is a key to compare against.
                plcnode = api.GetNodes({'hostname': self.node}, None)[0]
                node_key = bm.VARS['NODE_KEY']

                print("   NODE: %s" % node_key)
                print("   PLC : %s" % plcnode['key'])

                if node_key == plcnode['key']:
                        return True

                if api.UpdateNode(self.node, {'key': node_key}):
                        print("   Successfully updated NODE_KEY with PLC")
                        return True
                return False

        def bootmanager_running(self):
                """Return True if /tmp/BM_RUNNING exists on the node."""
                return bool(self.c.modules.os.path.exists('/tmp/BM_RUNNING'))

        def set_nodestate(self, state='boot'):
                """Set the node's boot_state at PLC; returns api.UpdateNode's result."""
                return api.UpdateNode(self.node, {'boot_state' : state})

        def restart_node(self, state='boot'):
                """Set the node's boot_state at PLC, then reboot the node.

                If no 'gentlekill' flag was recorded recently, all slice
                processes are killed, `shutdown -r +1` is issued and the flag
                is stored.  Otherwise the node is rebooted through sysrq
                (sync, remount read-only, reboot).
                """
                api.UpdateNode(self.node, {'boot_state' : state})

                # 1*60*60*24 is one day in seconds; presumably the period for
                # which a flag counts as "recent" -- confirm against PersistFlags.
                pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
                if pflags.getRecentFlag('gentlekill'):
                        print("   Restarting with sysrq 'sub' %s" % self.node)
                        cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
                        self.c.modules.os.system(cmd)
                        return

                print("   Killing all slice processes... : %s" %  self.node)
                cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
                self.c.modules.os.system(cmd_slicekill)
                cmd = """ shutdown -r +1 & """
                print("   Restarting %s : %s" % ( self.node, cmd))
                self.c.modules.os.system(cmd)

                pflags.setRecentFlag('gentlekill')
                pflags.save()
                return

        def restart_bootmanager(self, forceState):
                """Start `BootManager.py <forceState>` on the node in the background.

                Nothing is started while /tmp/BM_RUNNING exists.  The launched
                shell creates that file first and removes it when BootManager
                exits.
                """
                self.c.modules.os.chdir('/tmp/source')
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        print("   BootManager is already running: try again soon...")
                        return

                print("   Starting 'BootManager.py %s' on %s " % (forceState, self.node))
                cmd = ("( touch /tmp/BM_RUNNING ;  " +
                       "  python ./BootManager.py %s &> server.log < /dev/null ; " +
                       "  rm -f /tmp/BM_RUNNING " +
                       ") &")
                self.c.modules.os.system(cmd % forceState)
                return
176
177                 return 
178
179
180 import random
class PlanetLabSession:
        """An SSH tunnel from a local port to the Rpyc server on one node.

        Constructing a session copies the Rpyc package to the node, restarts
        the Rpyc forking server there and opens `ssh -L` from a local port to
        the node's port 18812.  With `nosetup` true only the local port is
        reserved.  The tunnel process is killed when the session is
        garbage-collected.
        """
        # Next local port to hand out; starts somewhere in 22000..22999 and is
        # incremented for every session created in this process.
        globalport = 22000 + int(random.random()*1000)

        def __init__(self, node, nosetup, verbose):
                self.verbose = verbose
                self.node = node
                self.port = None
                self.nosetup = nosetup
                self.command = None
                # setup_host() only measures a settle time when the tunnel
                # reports READY; default it so that code reading
                # `session.timeout` (e.g. the retry in reboot()) never hits an
                # AttributeError on the nosetup path.
                self.timeout = 0
                self.setup_host()

        def get_connection(self, config):
                """Return a NodeConnection that talks through this session's local port."""
                return NodeConnection(SocketConnection("localhost", self.port), self.node, config)

        def setup_host(self):
                """Reserve a local port and, unless nosetup is set, build the tunnel.

                Raises Exception if the rsync of the Rpyc files fails twice
                (the second attempt follows a refresh of the node's known-hosts
                entry), or if the ssh tunnel does not report READY.
                """
                self.port = PlanetLabSession.globalport
                PlanetLabSession.globalport = PlanetLabSession.globalport + 1

                if self.nosetup:
                        print("Skipping setup")
                        return

                args = {}
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
                args['monitordir'] = config.MONITOR_SCRIPT_ROOT
                ssh_port = 22

                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print(cmd)
                timeout = 120
                localos = moncommands.CMD()

                ret = localos.system(cmd, timeout)
                print(ret)
                if ret != 0:
                        print("\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node)
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
                        ret = localos.system(cmd, timeout)
                        print(ret)
                        if ret != 0:
                                print("\tFAILED TWICE")
                                raise Exception("Failed twice trying to login with updated ssh host key")

                t1 = time.time()
                # KILL any already running servers, then start a fresh one.
                # The script is passed as a here-document; `\\EOF` yields the
                # literal text `\EOF` at runtime.
                ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
                (ov,ev) = ssh.run_noexcept2("""<<\\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
                print(ssh.ret)

                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
                # and the following options seems to work well.
                cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
                          """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
                          """-o ConnectTimeout=120 """ + \
                          """-n -N -L %(port)s:localhost:18812 """ + \
                          """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print(cmd)
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # A plain read() on the pipe could block; read_t is called with
                # 5, presumably a timeout in seconds -- confirm in moncommands.
                ret = moncommands.read_t(self.command.stdout, 5)

                t2 = time.time()
                if 'READY' in ret:
                        # NOTE: There is still a slight race for machines that are slow...
                        self.timeout = 2*(t2-t1)
                        print("Sleeping for %s sec" % self.timeout)
                        time.sleep(self.timeout)
                        return

                # returncode is only refreshed by poll()/wait(); without this
                # call it is always None here and the branch below can never
                # be taken.
                self.command.poll()
                if self.command.returncode is not None:
                        print("Failed to establish tunnel!")
                        raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

                raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        def __del__(self):
                # getattr: __del__ can run on an object whose __init__ did not
                # get as far as assigning these attributes.
                command = getattr(self, 'command', None)
                if command:
                        if getattr(self, 'verbose', False): print("Killing SSH session %s" % self.port)
                        try:
                                command.kill()
                        except OSError:
                                # Best-effort cleanup: the tunnel process is
                                # already gone.
                                pass
290                         if self.verbose: print "Killing SSH session %s" % self.port
291                         self.command.kill()
292
293
def steps_to_list(steps):
        """Return the labels (second items) of a sequence of (id, label) pairs, in order."""
        return [label for (_step_id, label) in steps]
299
def index_to_id(steps,index):
        """Map a match index back to its step id.

        Returns the id (first item) of steps[index], or "done" when the index
        lies at or beyond the end of `steps`.
        """
        if index >= len(steps):
                return "done"
        return steps[index][0]
305
306 def reboot(hostname, config=None, forced_action=None):
307
308         # NOTE: Nothing works if the bootcd is REALLY old.
309         #       So, this is the first step.
310         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
311         if fbnode['category'] == "OLDBOOTCD":
312                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
313                 args = {}
314                 args['hostname_list'] = "    %s" % hostname
315
316                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
317                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
318
319                 loginbase = plc.siteId(hostname)
320                 m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
321
322                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
323                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
324                 return True
325
326         node = hostname
327         print "Creating session for %s" % node
328         # update known_hosts file (in case the node has rebooted since last run)
329         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
330         try:
331                 k = SSHKnownHosts(); k.update(node); k.write(); del k
332         except:
333                 print traceback.print_exc()
334                 return False
335
336         try:
337                 if config == None:
338                         session = PlanetLabSession(node, False, True)
339                 else:
340                         session = PlanetLabSession(node, config.nosetup, config.verbose)
341         except Exception, e:
342                 print "ERROR setting up session for %s" % hostname
343                 print traceback.print_exc()
344                 print e
345                 return False
346
347         try:
348                 conn = session.get_connection(config)
349         except EOFError:
350                 # NOTE: sometimes the wait in setup_host() is not long enough.  
351                 # So, here we try to wait a little longer before giving up entirely.
352                 try:
353                         time.sleep(session.timeout*4)
354                         conn = session.get_connection(config)
355                 except:
356                         print traceback.print_exc()
357                         return False
358
359         if forced_action == "reboot":
360                 conn.restart_node('rins')
361                 return True
362
363         boot_state = conn.get_boot_state()
364         if boot_state == "boot":
365                 print "...Boot state of %s already completed : skipping..." % node
366                 return True
367         elif boot_state == "unknown":
368                 print "...Unknown bootstate for %s : skipping..."% node
369                 return False
370         else:
371                 pass
372
373         if conn.bootmanager_running():
374                 print "...BootManager is currently running.  Skipping host %s" % node
375                 return True
376
377         #if config != None:
378         #       if config.force:
379         #               conn.restart_bootmanager(config.force)
380         #               return True
381
382         # Read persistent flags, tagged on one week intervals.
383         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
384                 
385
386         if config and not config.quiet: print "...downloading dmesg from %s" % node
387         dmesg = conn.get_dmesg()
388         child = fdpexpect.fdspawn(dmesg)
389
390         sequence = []
391         while True:
392                 steps = [
393                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
394                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
395                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
396
397                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
398
399                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
400                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
401
402                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
403                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
404
405                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
406                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
407
408                         ('floppytimeout','floppy0: floppy timeout called'),
409                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
410
411                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
412                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
413
414                         # floppy0: floppy timeout called
415                         # end_request: I/O error, dev fd0, sector 0
416
417                         # Buffer I/O error on device dm-2, logical block 8888896
418                         # ata1: status=0x51 { DriveReady SeekComplete Error }
419                         # ata1: error=0x40 { UncorrectableError }
420                         # SCSI error : <0 0 0 0> return code = 0x8000002
421                         # sda: Current: sense key: Medium Error
422                         #       Additional sense: Unrecovered read error - auto reallocate failed
423
424                         # SCSI error : <0 2 0 0> return code = 0x40001
425                         # end_request: I/O error, dev sda, sector 572489600
426                 ]
427                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
428                 sequence.append(id)
429
430                 if id == "done":
431                         break
432
433         s = Set(sequence)
434         if config and not config.quiet: print "\tSET: ", s
435
436         if len(s) > 1:
437                 print "...Potential drive errors on %s" % node
438                 if len(s) == 2 and 'floppyerror' in s:
439                         print "...Should investigate.  Continuing with node."
440                 else:
441                         print "...Should investigate.  Skipping node."
442                         # TODO: send message related to these errors.
443                         args = {}
444                         args['hostname'] = hostname
445                         args['log'] = conn.get_dmesg().read()
446
447                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
448                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
449
450                         loginbase = plc.siteId(hostname)
451                         m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
452                         conn.set_nodestate('disable')
453                         return False
454
455         print "...Downloading bm.log from %s" % node
456         log = conn.get_bootmanager_log()
457         child = fdpexpect.fdspawn(log)
458
459         try:
460                 if config.collect: return True
461         except:
462                 pass
463
464         time.sleep(1)
465
466         if config and not config.quiet: print "...Scanning bm.log for errors"
467         action_id = "dbg"
468         sequence = []
469         while True:
470
471                 steps = [
472                         ('bminit'               , 'Initializing the BootManager.'),
473                         ('cfg'                  , 'Reading node configuration file.'),
474                         ('auth'                 , 'Authenticating node with PLC.'),
475                         ('getplc'               , 'Retrieving details of node from PLC.'),
476                         ('update'               , 'Updating node boot state at PLC.'),
477                         ('hardware'             , 'Checking if hardware requirements met.'),
478                         ('installinit'  , 'Install: Initializing.'),
479                         ('installdisk'  , 'Install: partitioning disks.'),
480                         ('installbootfs', 'Install: bootstrapfs tarball.'),
481                         ('installcfg'   , 'Install: Writing configuration files.'),
482                         ('installstop'  , 'Install: Shutting down installer.'),
483                         ('update2'              , 'Updating node boot state at PLC.'),
484                         ('installinit2' , 'Install: Initializing.'),
485                         ('validate'             , 'Validating node installation.'),
486                         ('rebuildinitrd', 'Rebuilding initrd'),
487                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
488                         ('update3'              , 'Updating node configuration.'),
489                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
490                         ('update4'              , 'Sending hardware configuration to PLC.'),
491                         ('debug'                , 'Starting debug mode'),
492                         ('bmexceptmount', 'BootManagerException during mount'),
493                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
494                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
495                         ('exception'    , 'Exception'),
496                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
497                         ('protoerror'   , 'XML RPC protocol error'),
498                         ('nodehostname' , 'Configured node hostname does not resolve'),
499                         ('implementerror', 'Implementation Error'),
500                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
501                         ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
502                         ('noinstall'    , 'notinstalled'),
503                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
504                         ('noblockdev'   , "No block devices detected."),
505                         ('dnserror'     , 'Name or service not known'),
506                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
507                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
508                         ('hardwarerequirefail' , 'Hardware requirements not met'),
509                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
510                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
511                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
512                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
513                         ('modulefail'   , 'Unable to get list of system modules'),
514                         ('writeerror'   , 'write error: No space left on device'),
515                         ('nospace'      , "No space left on device"),
516                         ('nonode'       , 'Failed to authenticate call: No such node'),
517                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
518                         ('bootcheckfail'     , 'BootCheckAuthentication'),
519                         ('bootupdatefail'   , 'BootUpdateNode'),
520                 ]
521                 list = steps_to_list(steps)
522                 index = child.expect( list + [ pexpect.EOF ])
523                 id = index_to_id(steps,index)
524                 sequence.append(id)
525
526                 if id == "exception":
527                         if config and not config.quiet: print "...Found An Exception!!!"
528                 elif index == len(list):
529                         #print "Reached EOF"
530                         break
531                 
532         s = "-".join(sequence)
533         print "   FOUND SEQUENCE: ", s
534
535         # NOTE: We get or set the flag based on the current sequence identifier.
536         #  By using the sequence identifier, we guarantee that there will be no
537         #  frequent loops.  I'm guessing there is a better way to track loops,
538         #  though.
539         #if not config.force and pflags.getRecentFlag(s):
540         #       pflags.setRecentFlag(s)
541         #       pflags.save() 
542         #       print "... flag is set or it has already run recently. Skipping %s" % node
543         #       return True
544
545         sequences = {}
546
547
548         # restart_bootmanager_boot
549         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
550                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
551                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
552
553                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
554
555                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
556                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
557                         "bminit-cfg-auth-getplc-update-debug-done",
558                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
559                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
560                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
561                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
562                         "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
563                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
564                         ]:
565                 sequences.update({n : "restart_bootmanager_boot"})
566
567         #       conn.restart_bootmanager('rins')
568         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
569                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
570                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
571                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
572                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
573                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
574                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
575                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
576                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
577                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
578                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
579                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
580                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
581                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
582                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
583                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
584                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
585                         ]:
586                 sequences.update({n : "restart_bootmanager_rins"})
587
588         # repair_node_keys
589         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
590
591         #   conn.restart_node('rins')
592         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
593                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
594                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
595                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
596                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
597                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
598                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
599                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
600                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
601                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
602                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
603                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
604                         ]:
605                 sequences.update({n : "restart_node_rins"})
606
607         #       restart_node_boot
608         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
609                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
610                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
611                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
612                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
613                          ]:
614                 sequences.update({n: "restart_node_boot"})
615
616         # update_node_config_email
617         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
618                           "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
619                           "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
620                         ]:
621                 sequences.update({n : "update_node_config_email"})
622
623         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
624                            "bminit-cfg-update-exception-nodehostname-update-debug-done", 
625                         ]:
626                 sequences.update({n : "nodenetwork_email"})
627
628         # update_bootcd_email
629         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
630                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
631                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
632                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
633                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
634                         ]:
635                 sequences.update({n : "update_bootcd_email"})
636
637         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
638                         ]:
639                 sequences.update({n: "suspect_error_email"})
640
641         # update_hardware_email
642         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
643         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
644
645         # broken_hardware_email
646         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
647
648         # bad_dns_email
649         for n in [ 
650          "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
651                 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
652                 ]:
653                 sequences.update( { n : "bad_dns_email"})
654
655         flag_set = True
656
657         
658         if s not in sequences:
659                 print "   HOST %s" % hostname
660                 print "   UNKNOWN SEQUENCE: %s" % s
661
662                 args = {}
663                 args['hostname'] = hostname
664                 args['sequence'] = s
665                 args['bmlog'] = conn.get_bootmanager_log().read()
666                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
667                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
668                 m.reset()
669                 m.send(['monitor-list@lists.planet-lab.org'])
670
671                 conn.restart_bootmanager('boot')
672
673                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
674                 # This way, we can check it again after we've fixed it.
675                 flag_set = False
676
677         else:
678
679                 if   sequences[s] == "restart_bootmanager_boot":
680                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
681                         conn.restart_bootmanager('boot')
682                 elif sequences[s] == "restart_bootmanager_rins":
683                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
684                         conn.restart_bootmanager('rins')
685                 elif sequences[s] == "restart_node_rins":
686                         conn.restart_node('rins')
687                 elif sequences[s] == "restart_node_boot":
688                         conn.restart_node('boot')
689                 elif sequences[s] == "repair_node_keys":
690                         if conn.compare_and_repair_nodekeys():
691                                 # the keys either are in sync or were forced in sync.
692                                 # so try to reboot the node again.
693                                 conn.restart_bootmanager('rins')
694                                 pass
695                         else:
696                                 # there was some failure to synchronize the keys.
697                                 print "...Unable to repair node keys on %s" % node
698
699                 elif sequences[s] == "suspect_error_email":
700                         args = {}
701                         args['hostname'] = hostname
702                         args['sequence'] = s
703                         args['bmlog'] = conn.get_bootmanager_log().read()
704                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
705                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
706                         m.reset()
707                         m.send(['monitor-list@lists.planet-lab.org'])
708
709                         conn.restart_bootmanager('boot')
710
711                 elif sequences[s] == "update_node_config_email":
712                         print "...Sending message to UPDATE NODE CONFIG"
713                         args = {}
714                         args['hostname'] = hostname
715                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
716                                                                 True, db='nodeid_persistmessages')
717                         loginbase = plc.siteId(hostname)
718                         m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
719                         conn.dump_plconf_file()
720                         conn.set_nodestate('disable')
721
722                 elif sequences[s] == "nodenetwork_email":
723                         print "...Sending message to LOOK AT NODE NETWORK"
724                         args = {}
725                         args['hostname'] = hostname
726                         args['bmlog'] = conn.get_bootmanager_log().read()
727                         m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
728                                                                 True, db='nodenet_persistmessages')
729                         loginbase = plc.siteId(hostname)
730                         m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
731                         conn.dump_plconf_file()
732                         conn.set_nodestate('disable')
733
734                 elif sequences[s] == "update_bootcd_email":
735                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
736                         import getconf
737                         args = {}
738                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
739                         args['hostname_list'] = "%s" % hostname
740
741                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
742                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
743
744                         loginbase = plc.siteId(hostname)
745                         m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
746
747                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
748                         conn.set_nodestate('disable')
749
750                 elif sequences[s] == "broken_hardware_email":
751                         # MAKE An ACTION record that this host has failed hardware.  May
752                         # require either an exception "/minhw" or other manual intervention.
753                         # Definitely need to send out some more EMAIL.
754                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
755                         # TODO: email notice of broken hardware
756                         args = {}
757                         args['hostname'] = hostname
758                         args['log'] = conn.get_dmesg().read()
759                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
760                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
761
762                         loginbase = plc.siteId(hostname)
763                         m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
764                         conn.set_nodestate('disable')
765
766                 elif sequences[s] == "update_hardware_email":
767                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
768                         args = {}
769                         args['hostname'] = hostname
770                         args['bmlog'] = conn.get_bootmanager_log().read()
771                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
772                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
773
774                         loginbase = plc.siteId(hostname)
775                         m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
776                         conn.set_nodestate('disable')
777
778                 elif sequences[s] == "bad_dns_email":
779                         print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
780                         args = {}
781                         try:
782                                 node = api.GetNodes(hostname)[0]
783                                 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
784                         except:
785                                 print traceback.print_exc()
786                                 # TODO: api error. skip email, b/c all info is not available,
787                                 # flag_set will not be recorded.
788                                 return False
789                         nodenet_str = network_config_to_str(net)
790
791                         args['hostname'] = hostname
792                         args['network_config'] = nodenet_str
793                         args['nodenetwork_id'] = net['nodenetwork_id']
794                         m = PersistMessage(hostname, mailtxt.baddns[0] % args,
795                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
796
797                         loginbase = plc.siteId(hostname)
798                         m.send([const.PIEMAIL % loginbase, const.TECHEMAIL % loginbase])
799                         conn.set_nodestate('disable')
800
801         if flag_set:
802                 pflags.setRecentFlag(s)
803                 pflags.save() 
804
805         return True
806         
807
808 # MAIN -------------------------------------------------------------------
809
def main():
        """Command-line entry point.

        Builds the option parser, parses the command line, selects the
        target nodes and calls reboot() once per node with the parsed
        options.

        Targets come from the file named by the 'nodelist' option when it
        is set, otherwise from the single 'node' option.  When neither is
        given, the usage text is printed and the process exits with
        status 1.
        """
        import parser as parsermodule

        # (long flag, add_option keyword arguments), kept in the order the
        # options are registered so the generated help lists them the same way.
        option_table = [
                ("--child", dict(dest="child", action="store_true",
                        help="This is the child mode of this process.")),
                ("--force", dict(dest="force", metavar="boot_state",
                        help="Force a boot state passed to BootManager.py.")),
                ("--quiet", dict(dest="quiet", action="store_true",
                        help="Extra quiet output messages.")),
                ("--verbose", dict(dest="verbose", action="store_true",
                        help="Extra debug output messages.")),
                ("--nonet", dict(dest="nonet", action="store_true",
                        help="Do not setup the network, use existing log files to re-run a test pass.")),
                ("--collect", dict(dest="collect", action="store_true",
                        help="No action, just collect dmesg, and bm.log")),
                ("--nosetup", dict(dest="nosetup", action="store_true",
                        help="Do not perform the orginary setup phase.")),
        ]

        opt_parser = parsermodule.getParser()
        opt_parser.set_defaults(child=False, collect=False, nosetup=False,
                                verbose=False, force=None, quiet=False)
        for long_flag, keywords in option_table:
                opt_parser.add_option("", long_flag, **keywords)

        # Second getParser() call receives the parser built so far together
        # with the 'nodesets' and 'defaults' names (presumably shared option
        # groups that supply 'node' / 'nodelist' -- confirm in parser.py).
        opt_parser = parsermodule.getParser(['nodesets', 'defaults'], opt_parser)
        options = parsermodule.parse_args(opt_parser)

        # Nothing to operate on: show usage and bail out.
        if not (options.nodelist or options.node):
                opt_parser.print_help()
                sys.exit(1)

        # A node-list file takes precedence over a single node.
        if options.nodelist:
                targets = options.getListFromFile(options.nodelist)
        else:
                targets = [options.node]

        for target in targets:
                reboot(target, options)
844
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
        main()