Added two new failure sequences that trigger a 'rins' (reinstall) of a node if it is not installed.
[monitor.git] / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import plc
6 api = plc.getAuthAPI()
7
8 import sys
9 import os
10 import const
11
12 from getsshkeys import SSHKnownHosts
13
14 import subprocess
15 import time
16 import database
17 import moncommands
18 from sets import Set
19
20 import ssh.pxssh as pxssh
21 import ssh.fdpexpect as fdpexpect
22 import ssh.pexpect as pexpect
23 from unified_model import *
24 from emailTxt import mailtxt
25 from nodeconfig import network_config_to_str
26 import traceback
27 import config
28
29 import signal
class Sopen(subprocess.Popen):
        """subprocess.Popen variant whose kill() can deliver any signal.

        The stock Popen interface in this Python vintage offers no way to
        choose the signal, so this sends it straight to the child's pid.
        """
        def kill(self, signal = signal.SIGTERM):
                # Deliver the requested signal (SIGTERM unless the caller
                # overrides it) directly to the child process.
                target_pid = self.pid
                os.kill(target_pid, signal)
33
34 #from Rpyc import SocketConnection, Async
35 from Rpyc import SocketConnection, Async
36 from Rpyc.Utils import *
fb = None

def get_fbnode(node):
        """Return the 'findbad' values recorded for *node*.

        The findbad database is loaded at most once and cached in the
        module-global 'fb' for every subsequent lookup.
        """
        global fb
        if fb is None:
                fb = database.dbLoad("findbad")
        record = fb['nodes'][node]
        return record['values']
45
class NodeConnection:
        """High-level operations on a node reached through an Rpyc connection.

        All remote work goes through self.c.modules.<mod>, which proxies the
        named Python module running on the node itself; several methods also
        drive the node's BootManager steps from /tmp/source.
        """

        def __init__(self, connection, node, config):
                # connection : Rpyc SocketConnection (typically via ssh tunnel)
                # node       : hostname of the target node
                # config     : monitor config object; may be None
                self.node = node
                self.c = connection
                self.config = config

        def get_boot_state(self):
                # Classify the node from filesystem markers: /tmp/source is
                # only present inside the BootManager (debug) environment;
                # /vservers only exists on a fully booted node.
                if self.c.modules.os.path.exists('/tmp/source'):
                        return "dbg"
                elif self.c.modules.os.path.exists('/vservers'):
                        return "boot"
                else:
                        return "unknown"

        def get_dmesg(self):
                # Dump the remote kernel ring buffer to a file, copy it to the
                # local log/ directory, and return an open read handle on the
                # local copy (caller is responsible for closing it).
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
                download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
                log = open("log/dmesg.%s.log" % self.node, 'r')
                return log

        def get_bootmanager_log(self):
                # The remote bm.log is gzip-compressed; fetch it, unpack it
                # locally via zcat, and return an open read handle on the
                # plain-text copy (caller closes it).
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
                os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log

        def dump_plconf_file(self):
                """Run the BootManager configuration-reading steps on the node
                and print the resulting VARS, to inspect its config file."""
                c = self.c
                # BootManager and its step modules live under /tmp/source on
                # the node; make them importable for the remote interpreter.
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "   ERROR:", x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue and self.config and not self.config.quiet:
                        for key in bm.VARS.keys():
                                print key, " == ", bm.VARS[key]
                else:
                        if self.config and not self.config.quiet: print "   Unable to read Node Configuration"


        def compare_and_repair_nodekeys(self):
                """Compare the node's on-disk NODE_KEY with PLC's record and,
                on mismatch, push the node's key up to PLC.

                Returns True when the keys already match or the update
                succeeds, False when the update fails, and falls through
                (returning None) when the node config cannot be read.
                """
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                # PLC's view of this node, for the 'key' comparison below.
                plcnode = api.GetNodes({'hostname': self.node}, None)[0]

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "exception"
                        print x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue:
                        print "   NODE: %s" % bm.VARS['NODE_KEY']
                        print "   PLC : %s" % plcnode['key']

                        if bm.VARS['NODE_KEY'] == plcnode['key']:
                                return True
                        else:
                                # The node's key wins: update PLC to match it.
                                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                                        print "   Successfully updated NODE_KEY with PLC"
                                        return True
                                else:
                                        return False

                        #for key in bm.VARS.keys():
                        #       print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to retrieve NODE_KEY"

        def bootmanager_running(self):
                # /tmp/BM_RUNNING is the lock file created while a
                # BootManager run is in progress (see restart_bootmanager).
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        return True
                else:
                        return False

        def set_nodestate(self, state='boot'):
                # Record the desired boot_state for this node at PLC.
                return api.UpdateNode(self.node, {'boot_state' : state})

        def restart_node(self, state='boot'):
                """Set boot_state to *state* at PLC, then reboot the node.

                The first attempt within a 24h window kills slice processes
                and schedules a clean 'shutdown -r'; if that was already
                tried recently (flag 'gentlekill' set), force a reboot
                through the sysrq s/u/b sequence instead.
                """
                api.UpdateNode(self.node, {'boot_state' : state})

                pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
                if not pflags.getRecentFlag('gentlekill'):
                        print "   Killing all slice processes... : %s" %  self.node
                        cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
                        self.c.modules.os.system(cmd_slicekill)
                        cmd = """ shutdown -r +1 & """
                        print "   Restarting %s : %s" % ( self.node, cmd)
                        self.c.modules.os.system(cmd)

                        # Remember that the gentle path was tried, so the next
                        # attempt within the window escalates to sysrq.
                        pflags.setRecentFlag('gentlekill')
                        pflags.save()
                else:
                        print "   Restarting with sysrq 'sub' %s" % self.node
                        cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
                        self.c.modules.os.system(cmd)

                return

        def restart_bootmanager(self, forceState):
                """Launch BootManager.py on the node forced into *forceState*,
                unless a previous run is still marked in progress."""

                self.c.modules.os.chdir('/tmp/source')
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        print "   BootManager is already running: try again soon..."
                else:
                        print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
                        # /tmp/BM_RUNNING acts as the lock file for the
                        # duration of the backgrounded BootManager run.
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                              "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                                  "  rm -f /tmp/BM_RUNNING " + \
                                  ") &" 
                        cmd = cmd % forceState
                        self.c.modules.os.system(cmd)

                return 
185
186
187 import random
class PlanetLabSession:
        """Manage an Rpyc session to a node over an ssh port-forward tunnel.

        Each instance claims a unique local port; setup_host() copies the
        Rpyc package to the node, (re)starts a forking Rpyc server there,
        and opens the tunnel that get_connection() then uses.
        """
        # Base local port for tunnels; randomized so that concurrent monitor
        # processes are unlikely to collide on the same port range.
        globalport = 22000 + int(random.random()*1000)

        def __init__(self, node, nosetup, verbose):
                # node    : hostname to reach
                # nosetup : skip server/tunnel setup (assume already running)
                # verbose : echo the shell commands being run
                self.verbose = verbose
                self.node = node
                self.port = None
                self.nosetup = nosetup
                self.command = None
                self.setup_host()

        def get_connection(self, config):
                # Connect to the node's Rpyc server through the local end of
                # the ssh tunnel established by setup_host().
                return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        
        def setup_host(self):
                """Copy Rpyc to the node, restart its Rpyc server, and open an
                ssh tunnel from a fresh local port to the server (port 18812).

                Raises Exception when ssh login fails twice or the tunnel
                cannot be established.
                """
                # Claim the next free local port for this session.
                self.port = PlanetLabSession.globalport
                PlanetLabSession.globalport = PlanetLabSession.globalport + 1

                args = {}
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
                args['monitordir'] = config.MONITOR_SCRIPT_ROOT
                ssh_port = 22

                if self.nosetup:
                        print "Skipping setup"
                        return 

                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
                # TODO: Add timeout
                timeout = 120
                localos = moncommands.CMD()

                ret = localos.system(cmd, timeout)
                print ret
                if ret != 0:
                        # rsync failed: most likely a stale/unknown host key.
                        # Refresh the known_hosts entry once and retry before
                        # giving up.
                        print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
                        #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
                        ret = localos.system(cmd, timeout)
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
                                raise Exception("Failed twice trying to login with updated ssh host key")

                # t1/t2 bracket the server restart so the tunnel wait below
                # can be scaled to how slow this host actually is.
                t1 = time.time()
                # KILL any already running servers.
                ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
                (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
                #cmd = """ssh %(user)s@%(hostname)s """ + \
                #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
                #cmd = cmd % args
                #if self.verbose: print cmd
                ## TODO: Add timeout
                #print localos.system(cmd,timeout)

                ## START a new rpyc server.
                #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
                #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
                #cmd = cmd % args
                #if self.verbose: print cmd
                #print localos.system(cmd,timeout)
                print ssh.ret

                # TODO: Add timeout
                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
                # and the following options seems to work well.
                # LocalCommand prints "READY" on our side once the forward is
                # up, which is what the read below waits for.
                cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
                          """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
                          """-o ConnectTimeout=120 """ + \
                          """-n -N -L %(port)s:localhost:18812 """ + \
                          """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # TODO: the read() here may block indefinitely.  Need a better
                # approach therefore, that includes a timeout.
                #ret = self.command.stdout.read(5)
                ret = moncommands.read_t(self.command.stdout, 5)

                t2 = time.time()
                if 'READY' in ret:
                        # NOTE: There is still a slight race for machines that are slow...
                        self.timeout = 2*(t2-t1)
                        print "Sleeping for %s sec" % self.timeout
                        time.sleep(self.timeout)
                        return

                if self.command.returncode is not None:
                        print "Failed to establish tunnel!"
                        raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

                raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        def __del__(self):
                # Tear down the tunnel process when the session object is
                # garbage-collected, so forwarded ports are not leaked.
                if self.command:
                        if self.verbose: print "Killing SSH session %s" % self.port
                        self.command.kill()
300
def steps_to_list(steps):
        """Return the labels (regex patterns) from a list of (id, label)
        step tuples, preserving order.
        """
        # Comprehension replaces the manual append loop; the tuple's first
        # field is unpacked as _step_id to avoid shadowing the builtin 'id'.
        return [label for (_step_id, label) in steps]
306
def index_to_id(steps, index):
        """Map a pexpect match *index* back to the id of the matching step.

        Callers append an extra EOF pattern after the step labels, so any
        index past the end of *steps* maps to "done".
        """
        if index >= len(steps):
                return "done"
        return steps[index][0]
312
313 def reboot(hostname, config=None, forced_action=None):
314
315         # NOTE: Nothing works if the bootcd is REALLY old.
316         #       So, this is the first step.
317         fbnode = get_fbnode(hostname)
318         if fbnode['category'] == "OLDBOOTCD":
319                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
320                 args = {}
321                 args['hostname_list'] = "    %s" % hostname
322
323                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
324                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
325
326                 loginbase = plc.siteId(hostname)
327                 emails = plc.getTechEmails(loginbase)
328                 m.send(emails) 
329
330                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
331                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
332                 return True
333
334         node = hostname
335         print "Creating session for %s" % node
336         # update known_hosts file (in case the node has rebooted since last run)
337         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
338         try:
339                 k = SSHKnownHosts(); k.update(node); k.write(); del k
340         except:
341                 from nodecommon import email_exception
342                 email_exception()
343                 print traceback.print_exc()
344                 return False
345
346         try:
347                 if config == None:
348                         session = PlanetLabSession(node, False, True)
349                 else:
350                         session = PlanetLabSession(node, config.nosetup, config.verbose)
351         except Exception, e:
352                 msg = "ERROR setting up session for %s" % hostname
353                 print msg
354                 print traceback.print_exc()
355                 from nodecommon import email_exception
356                 email_exception(msg)
357                 print e
358                 return False
359
360         try:
361                 conn = session.get_connection(config)
362         except EOFError:
363                 # NOTE: sometimes the wait in setup_host() is not long enough.  
364                 # So, here we try to wait a little longer before giving up entirely.
365                 try:
366                         time.sleep(session.timeout*4)
367                         conn = session.get_connection(config)
368                 except:
369                         print traceback.print_exc()
370                         from nodecommon import email_exception
371                         email_exception()
372                         return False
373
374         if forced_action == "reboot":
375                 conn.restart_node('rins')
376                 return True
377
378         boot_state = conn.get_boot_state()
379         if boot_state == "boot":
380                 print "...Boot state of %s already completed : skipping..." % node
381                 return True
382         elif boot_state == "unknown":
383                 print "...Unknown bootstate for %s : skipping..."% node
384                 return False
385         else:
386                 pass
387
388         if conn.bootmanager_running():
389                 print "...BootManager is currently running.  Skipping host %s" % node
390                 return True
391
392         #if config != None:
393         #       if config.force:
394         #               conn.restart_bootmanager(config.force)
395         #               return True
396
397         # Read persistent flags, tagged on one week intervals.
398         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
399                 
400
401         if config and not config.quiet: print "...downloading dmesg from %s" % node
402         dmesg = conn.get_dmesg()
403         child = fdpexpect.fdspawn(dmesg)
404
405         sequence = []
406         while True:
407                 steps = [
408                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
409                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
410                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
411
412                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
413
414                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
415                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
416
417                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
418                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
419
420                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
421                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
422
423                         ('floppytimeout','floppy0: floppy timeout called'),
424                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
425
426                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
427                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
428
429                         # floppy0: floppy timeout called
430                         # end_request: I/O error, dev fd0, sector 0
431
432                         # Buffer I/O error on device dm-2, logical block 8888896
433                         # ata1: status=0x51 { DriveReady SeekComplete Error }
434                         # ata1: error=0x40 { UncorrectableError }
435                         # SCSI error : <0 0 0 0> return code = 0x8000002
436                         # sda: Current: sense key: Medium Error
437                         #       Additional sense: Unrecovered read error - auto reallocate failed
438
439                         # SCSI error : <0 2 0 0> return code = 0x40001
440                         # end_request: I/O error, dev sda, sector 572489600
441                 ]
442                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
443                 sequence.append(id)
444
445                 if id == "done":
446                         break
447
448         s = Set(sequence)
449         if config and not config.quiet: print "\tSET: ", s
450
451         if len(s) > 1:
452                 print "...Potential drive errors on %s" % node
453                 if len(s) == 2 and 'floppyerror' in s:
454                         print "...Should investigate.  Continuing with node."
455                 else:
456                         print "...Should investigate.  Skipping node."
457                         # TODO: send message related to these errors.
458                         args = {}
459                         args['hostname'] = hostname
460                         args['log'] = conn.get_dmesg().read()
461
462                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
463                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
464
465                         loginbase = plc.siteId(hostname)
466                         emails = plc.getTechEmails(loginbase)
467                         m.send(emails) 
468                         conn.set_nodestate('disable')
469                         return False
470
471         print "...Downloading bm.log from %s" % node
472         log = conn.get_bootmanager_log()
473         child = fdpexpect.fdspawn(log)
474
475         try:
476                 if config.collect: return True
477         except:
478                 pass
479
480         time.sleep(1)
481
482         if config and not config.quiet: print "...Scanning bm.log for errors"
483         action_id = "dbg"
484         sequence = []
485         while True:
486
487                 steps = [
488                         ('bminit'               , 'Initializing the BootManager.'),
489                         ('cfg'                  , 'Reading node configuration file.'),
490                         ('auth'                 , 'Authenticating node with PLC.'),
491                         ('getplc'               , 'Retrieving details of node from PLC.'),
492                         ('update'               , 'Updating node boot state at PLC.'),
493                         ('hardware'             , 'Checking if hardware requirements met.'),
494                         ('installinit'  , 'Install: Initializing.'),
495                         ('installdisk'  , 'Install: partitioning disks.'),
496                         ('installbootfs', 'Install: bootstrapfs tarball.'),
497                         ('installcfg'   , 'Install: Writing configuration files.'),
498                         ('installstop'  , 'Install: Shutting down installer.'),
499                         ('update2'              , 'Updating node boot state at PLC.'),
500                         ('installinit2' , 'Install: Initializing.'),
501                         ('validate'             , 'Validating node installation.'),
502                         ('rebuildinitrd', 'Rebuilding initrd'),
503                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
504                         ('update3'              , 'Updating node configuration.'),
505                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
506                         ('update4'              , 'Sending hardware configuration to PLC.'),
507                         ('debug'                , 'Starting debug mode'),
508                         ('bmexceptmount', 'BootManagerException during mount'),
509                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
510                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
511                         ('exception'    , 'Exception'),
512                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
513                         ('protoerror'   , 'XML RPC protocol error'),
514                         ('nodehostname' , 'Configured node hostname does not resolve'),
515                         ('implementerror', 'Implementation Error'),
516                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
517                         ('noinstall'    , 'notinstalled'),
518                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
519                         ('noblockdev'   , "No block devices detected."),
520                         ('dnserror'     , 'Name or service not known'),
521                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
522                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
523                         ('hardwarerequirefail' , 'Hardware requirements not met'),
524                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
525                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
526                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
527                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
528                         ('modulefail'   , 'Unable to get list of system modules'),
529                         ('writeerror'   , 'write error: No space left on device'),
530                         ('nospace'      , "No space left on device"),
531                         ('nonode'       , 'Failed to authenticate call: No such node'),
532                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
533                         ('bootcheckfail'     , 'BootCheckAuthentication'),
534                         ('bootupdatefail'   , 'BootUpdateNode'),
535                 ]
536                 list = steps_to_list(steps)
537                 index = child.expect( list + [ pexpect.EOF ])
538                 id = index_to_id(steps,index)
539                 sequence.append(id)
540
541                 if id == "exception":
542                         if config and not config.quiet: print "...Found An Exception!!!"
543                 elif index == len(list):
544                         #print "Reached EOF"
545                         break
546                 
547         s = "-".join(sequence)
548         print "   FOUND SEQUENCE: ", s
549
550         # NOTE: We get or set the flag based on the current sequence identifier.
551         #  By using the sequence identifier, we guarantee that there will be no
552         #  frequent loops.  I'm guessing there is a better way to track loops,
553         #  though.
554         #if not config.force and pflags.getRecentFlag(s):
555         #       pflags.setRecentFlag(s)
556         #       pflags.save() 
557         #       print "... flag is set or it has already run recently. Skipping %s" % node
558         #       return True
559
560         sequences = {}
561
562
563         # restart_bootmanager_boot
564         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
565                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
566                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
567
568                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
569
570                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
571                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
572                         "bminit-cfg-auth-getplc-update-debug-done",
573                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
574                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
575                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
576                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
577                         "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
578                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
579                         ]:
580                 sequences.update({n : "restart_bootmanager_boot"})
581
582         #       conn.restart_bootmanager('rins')
583         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
584                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
585                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
586                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
587                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
588                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
589                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
590                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
591                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
592                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
593                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
594                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
595                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
596                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
597                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
598                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
599                         # actual solution appears to involve removing the bad files, and
600                         # continually trying to boot the node.
601                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
602                         "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
603                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
604                         ]:
605                 sequences.update({n : "restart_bootmanager_rins"})
606
607         # repair_node_keys
608         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
609
610         #   conn.restart_node('rins')
611         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
612                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
613                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
614                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
615                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
616                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
617                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
618                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
619                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
620                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
621                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
622                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
623                         ]:
624                 sequences.update({n : "restart_node_rins"})
625
626         #       restart_node_boot
627         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
628                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
629                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
630                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
631                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
632                          "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
633                          ]:
634                 sequences.update({n: "restart_node_boot"})
635
636         # update_node_config_email
637         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
638                           "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
639                           "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
640                         ]:
641                 sequences.update({n : "update_node_config_email"})
642
643         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
644                            "bminit-cfg-update-exception-nodehostname-update-debug-done", 
645                         ]:
646                 sequences.update({n : "nodenetwork_email"})
647
648         # update_bootcd_email
649         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
650                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
651                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
652                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
653                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
654                         ]:
655                 sequences.update({n : "update_bootcd_email"})
656
657         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
658                         ]:
659                 sequences.update({n: "suspect_error_email"})
660
661         # update_hardware_email
662         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
663         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
664
665         # broken_hardware_email
666         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
667
668         # bad_dns_email
669         for n in [ 
670          "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
671                 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
672                 ]:
673                 sequences.update( { n : "bad_dns_email"})
674
675         flag_set = True
676
677         
678         if s not in sequences:
679                 print "   HOST %s" % hostname
680                 print "   UNKNOWN SEQUENCE: %s" % s
681
682                 args = {}
683                 args['hostname'] = hostname
684                 args['sequence'] = s
685                 args['bmlog'] = conn.get_bootmanager_log().read()
686                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
687                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
688                 m.reset()
689                 m.send([config.cc_email]) 
690
691                 conn.restart_bootmanager('boot')
692
693                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
694                 # This way, we can check it again after we've fixed it.
695                 flag_set = False
696
697         else:
698
699                 if   sequences[s] == "restart_bootmanager_boot":
700                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
701                         conn.restart_bootmanager('boot')
702                 elif sequences[s] == "restart_bootmanager_rins":
703                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
704                         conn.restart_bootmanager('rins')
705                 elif sequences[s] == "restart_node_rins":
706                         conn.restart_node('rins')
707                 elif sequences[s] == "restart_node_boot":
708                         conn.restart_node('boot')
709                 elif sequences[s] == "repair_node_keys":
710                         if conn.compare_and_repair_nodekeys():
711                                 # the keys either are in sync or were forced in sync.
712                                 # so try to reboot the node again.
713                                 conn.restart_bootmanager('rins')
714                                 pass
715                         else:
716                                 # there was some failure to synchronize the keys.
717                                 print "...Unable to repair node keys on %s" % node
718
719                 elif sequences[s] == "suspect_error_email":
720                         args = {}
721                         args['hostname'] = hostname
722                         args['sequence'] = s
723                         args['bmlog'] = conn.get_bootmanager_log().read()
724                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
725                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
726                         m.reset()
727                         m.send([config.cc_email]) 
728
729                         conn.restart_bootmanager('boot')
730
731                 elif sequences[s] == "update_node_config_email":
732                         print "...Sending message to UPDATE NODE CONFIG"
733                         args = {}
734                         args['hostname'] = hostname
735                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
736                                                                 True, db='nodeid_persistmessages')
737                         loginbase = plc.siteId(hostname)
738                         emails = plc.getTechEmails(loginbase)
739                         m.send(emails) 
740                         conn.dump_plconf_file()
741                         conn.set_nodestate('disable')
742
743                 elif sequences[s] == "nodenetwork_email":
744                         print "...Sending message to LOOK AT NODE NETWORK"
745                         args = {}
746                         args['hostname'] = hostname
747                         args['bmlog'] = conn.get_bootmanager_log().read()
748                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
749                                                                 True, db='nodenet_persistmessages')
750                         loginbase = plc.siteId(hostname)
751                         emails = plc.getTechEmails(loginbase)
752                         m.send(emails) 
753                         conn.dump_plconf_file()
754                         conn.set_nodestate('disable')
755
756                 elif sequences[s] == "update_bootcd_email":
757                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
758                         import getconf
759                         args = {}
760                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
761                         args['hostname_list'] = "%s" % hostname
762
763                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
764                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
765
766                         loginbase = plc.siteId(hostname)
767                         emails = plc.getTechEmails(loginbase)
768                         m.send(emails) 
769
770                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
771                         conn.set_nodestate('disable')
772
773                 elif sequences[s] == "broken_hardware_email":
774                         # MAKE An ACTION record that this host has failed hardware.  May
775                         # require either an exception "/minhw" or other manual intervention.
776                         # Definitely need to send out some more EMAIL.
777                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
778                         # TODO: email notice of broken hardware
779                         args = {}
780                         args['hostname'] = hostname
781                         args['log'] = conn.get_dmesg().read()
782                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
783                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
784
785                         loginbase = plc.siteId(hostname)
786                         emails = plc.getTechEmails(loginbase)
787                         m.send(emails) 
788                         conn.set_nodestate('disable')
789
790                 elif sequences[s] == "update_hardware_email":
791                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
792                         args = {}
793                         args['hostname'] = hostname
794                         args['bmlog'] = conn.get_bootmanager_log().read()
795                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
796                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
797
798                         loginbase = plc.siteId(hostname)
799                         emails = plc.getTechEmails(loginbase)
800                         m.send(emails) 
801                         conn.set_nodestate('disable')
802
803                 elif sequences[s] == "bad_dns_email":
804                         print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
805                         args = {}
806                         try:
807                                 node = api.GetNodes(hostname)[0]
808                                 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
809                         except:
810                                 from nodecommon import email_exception
811                                 email_exception()
812                                 print traceback.print_exc()
813                                 # TODO: api error. skip email, b/c all info is not available,
814                                 # flag_set will not be recorded.
815                                 return False
816                         nodenet_str = network_config_to_str(net)
817
818                         args['hostname'] = hostname
819                         args['network_config'] = nodenet_str
820                         args['nodenetwork_id'] = net['nodenetwork_id']
821                         m = PersistMessage(hostname, mailtxt.baddns[0] % args,
822                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
823
824                         loginbase = plc.siteId(hostname)
825                         emails = plc.getTechEmails(loginbase)
826                         m.send(emails) 
827                         conn.set_nodestate('disable')
828
829         if flag_set:
830                 pflags.setRecentFlag(s)
831                 pflags.save() 
832
833         return True
834         
835
836 # MAIN -------------------------------------------------------------------
837
def main():
	"""Parse command-line options and attempt to reboot each requested node.

	Builds the option parser, merges in the shared 'nodesets' and
	'defaults' option groups, resolves the target node list (either from
	a --nodelist file or a single --node argument), and calls reboot()
	once per node.  Exits with status 1 when no nodes were specified.
	"""
	import parser as parsermodule
	parser = parsermodule.getParser()

	# Give every flag an explicit default so config.<attr> is always
	# present; 'nonet' was previously omitted from this list while all
	# of its sibling flags were included.
	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
						nonet=False, force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")

	# Layer the shared option groups on top of the script-specific ones.
	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	# Resolve the node list: a file of hostnames, a single hostname, or
	# nothing at all (in which case print usage and bail out).
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		reboot(node, config)
872
# Script entry point: run main() only when executed directly, not when
# this module is imported for its reboot/NodeConnection helpers.
if __name__ == "__main__":
	main()