#!/usr/bin/python

# Attempt to reboot a node in debug state.

import plc
api = plc.getAuthAPI()

import sys
import os
import const

from getsshkeys import SSHKnownHosts

import subprocess
import time
import database
import moncommands
from sets import Set

import ssh.pxssh as pxssh
import ssh.fdpexpect as fdpexpect
import ssh.pexpect as pexpect
from unified_model import *
from emailTxt import mailtxt
from nodeconfig import network_config_to_str
import traceback
import config

class ExceptionDoubleSSHError(Exception): pass

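# NOTE: subprocess.Popen has no kill() on the older Python releases this
# script targets (it was only added in Python 2.6), so Sopen supplies one
# using os.kill on the child's pid.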
import signal
class Sopen(subprocess.Popen):
    def kill(self, signal=signal.SIGTERM):
        os.kill(self.pid, signal)

from Rpyc import SocketConnection, Async
from Rpyc.Utils import *
fb = None

def get_fbnode(node):
    global fb
    if fb is None:
        fb = database.dbLoad("findbad")
    fbnode = fb['nodes'][node]['values']
    return fbnode

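# NodeConnection wraps an Rpyc connection to a node booted from the BootCD.
# It is used to query the node's boot state, pull dmesg and BootManager logs
# back to the monitor host, verify/repair the node key against PLC, and
# restart either the node itself or the BootManager.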
class NodeConnection:
    def __init__(self, connection, node, config):
        self.node = node
        self.c = connection
        self.config = config

    def get_boot_state(self):
        if self.c.modules.os.path.exists('/tmp/source'):
            return "dbg"
        elif self.c.modules.os.path.exists('/vservers'):
            return "boot"
        else:
            return "unknown"

    def get_dmesg(self):
        t_stamp = time.strftime("%Y-%m-%d-%H:%M")
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "log/history/%s-dmesg.%s.log" % (t_stamp, self.node))
        os.system("cp log/history/%s-dmesg.%s.log log/dmesg.%s.log" % (t_stamp, self.node, self.node))
        log = open("log/dmesg.%s.log" % self.node, 'r')
        return log

    def get_bootmanager_log(self):
        t_stamp = time.strftime("%Y-%m-%d-%H:%M")
        download(self.c, "/tmp/bm.log", "log/history/%s-bm.%s.log" % (t_stamp, self.node))
        #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
        os.system("cp log/history/%s-bm.%s.log log/bm.%s.log" % (t_stamp, self.node, self.node))
        log = open("log/bm.%s.log" % self.node, 'r')
        return log

    def dump_plconf_file(self):
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log, 'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "   ERROR:", x
            print "   Possibly unable to find a valid configuration file"

        if bm_continue and self.config and not self.config.quiet:
            for key in bm.VARS.keys():
                print key, " == ", bm.VARS[key]
        else:
            if self.config and not self.config.quiet: print "   Unable to read Node Configuration"

    def compare_and_repair_nodekeys(self):
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log, 'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        plcnode = api.GetNodes({'hostname': self.node}, None)[0]

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "exception"
            print x
            print "   Possibly unable to find a valid configuration file"

        if bm_continue:
            print "   NODE: %s" % bm.VARS['NODE_KEY']
            print "   PLC : %s" % plcnode['key']

            if bm.VARS['NODE_KEY'] == plcnode['key']:
                return True
            else:
                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                    print "   Successfully updated NODE_KEY with PLC"
                    return True
                else:
                    return False

            #for key in bm.VARS.keys():
            #    print key, " == ", bm.VARS[key]
        else:
            print "   Unable to retrieve NODE_KEY"

    def bootmanager_running(self):
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            return True
        else:
            return False

    def set_nodestate(self, state='boot'):
        return api.UpdateNode(self.node, {'boot_state': state})

    def restart_node(self, state='boot'):
        api.UpdateNode(self.node, {'boot_state': state})

        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print "   Killing all slice processes... : %s" % self.node
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print "   Restarting %s : %s" % (self.node, cmd)
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            print "   Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):

        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "   BootManager is already running: try again soon..."
        else:
            print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &"
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return


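# PlanetLabSession copies the Rpyc server code to the node, starts it there,
# and forwards a local port to it over ssh.  Typical use, as in reboot() below:
#
#   session = PlanetLabSession(node, config.nosetup, config.verbose)
#   conn = session.get_connection(config)   # returns a NodeConnection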
import random
class PlanetLabSession:
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None
        self.setup_host()

    def get_connection(self, config):
        return NodeConnection(SocketConnection("localhost", self.port), self.node, config)

    def setup_host(self):
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        if self.nosetup:
            print "Skipping setup"
            return

        # COPY Rpyc files to host
        cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        if self.verbose: print cmd
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print ret
        if ret != 0:
            print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
            #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            ret = localos.system(cmd, timeout)
            print ret
            if ret != 0:
                print "\tFAILED TWICE"
                #sys.exit(1)
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers.
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov, ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        #cmd = """ssh %(user)s@%(hostname)s """ + \
        #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
        #cmd = cmd % args
        #if self.verbose: print cmd
        ## TODO: Add timeout
        #print localos.system(cmd,timeout)

        ## START a new rpyc server.
        #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
        #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
        #cmd = cmd % args
        #if self.verbose: print cmd
        #print localos.system(cmd,timeout)
        print ssh.ret

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1
        # and the following options seems to work well.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach, therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            self.timeout = 2*(t2-t1)
            print "Sleeping for %s sec" % self.timeout
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        if self.command:
            if self.verbose: print "Killing SSH session %s" % self.port
            self.command.kill()


def steps_to_list(steps):
    ret_list = []
    for (id, label) in steps:
        ret_list.append(label)
    return ret_list

def index_to_id(steps, index):
    if index < len(steps):
        return steps[index][0]
    else:
        return "done"

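# Example: for steps = [('scsierror', '...'), ('ioerror', '...')],
# steps_to_list(steps) yields the regex patterns handed to pexpect, and
# index_to_id(steps, 0) == 'scsierror', while an out-of-range index
# (i.e. the pexpect.EOF match) maps to 'done'.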
def reboot(hostname, config=None, forced_action=None):

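    # Overview: verify the BootCD is recent enough, open an Rpyc session to
    # the node, classify its dmesg and bm.log output into a sequence
    # signature, and apply the matching repair action (restart BootManager,
    # reinstall, reboot, or notify the site).  Returns True when a known
    # action was taken (or none was needed) and False when setup or
    # diagnosis failed.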
    # NOTE: Nothing works if the bootcd is REALLY old.
    #       So, this is the first step.
    fbnode = get_fbnode(hostname)
    if fbnode['category'] == "OLDBOOTCD":
        print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
        args = {}
        args['hostname_list'] = "    %s" % hostname

        m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
                           mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')

        loginbase = plc.siteId(hostname)
        emails = plc.getTechEmails(loginbase)
        m.send(emails)

        print "\tDisabling %s due to out-of-date BOOTCD" % hostname
        api.UpdateNode(hostname, {'boot_state': 'disable'})
        return True

    node = hostname
    print "Creating session for %s" % node
    # update known_hosts file (in case the node has rebooted since the last run)
    if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
    try:
        k = SSHKnownHosts(); k.update(node); k.write(); del k
    except:
        from nodecommon import email_exception
        email_exception()
        print traceback.print_exc()
        return False

    try:
        if config is None:
            session = PlanetLabSession(node, False, True)
        else:
            session = PlanetLabSession(node, config.nosetup, config.verbose)
    except ExceptionDoubleSSHError, e:
        msg = "ERROR setting up session for %s" % hostname
        print msg
        return False
    except Exception, e:
        msg = "ERROR setting up session for %s" % hostname
        print msg
        print traceback.print_exc()
        from nodecommon import email_exception
        email_exception(msg)
        print e
        return False

    try:
        conn = session.get_connection(config)
    except EOFError:
        # NOTE: sometimes the wait in setup_host() is not long enough.
        # So, here we try to wait a little longer before giving up entirely.
        try:
            time.sleep(session.timeout*4)
            conn = session.get_connection(config)
        except EOFError:
            # failed twice... no need to report this really, it's just in a
            # weird state...
            return False
        except:
            print traceback.print_exc()
            from nodecommon import email_exception
            email_exception(node)
            return False

    if forced_action == "reboot":
        conn.restart_node('rins')
        return True

    boot_state = conn.get_boot_state()
    if boot_state == "boot":
        print "...Boot state of %s already completed : skipping..." % node
        return True
    elif boot_state == "unknown":
        print "...Unknown boot state for %s : skipping..." % node
        return False
    else:
        pass

    if conn.bootmanager_running():
        print "...BootManager is currently running.  Skipping host %s" % node
        return True

    #if config != None:
    #    if config.force:
    #        conn.restart_bootmanager(config.force)
    #        return True

    # Read persistent flags, tagged on three-day intervals.
    pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')

    if config and not config.quiet: print "...downloading dmesg from %s" % node
    dmesg = conn.get_dmesg()
    child = fdpexpect.fdspawn(dmesg)

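    # Scan dmesg for known disk/hardware error signatures.  Each iteration
    # expects any one of the patterns below (or EOF) and records the matching
    # id; the loop ends when pexpect reaches the end of the log ('done').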
    sequence = []
    while True:
        steps = [
            ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
            ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
            ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),

            ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),

            ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),

            ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
            ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),

            ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
            ('ext3error'  , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),

            ('floppytimeout', 'floppy0: floppy timeout called'),
            ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),

            # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
            # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263

            # floppy0: floppy timeout called
            # end_request: I/O error, dev fd0, sector 0

            # Buffer I/O error on device dm-2, logical block 8888896
            # ata1: status=0x51 { DriveReady SeekComplete Error }
            # ata1: error=0x40 { UncorrectableError }
            # SCSI error : <0 0 0 0> return code = 0x8000002
            # sda: Current: sense key: Medium Error
            #      Additional sense: Unrecovered read error - auto reallocate failed

            # SCSI error : <0 2 0 0> return code = 0x40001
            # end_request: I/O error, dev sda, sector 572489600
        ]
        id = index_to_id(steps, child.expect(steps_to_list(steps) + [pexpect.EOF]))
        sequence.append(id)

        if id == "done":
            break

    s = Set(sequence)
    if config and not config.quiet: print "\tSET: ", s

    if len(s) > 1:
        print "...Potential drive errors on %s" % node
        if len(s) == 2 and 'floppyerror' in s:
            print "...Should investigate.  Continuing with node."
        else:
            print "...Should investigate.  Skipping node."
            # TODO: send message related to these errors.
            args = {}
            args['hostname'] = hostname
            args['log'] = conn.get_dmesg().read()

            m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
                               mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')

            loginbase = plc.siteId(hostname)
            emails = plc.getTechEmails(loginbase)
            m.send(emails)
            conn.set_nodestate('disable')
            return False

    print "...Downloading bm.log from %s" % node
    log = conn.get_bootmanager_log()
    child = fdpexpect.fdspawn(log)

    try:
        if config.collect: return True
    except:
        pass

    time.sleep(1)

    if config and not config.quiet: print "...Scanning bm.log for errors"
    action_id = "dbg"
    sequence = []
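    # The same expect-loop technique is applied to bm.log: each recognized
    # BootManager message is mapped to a short token, and the ordered tokens
    # are later joined with '-' into a sequence signature that is looked up
    # in the 'sequences' table below to choose a repair action.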
    while True:

        steps = [
            ('bminit'       , 'Initializing the BootManager.'),
            ('cfg'          , 'Reading node configuration file.'),
            ('auth'         , 'Authenticating node with PLC.'),
            ('getplc'       , 'Retrieving details of node from PLC.'),
            ('update'       , 'Updating node boot state at PLC.'),
            ('hardware'     , 'Checking if hardware requirements met.'),
            ('installinit'  , 'Install: Initializing.'),
            ('installdisk'  , 'Install: partitioning disks.'),
            ('installbootfs', 'Install: bootstrapfs tarball.'),
            ('installcfg'   , 'Install: Writing configuration files.'),
            ('installstop'  , 'Install: Shutting down installer.'),
            ('update2'      , 'Updating node boot state at PLC.'),
            ('installinit2' , 'Install: Initializing.'),
            ('validate'     , 'Validating node installation.'),
            ('rebuildinitrd', 'Rebuilding initrd'),
            ('netcfg'       , 'Install: Writing Network Configuration files.'),
            ('update3'      , 'Updating node configuration.'),
            ('disk'         , 'Checking for unused disks to add to LVM.'),
            ('update4'      , 'Sending hardware configuration to PLC.'),
            ('debug'        , 'Starting debug mode'),
            ('bmexceptmount', 'BootManagerException during mount'),
            ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
            ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
            ('exception'    , 'Exception'),
            ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
            ('protoerror'   , 'XML RPC protocol error'),
            ('nodehostname' , 'Configured node hostname does not resolve'),
            ('implementerror', 'Implementation Error'),
            ('readonlyfs'   , '[Errno 30] Read-only file system'),
            ('noinstall'    , 'notinstalled'),
            ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
            ('noblockdev'   , "No block devices detected."),
            ('dnserror'     , 'Name or service not known'),
            ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
            ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
            ('hardwarerequirefail' , 'Hardware requirements not met'),
            ('mkfsfail'     , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
            ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
            ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
            ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
            ('modulefail'   , 'Unable to get list of system modules'),
            ('writeerror'   , 'write error: No space left on device'),
            ('nospace'      , "No space left on device"),
            ('nonode'       , 'Failed to authenticate call: No such node'),
            ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
            ('bootcheckfail'    , 'BootCheckAuthentication'),
            ('bootupdatefail'   , 'BootUpdateNode'),
        ]
        step_list = steps_to_list(steps)
        index = child.expect(step_list + [pexpect.EOF])
        id = index_to_id(steps, index)
        sequence.append(id)

        if id == "exception":
            if config and not config.quiet: print "...Found An Exception!!!"
        elif index == len(step_list):
            #print "Reached EOF"
            break

    s = "-".join(sequence)
    print "   FOUND SEQUENCE: ", s

    # NOTE: We get or set the flag based on the current sequence identifier.
    #  By using the sequence identifier, we guarantee that there will be no
    #  frequent loops.  I'm guessing there is a better way to track loops,
    #  though.
    #if not config.force and pflags.getRecentFlag(s):
    #    pflags.setRecentFlag(s)
    #    pflags.save()
    #    print "... flag is set or it has already run recently. Skipping %s" % node
    #    return True

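    # Map known sequence signatures to repair actions; the action names are
    # dispatched in the if/elif chain further below (restart the BootManager
    # in 'boot' or 'rins' mode, restart the node, repair node keys, or send
    # one of the notification emails).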
    sequences = {}

    # restart_bootmanager_boot
    for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",

              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",

              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
              "bminit-cfg-auth-getplc-update-debug-done",
              "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
              "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
              "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
              "bminit-cfg-auth-protoerror-exception-update-debug-done",
              "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
              "bminit-cfg-auth-getplc-implementerror-update-debug-done",
              ]:
        sequences.update({n: "restart_bootmanager_boot"})

    # restart_bootmanager_rins
    for n in ["bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
              "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
              "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
              "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
              "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
              "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
              "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
              "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
              "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
              "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
              "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
              "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
              # actual solution appears to involve removing the bad files, and
              # continually trying to boot the node.
              "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
              "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
              ]:
        sequences.update({n: "restart_bootmanager_rins"})

    # repair_node_keys
    sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})

    # restart_node_rins
    for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
              "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
              "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
              "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
              "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
              "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
              "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
              "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
              ]:
        sequences.update({n: "restart_node_rins"})

    # restart_node_boot
    for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
              "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
              "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
              "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
              "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
              "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
              ]:
        sequences.update({n: "restart_node_boot"})

    # update_node_config_email
    for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
              "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
              "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
              ]:
        sequences.update({n: "update_node_config_email"})

    # nodenetwork_email
    for n in ["bminit-cfg-exception-nodehostname-update-debug-done",
              "bminit-cfg-update-exception-nodehostname-update-debug-done",
              ]:
        sequences.update({n: "nodenetwork_email"})

    # update_bootcd_email
    for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
              "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
              "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
              "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
              "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
              ]:
        sequences.update({n: "update_bootcd_email"})

    # suspect_error_email
    for n in ["bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
              ]:
        sequences.update({n: "suspect_error_email"})

    # update_hardware_email
    sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done": "update_hardware_email"})
    sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done": "update_hardware_email"})

    # broken_hardware_email
    sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done": "broken_hardware_email"})

    # bad_dns_email
    for n in ["bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
              "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
              ]:
        sequences.update({n: "bad_dns_email"})

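    # flag_set records whether this sequence signature should be remembered in
    # pflags at the end of the run; it stays True for known sequences and is
    # cleared for unknown ones so they are re-examined on the next pass.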
    flag_set = True

    if s not in sequences:
        print "   HOST %s" % hostname
        print "   UNKNOWN SEQUENCE: %s" % s

        args = {}
        args['hostname'] = hostname
        args['sequence'] = s
        args['bmlog'] = conn.get_bootmanager_log().read()
        m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
                           mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
        m.reset()
        m.send([config.cc_email])

        conn.restart_bootmanager('boot')

        # NOTE: Do not set the pflags value for this sequence if it's unknown.
        # This way, we can check it again after we've fixed it.
        flag_set = False

    else:

        if sequences[s] == "restart_bootmanager_boot":
            if config and not config.quiet: print "...Restarting BootManager.py on %s " % node
            conn.restart_bootmanager('boot')
        elif sequences[s] == "restart_bootmanager_rins":
            if config and not config.quiet: print "...Restarting BootManager.py on %s " % node
            conn.restart_bootmanager('rins')
        elif sequences[s] == "restart_node_rins":
            conn.restart_node('rins')
        elif sequences[s] == "restart_node_boot":
            conn.restart_node('boot')
        elif sequences[s] == "repair_node_keys":
            if conn.compare_and_repair_nodekeys():
                # the keys either are in sync or were forced in sync,
                # so try to reboot the node again.
                conn.restart_bootmanager('rins')
            else:
                # there was some failure to synchronize the keys.
                print "...Unable to repair node keys on %s" % node

        elif sequences[s] == "suspect_error_email":
            args = {}
            args['hostname'] = hostname
            args['sequence'] = s
            args['bmlog'] = conn.get_bootmanager_log().read()
            m = PersistMessage(hostname, "Suspicious error from BootManager on %(hostname)s" % args,
                               mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
            m.reset()
            m.send([config.cc_email])

            conn.restart_bootmanager('boot')

        elif sequences[s] == "update_node_config_email":
            print "...Sending message to UPDATE NODE CONFIG"
            args = {}
            args['hostname'] = hostname
            m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
                               True, db='nodeid_persistmessages')
            loginbase = plc.siteId(hostname)
            emails = plc.getTechEmails(loginbase)
            m.send(emails)
            conn.dump_plconf_file()
            conn.set_nodestate('disable')

        elif sequences[s] == "nodenetwork_email":
            print "...Sending message to LOOK AT NODE NETWORK"
            args = {}
            args['hostname'] = hostname
            args['bmlog'] = conn.get_bootmanager_log().read()
            m = PersistMessage(hostname, mailtxt.plnode_cfg[0] % args, mailtxt.plnode_cfg[1] % args,
                               True, db='nodenet_persistmessages')
            loginbase = plc.siteId(hostname)
            emails = plc.getTechEmails(loginbase)
            m.send(emails)
            conn.dump_plconf_file()
            conn.set_nodestate('disable')

        elif sequences[s] == "update_bootcd_email":
            print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
            import getconf
            args = {}
            args.update(getconf.getconf(hostname))  # NOTE: Generates boot images for the user.
            args['hostname_list'] = "%s" % hostname

            m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
                               mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')

            loginbase = plc.siteId(hostname)
            emails = plc.getTechEmails(loginbase)
            m.send(emails)

            print "\tDisabling %s due to out-of-date BOOTCD" % hostname
            conn.set_nodestate('disable')

        elif sequences[s] == "broken_hardware_email":
            # MAKE an ACTION record that this host has failed hardware.  May
            # require either an exception "/minhw" or other manual intervention.
            # Definitely need to send out some more EMAIL.
            print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
            # TODO: email notice of broken hardware
            args = {}
            args['hostname'] = hostname
            args['log'] = conn.get_dmesg().read()
            m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
                               mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')

            loginbase = plc.siteId(hostname)
            emails = plc.getTechEmails(loginbase)
            m.send(emails)
            conn.set_nodestate('disable')

        elif sequences[s] == "update_hardware_email":
            print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
            args = {}
            args['hostname'] = hostname
            args['bmlog'] = conn.get_bootmanager_log().read()
            m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
                               mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')

            loginbase = plc.siteId(hostname)
            emails = plc.getTechEmails(loginbase)
            m.send(emails)
            conn.set_nodestate('disable')

        elif sequences[s] == "bad_dns_email":
            print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
            args = {}
            try:
                node = api.GetNodes(hostname)[0]
                net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
            except:
                from nodecommon import email_exception
                email_exception()
                print traceback.print_exc()
                # TODO: api error. skip email, b/c all info is not available,
                # flag_set will not be recorded.
                return False
            nodenet_str = network_config_to_str(net)

            args['hostname'] = hostname
            args['network_config'] = nodenet_str
            args['nodenetwork_id'] = net['nodenetwork_id']
            m = PersistMessage(hostname, mailtxt.baddns[0] % args,
                               mailtxt.baddns[1] % args, True, db='baddns_persistmessages')

            loginbase = plc.siteId(hostname)
            emails = plc.getTechEmails(loginbase)
            m.send(emails)
            conn.set_nodestate('disable')

    if flag_set:
        pflags.setRecentFlag(s)
        pflags.save()

    return True

# MAIN -------------------------------------------------------------------

def main():
    import parser as parsermodule
    parser = parsermodule.getParser()

    parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
                        force=None, quiet=False)
    parser.add_option("", "--child", dest="child", action="store_true",
                      help="This is the child mode of this process.")
    parser.add_option("", "--force", dest="force", metavar="boot_state",
                      help="Force a boot state passed to BootManager.py.")
    parser.add_option("", "--quiet", dest="quiet", action="store_true",
                      help="Quieter output (fewer messages).")
    parser.add_option("", "--verbose", dest="verbose", action="store_true",
                      help="Extra debug output messages.")
    parser.add_option("", "--nonet", dest="nonet", action="store_true",
                      help="Do not set up the network; use existing log files to re-run a test pass.")
    parser.add_option("", "--collect", dest="collect", action="store_true",
                      help="No action, just collect dmesg and bm.log.")
    parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
                      help="Do not perform the ordinary setup phase.")

    parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
    config = parsermodule.parse_args(parser)

    if config.nodelist:
        nodes = config.getListFromFile(config.nodelist)
    elif config.node:
        nodes = [config.node]
    else:
        parser.print_help()
        sys.exit(1)

    for node in nodes:
        reboot(node, config)

if __name__ == "__main__":
    main()