1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import plc
6 api = plc.getAuthAPI()
7
8 import sys
9 import os
10 import const
11
12 from getsshkeys import SSHKnownHosts
13
14 import subprocess
15 import time
16 import database
17 import moncommands
18 from sets import Set
19
20 import ssh.pxssh as pxssh
21 import ssh.fdpexpect as fdpexpect
22 import ssh.pexpect as pexpect
23 from unified_model import *
24 from emailTxt import mailtxt
25 from nodeconfig import network_config_to_str
26 import traceback
27 import config
28
29 class ExceptionDoubleSSHError(Exception): pass
30
31 import signal
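# NOTE: subprocess.Popen in the Python versions this script appears to target
# (pre-2.6) has no kill() method, so Sopen adds one; PlanetLabSession.__del__
# relies on it to tear down the ssh tunnel process.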
32 class Sopen(subprocess.Popen):
33         def kill(self, sig=signal.SIGTERM):
34                 os.kill(self.pid, sig)
35
37 from Rpyc import SocketConnection, Async
38 from Rpyc.Utils import *
39 fb = None
40
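# A minimal sketch of the expected "findbad" layout, inferred from the lookup
# below (the hostname is hypothetical):
#
#   fb = {'nodes': {'planetlab-1.example.org': {'values': {'category': 'OLDBOOTCD', ...}}}}
#   get_fbnode('planetlab-1.example.org')['category']   # -> 'OLDBOOTCD'
#
# 'category' is the key consulted by reboot() further down.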
41 def get_fbnode(node):
42         global fb
43         if fb is None:
44                 fb = database.dbLoad("findbad")
45         fbnode = fb['nodes'][node]['values']
46         return fbnode
47
48 class NodeConnection:
49         def __init__(self, connection, node, config):
50                 self.node = node
51                 self.c = connection
52                 self.config = config
53
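        # Boot-state heuristic used below: /tmp/source is where the BootManager
        # unpacks itself, so its presence implies the debug environment, while
        # /vservers only exists on a fully installed, booted node.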
54         def get_boot_state(self):
55                 if self.c.modules.os.path.exists('/tmp/source'):
56                         return "dbg"
57                 elif self.c.modules.os.path.exists('/vservers'): 
58                         return "boot"
59                 else:
60                         return "unknown"
61
62         def get_dmesg(self):
63                 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
64                 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
65                 download(self.c, "/var/log/dmesg.bm.log", "log/history/%s-dmesg.%s.log" % (t_stamp, self.node))
66                 os.system("cp log/history/%s-dmesg.%s.log log/dmesg.%s.log" % (t_stamp, self.node, self.node))
67                 log = open("log/dmesg.%s.log" % self.node, 'r')
68                 return log
69
70         def get_bootmanager_log(self):
71                 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
72                 download(self.c, "/tmp/bm.log", "log/history/%s-bm.%s.log" % (t_stamp, self.node))
73                 #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
74                 os.system("cp log/history/%s-bm.%s.log log/bm.%s.log" % (t_stamp, self.node, self.node))
75                 log = open("log/bm.%s.log" % self.node, 'r')
76                 return log
77
78         def dump_plconf_file(self):
79                 c = self.c
80                 self.c.modules.sys.path.append("/tmp/source/")
81                 self.c.modules.os.chdir('/tmp/source')
82
83                 log = c.modules.BootManager.log('/tmp/new.log')
84                 bm = c.modules.BootManager.BootManager(log,'boot')
85
86                 BootManagerException = c.modules.Exceptions.BootManagerException
87                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
88                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
89                 bm_continue = True
90
91                 InitializeBootManager.Run(bm.VARS, bm.LOG)
92                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
93                 except Exception, x:
94                         bm_continue = False
95                         print "   ERROR:", x
96                         print "   Possibly unable to find a valid configuration file"
97
98                 if bm_continue and self.config and not self.config.quiet:
99                         for key in bm.VARS.keys():
100                                 print key, " == ", bm.VARS[key]
101                 else:
102                         if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
103                 
104
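        # NOTE: when the node's configured NODE_KEY and the key recorded at PLC
        # disagree, the repair direction is node -> PLC: UpdateNode() pushes the
        # node's key into PLC.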
105         def compare_and_repair_nodekeys(self):
106                 c = self.c
107                 self.c.modules.sys.path.append("/tmp/source/")
108                 self.c.modules.os.chdir('/tmp/source')
109
110                 log = c.modules.BootManager.log('/tmp/new.log')
111                 bm = c.modules.BootManager.BootManager(log,'boot')
112
113                 BootManagerException = c.modules.Exceptions.BootManagerException
114                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
115                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
116                 bm_continue = True
117
118                 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
119
120                 InitializeBootManager.Run(bm.VARS, bm.LOG)
121                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
122                 except Exception, x:
123                         bm_continue = False
124                         print "   ERROR:"
125                         print x
126                         print "   Possibly unable to find a valid configuration file"
127
128                 if bm_continue:
129                         print "   NODE KEY: %s" % bm.VARS['NODE_KEY']
130                         print "   PLC  KEY: %s" % plcnode['key']
131
132                         if bm.VARS['NODE_KEY'] == plcnode['key']:
133                                 return True
134                         else:
135                                 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
136                                         print "   Successfully updated NODE_KEY with PLC"
137                                         return True
138                                 else:
139                                         return False
140                                 
141                         #for key in bm.VARS.keys():
142                         #       print key, " == ", bm.VARS[key]
143                 else:
144                         print "   Unable to retrieve NODE_KEY"
145
146         def bootmanager_running(self):
147                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
148                         return True
149                 else:
150                         return False
151
152         def set_nodestate(self, state='boot'):
153                 return api.UpdateNode(self.node, {'boot_state' : state})
154
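        # Restart strategy: the first attempt (no recent 'gentlekill' flag) kills
        # all slice processes and schedules a normal "shutdown -r"; if that was
        # already tried within the persist-flag window (apparently 24 hours),
        # escalate to a sysrq sync/umount/reboot ('s', 'u', 'b').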
155         def restart_node(self, state='boot'):
156                 api.UpdateNode(self.node, {'boot_state' : state})
157
158                 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
159                 if not pflags.getRecentFlag('gentlekill'):
160                         print "   Killing all slice processes... : %s" %  self.node
161                         cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
162                         self.c.modules.os.system(cmd_slicekill)
163                         cmd = """ shutdown -r +1 & """
164                         print "   Restarting %s : %s" % ( self.node, cmd)
165                         self.c.modules.os.system(cmd)
166
167                         pflags.setRecentFlag('gentlekill')
168                         pflags.save()
169                 else:
170                         print "   Restarting with sysrq 'sub' %s" % self.node
171                         cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
172                         self.c.modules.os.system(cmd)
173
174                 return
175
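        # /tmp/BM_RUNNING is the sentinel checked by bootmanager_running(); it is
        # created before BootManager.py starts and removed when it exits, so
        # overlapping restarts are avoided.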
176         def restart_bootmanager(self, forceState):
177
178                 self.c.modules.os.chdir('/tmp/source')
179                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
180                         print "   BootManager is already running: try again soon..."
181                 else:
182                         print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
183                         cmd = "( touch /tmp/BM_RUNNING ;  " + \
184                               "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
185                                   "  rm -f /tmp/BM_RUNNING " + \
186                                   ") &" 
187                         cmd = cmd % forceState
188                         self.c.modules.os.system(cmd)
189
190                 return 
191
192
193 import random
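# A PlanetLabSession copies the Rpyc library to the node, starts a forking Rpyc
# server there (listening on port 18812), and forwards a local port to it over
# ssh.  NodeConnection then drives the node's os/BootManager modules through
# that tunnel via SocketConnection("localhost", port).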
194 class PlanetLabSession:
195         globalport = 22000 + int(random.random()*1000)
196
197         def __init__(self, node, nosetup, verbose):
198                 self.verbose = verbose
199                 self.node = node
200                 self.port = None
201                 self.nosetup = nosetup
202                 self.command = None
203                 self.setup_host()
204
205         def get_connection(self, config):
206                 return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
207         
208         def setup_host(self):
209                 self.port = PlanetLabSession.globalport
210                 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
211
212                 args = {}
213                 args['port'] = self.port
214                 args['user'] = 'root'
215                 args['hostname'] = self.node
216                 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
217                 ssh_port = 22
218
219                 if self.nosetup:
220                         print "Skipping setup"
221                         return 
222
223                 # COPY Rpyc files to host
224                 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
225                 if self.verbose: print cmd
226                 # TODO: Add timeout
227                 timeout = 120
228                 localos = moncommands.CMD()
229
230                 ret = localos.system(cmd, timeout)
231                 print ret
232                 if ret != 0:
233                         print "\tUNKNOWN SSH KEY FOR %s; updating known_hosts and retrying" % self.node
234                         #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
235                         k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
236                         ret = localos.system(cmd, timeout)
237                         print ret
238                         if ret != 0:
239                                 print "\tFAILED TWICE"
240                                 #sys.exit(1)
241                                 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
242
243                 t1 = time.time()
244                 # KILL any already running servers.
245                 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
246                 (ov,ev) = ssh.run_noexcept2("""<<\EOF
247             rm -f out.log
248             echo "kill server" >> out.log
249             ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
250             echo "export" >> out.log
251             export PYTHONPATH=$HOME  ;
252             echo "start server" >> out.log
253             python Rpyc/Servers/forking_server.py &> server.log &
254             echo "done" >> out.log
255 EOF""")
256                 #cmd = """ssh %(user)s@%(hostname)s """ + \
257                 #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
258                 #cmd = cmd % args
259                 #if self.verbose: print cmd
260                 ## TODO: Add timeout
261                 #print localos.system(cmd,timeout)
262
263                 ## START a new rpyc server.
264                 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
265                 #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
266                 #cmd = cmd % args
267                 #if self.verbose: print cmd
268                 #print localos.system(cmd,timeout)
269                 print ssh.ret
270
271                 # TODO: Add timeout
272                 # This was tricky to make synchronous.  The combination of
273                 # ssh-clients-4.7p1 and the following options seems to work well.
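                # Roughly: ExitOnForwardFailure makes ssh exit instead of running
                # without the -L forward; BatchMode disables password prompts;
                # PermitLocalCommand/LocalCommand print "READY" on the local side
                # once the connection is up; -n/-N keep the session open without
                # reading stdin or running a remote command.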
274                 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
275                           """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
276                           """-o ConnectTimeout=120 """ + \
277                           """-n -N -L %(port)s:localhost:18812 """ + \
278                           """%(user)s@%(hostname)s"""
279                 cmd = cmd % args
280                 if self.verbose: print cmd
281                 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
282                 # NOTE: a plain read() here could block indefinitely, so read_t()
283                 # below bounds the read with a short timeout.
284                 #ret = self.command.stdout.read(5)
285                 ret = moncommands.read_t(self.command.stdout, 5)
286
287                 t2 = time.time()
288                 if 'READY' in ret:
289                         # NOTE: There is still a slight race for machines that are slow...
290                         self.timeout = 2*(t2-t1)
291                         print "Sleeping for %s sec" % self.timeout
292                         time.sleep(self.timeout)
293                         return
294
295                 if self.command.returncode is not None:
296                         print "Failed to establish tunnel!"
297                         raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
298
299                 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
300
301         def __del__(self):
302                 if self.command:
303                         if self.verbose: print "Killing SSH session %s" % self.port
304                         self.command.kill()
305
306
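# Helpers for mapping pexpect matches back to step ids.  A hedged sketch of the
# intended use (the steps entries here are illustrative):
#
#   steps = [('scsierror', 'SCSI error ...'), ('ioerror', 'end_request: ...')]
#   index = child.expect(steps_to_list(steps) + [pexpect.EOF])
#   index_to_id(steps, index)   # -> 'scsierror', 'ioerror', or "done" at EOF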
307 def steps_to_list(steps):
308         ret_list = []
309         for (id,label) in steps:
310                 ret_list.append(label)
311         return ret_list
312
313 def index_to_id(steps,index):
314         if index < len(steps):
315                 return steps[index][0]
316         else:
317                 return "done"
318
319 def reboot(hostname, config=None, forced_action=None):
320
321         # NOTE: Nothing works if the bootcd is REALLY old.
322         #       So, this is the first step.
323         fbnode = get_fbnode(hostname)
324         if fbnode['category'] == "OLDBOOTCD":
325                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
326                 args = {}
327                 args['hostname_list'] = "    %s" % hostname
328
329                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
330                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
331
332                 loginbase = plc.siteId(hostname)
333                 emails = plc.getTechEmails(loginbase)
334                 m.send(emails) 
335
336                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
337                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
338                 return True
339
340         node = hostname
341         print "Creating session for %s" % node
342         # update known_hosts file (in case the node has rebooted since last run)
343         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
344         try:
345                 k = SSHKnownHosts(); k.update(node); k.write(); del k
346         except:
347                 from nodecommon import email_exception
348                 email_exception()
349                 traceback.print_exc()
350                 return False
351
352         try:
353                 if config is None:
354                         session = PlanetLabSession(node, False, True)
355                 else:
356                         session = PlanetLabSession(node, config.nosetup, config.verbose)
357         except ExceptionDoubleSSHError, e:
358                 msg = "ERROR setting up session for %s" % hostname
359                 print msg
360                 return False
361         except Exception, e:
362                 msg = "ERROR setting up session for %s" % hostname
363                 print msg
364                 traceback.print_exc()
365                 from nodecommon import email_exception
366                 email_exception(msg)
367                 print e
368                 return False
369
370         try:
371                 conn = session.get_connection(config)
372         except EOFError:
373                 # NOTE: sometimes the wait in setup_host() is not long enough.  
374                 # So, here we try to wait a little longer before giving up entirely.
375                 try:
376                         time.sleep(session.timeout*4)
377                         conn = session.get_connection(config)
378                 except EOFError:
379                         # failed twice... no need to report this really, it's just in a
380                         # weird state...
381                         return False
382                 except:
383                         traceback.print_exc()
384                         from nodecommon import email_exception
385                         email_exception(node)
386                         return False
387
388         if forced_action == "reboot":
389                 conn.restart_node('reinstall')
390                 return True
391
392         boot_state = conn.get_boot_state()
393         if boot_state == "boot":
394                 print "...Boot state of %s already completed : skipping..." % node
395                 return True
396         elif boot_state == "unknown":
397                 print "...Unknown boot state for %s : skipping..." % node
398                 return False
399         else:
400                 pass
401
402         if conn.bootmanager_running():
403                 print "...BootManager is currently running.  Skipping host %s" % node
404                 return True
405
406         #if config != None:
407         #       if config.force:
408         #               conn.restart_bootmanager(config.force)
409         #               return True
410
411         # Read persistent flags, tagged on three-day intervals.
412         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
413                 
414
415         if config and not config.quiet: print "...downloading dmesg from %s" % node
416         dmesg = conn.get_dmesg()
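        # fdpexpect.fdspawn wraps an already-open file object so the regular
        # pexpect expect() interface can be used to scan the saved dmesg log
        # against the error patterns below.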
417         child = fdpexpect.fdspawn(dmesg)
418
419         sequence = []
420         while True:
421                 steps = [
422                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
423                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
424                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
425
426                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
427
428                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
429                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
430
431                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
432                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
433
434                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
435                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
436
437                         ('floppytimeout','floppy0: floppy timeout called'),
438                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
439
440                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
441                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
442
443                         # floppy0: floppy timeout called
444                         # end_request: I/O error, dev fd0, sector 0
445
446                         # Buffer I/O error on device dm-2, logical block 8888896
447                         # ata1: status=0x51 { DriveReady SeekComplete Error }
448                         # ata1: error=0x40 { UncorrectableError }
449                         # SCSI error : <0 0 0 0> return code = 0x8000002
450                         # sda: Current: sense key: Medium Error
451                         #       Additional sense: Unrecovered read error - auto reallocate failed
452
453                         # SCSI error : <0 2 0 0> return code = 0x40001
454                         # end_request: I/O error, dev sda, sector 572489600
455                 ]
456                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
457                 sequence.append(id)
458
459                 if id == "done":
460                         break
461
462         s = Set(sequence)
463         if config and not config.quiet: print "\tSET: ", s
464
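        # The sequence always ends with "done", so more than one distinct id
        # means at least one error pattern matched.  A lone 'floppyerror' (plus
        # "done") is tolerated; anything else disables the node and mails the
        # site techs.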
465         if len(s) > 1:
466                 print "...Potential drive errors on %s" % node
467                 if len(s) == 2 and 'floppyerror' in s:
468                         print "...Should investigate.  Continuing with node."
469                 else:
470                         print "...Should investigate.  Skipping node."
471                         # TODO: send message related to these errors.
472                         args = {}
473                         args['hostname'] = hostname
474                         args['log'] = conn.get_dmesg().read()
475
476                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
477                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
478
479                         loginbase = plc.siteId(hostname)
480                         emails = plc.getTechEmails(loginbase)
481                         m.send(emails) 
482                         conn.set_nodestate('disable')
483                         return False
484
485         print "...Downloading bm.log from %s" % node
486         log = conn.get_bootmanager_log()
487         child = fdpexpect.fdspawn(log)
488
489         try:
490                 if config.collect: return True
491         except:
492                 pass
493
494         time.sleep(1)
495
496         if config and not config.quiet: print "...Scanning bm.log for errors"
497         action_id = "dbg"
498         sequence = []
499         while True:
500
501                 steps = [
502                         ('bminit'               , 'Initializing the BootManager.'),
503                         ('cfg'                  , 'Reading node configuration file.'),
504                         ('auth'                 , 'Authenticating node with PLC.'),
505                         ('getplc'               , 'Retrieving details of node from PLC.'),
506                         ('update'               , 'Updating node boot state at PLC.'),
507                         ('hardware'             , 'Checking if hardware requirements met.'),
508                         ('installinit'  , 'Install: Initializing.'),
509                         ('installdisk'  , 'Install: partitioning disks.'),
510                         ('installbootfs', 'Install: bootstrapfs tarball.'),
511                         ('installcfg'   , 'Install: Writing configuration files.'),
512                         ('installstop'  , 'Install: Shutting down installer.'),
513                         ('update2'              , 'Updating node boot state at PLC.'),
514                         ('installinit2' , 'Install: Initializing.'),
515                         ('validate'             , 'Validating node installation.'),
516                         ('rebuildinitrd', 'Rebuilding initrd'),
517                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
518                         ('update3'              , 'Updating node configuration.'),
519                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
520                         ('update4'              , 'Sending hardware configuration to PLC.'),
521                         ('debug'                , 'Starting debug mode'),
522                         ('bmexceptmount', 'BootManagerException during mount'),
523                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
524                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
525                         ('exception'    , 'Exception'),
526                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
527                         ('protoerror'   , 'XML RPC protocol error'),
528                         ('nodehostname' , 'Configured node hostname does not resolve'),
529                         ('implementerror', 'Implementation Error'),
530                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
531                         ('noinstall'    , 'notinstalled'),
532                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
533                         ('noblockdev'   , "No block devices detected."),
534                         ('dnserror'     , 'Name or service not known'),
535                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
536                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
537                         ('hardwarerequirefail' , 'Hardware requirements not met'),
538                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
539                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
540                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
541                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
542                         ('modulefail'   , 'Unable to get list of system modules'),
543                         ('writeerror'   , 'write error: No space left on device'),
544                         ('nospace'      , "No space left on device"),
545                         ('nonode'       , 'Failed to authenticate call: No such node'),
546                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
547                         ('bootcheckfail'     , 'BootCheckAuthentication'),
548                         ('bootupdatefail'   , 'BootUpdateNode'),
549                 ]
550                 labels = steps_to_list(steps)
551                 index = child.expect( labels + [ pexpect.EOF ])
552                 id = index_to_id(steps,index)
553                 sequence.append(id)
554
555                 if id == "exception":
556                         if config and not config.quiet: print "...Found An Exception!!!"
557                 elif index == len(labels):
558                         #print "Reached EOF"
559                         break
560                 
561         s = "-".join(sequence)
562         print "   FOUND SEQUENCE: ", s
563
564         # NOTE: We get or set the flag based on the current sequence identifier.
565         #  By using the sequence identifier, we guarantee that there will be no
566         #  frequent loops.  I'm guessing there is a better way to track loops,
567         #  though.
568         #if not config.force and pflags.getRecentFlag(s):
569         #       pflags.setRecentFlag(s)
570         #       pflags.save() 
571         #       print "... flag is set or it has already run recently. Skipping %s" % node
572         #       return True
573
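        # Each key below is a complete bm.log step sequence (the ids above joined
        # with '-'); the value names the repair action dispatched further down.
        # Handling a new failure signature is just one more entry, e.g.
        # (the sequence shown is a placeholder):
        #
        #   sequences.update({"bminit-cfg-auth-...-debug-done": "restart_bootmanager_boot"})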
574         sequences = {}
575
576
577         # restart_bootmanager_boot
578         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
579                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
580                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
581
582                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
583
584                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
585                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
586                         "bminit-cfg-auth-getplc-update-debug-done",
587                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
588                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
589                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
590                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
591                         "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
592                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
593                         ]:
594                 sequences.update({n : "restart_bootmanager_boot"})
595
596         #       conn.restart_bootmanager('reinstall')
597         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
598                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
599                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
600                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
601                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
602                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
603                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
604                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
605                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
606                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
607                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
608                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
609                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
610                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
611                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
612                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
613                         # actual solution appears to involve removing the bad files, and
614                         # continually trying to boot the node.
615                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
616                         "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
617                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
618                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-validate-bmexceptvgscan-done",
619                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-validate-done",
620                         ]:
621                 sequences.update({n : "restart_bootmanager_rins"})
622
623         # repair_node_keys
624         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
625
626         #   conn.restart_node('reinstall')
627         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
628                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
629                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
630                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
631                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
632                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
633                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
634                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
635                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
636                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
637                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
638                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
639                         ]:
640                 sequences.update({n : "restart_node_rins"})
641
642         #       restart_node_boot
643         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
644                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
645                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
646                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
647                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
648                          "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
649                          ]:
650                 sequences.update({n: "restart_node_boot"})
651
652         # update_node_config_email
653         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
654                           "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
655                           "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
656                         ]:
657                 sequences.update({n : "update_node_config_email"})
658
659         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
660                            "bminit-cfg-update-exception-nodehostname-update-debug-done", 
661                         ]:
662                 sequences.update({n : "nodenetwork_email"})
663
664         # update_bootcd_email
665         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
666                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
667                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
668                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
669                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
670                         ]:
671                 sequences.update({n : "update_bootcd_email"})
672
673         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
674                         ]:
675                 sequences.update({n: "suspect_error_email"})
676
677         # update_hardware_email
678         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
679         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
680
681         # broken_hardware_email
682         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
683
684         # bad_dns_email
685         for n in [ 
686          "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
687                 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
688                 ]:
689                 sequences.update( { n : "bad_dns_email"})
690
691         flag_set = True
692
693         
694         if s not in sequences:
695                 print "   HOST %s" % hostname
696                 print "   UNKNOWN SEQUENCE: %s" % s
697
698                 args = {}
699                 args['hostname'] = hostname
700                 args['sequence'] = s
701                 args['bmlog'] = conn.get_bootmanager_log().read()
702                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
703                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
704                 m.reset()
705                 m.send([config.cc_email]) 
706
707                 conn.restart_bootmanager('boot')
708
709                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
710                 # This way, we can check it again after we've fixed it.
711                 flag_set = False
712
713         else:
714
715                 if   sequences[s] == "restart_bootmanager_boot":
716                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
717                         conn.restart_bootmanager('boot')
718                 elif sequences[s] == "restart_bootmanager_rins":
719                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
720                         conn.restart_bootmanager('reinstall')
721                 elif sequences[s] == "restart_node_rins":
722                         conn.restart_node('reinstall')
723                 elif sequences[s] == "restart_node_boot":
724                         conn.restart_node('boot')
725                 elif sequences[s] == "repair_node_keys":
726                         if conn.compare_and_repair_nodekeys():
727                                 # the keys either are in sync or were forced in sync.
728                                 # so try to reboot the node again.
729                                 conn.restart_bootmanager('reinstall')
730                                 pass
731                         else:
732                                 # there was some failure to synchronize the keys.
733                                 print "...Unable to repair node keys on %s" % node
734
735                 elif sequences[s] == "suspect_error_email":
736                         args = {}
737                         args['hostname'] = hostname
738                         args['sequence'] = s
739                         args['bmlog'] = conn.get_bootmanager_log().read()
740                         m = PersistMessage(hostname, "Suspicious error from BootManager on %s" % args['hostname'],
741                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
742                         m.reset()
743                         m.send([config.cc_email]) 
744
745                         conn.restart_bootmanager('boot')
746
747                 elif sequences[s] == "update_node_config_email":
748                         print "...Sending message to UPDATE NODE CONFIG"
749                         args = {}
750                         args['hostname'] = hostname
751                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
752                                                                 True, db='nodeid_persistmessages')
753                         loginbase = plc.siteId(hostname)
754                         emails = plc.getTechEmails(loginbase)
755                         m.send(emails) 
756                         conn.dump_plconf_file()
757                         conn.set_nodestate('disable')
758
759                 elif sequences[s] == "nodenetwork_email":
760                         print "...Sending message to LOOK AT NODE NETWORK"
761                         args = {}
762                         args['hostname'] = hostname
763                         args['bmlog'] = conn.get_bootmanager_log().read()
764                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
765                                                                 True, db='nodenet_persistmessages')
766                         loginbase = plc.siteId(hostname)
767                         emails = plc.getTechEmails(loginbase)
768                         m.send(emails) 
769                         conn.dump_plconf_file()
770                         conn.set_nodestate('disable')
771
772                 elif sequences[s] == "update_bootcd_email":
773                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
774                         import getconf
775                         args = {}
776                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
777                         args['hostname_list'] = "%s" % hostname
778
779                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
780                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
781
782                         loginbase = plc.siteId(hostname)
783                         emails = plc.getTechEmails(loginbase)
784                         m.send(emails) 
785
786                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
787                         conn.set_nodestate('disable')
788
789                 elif sequences[s] == "broken_hardware_email":
790                         # MAKE An ACTION record that this host has failed hardware.  May
791                         # require either an exception "/minhw" or other manual intervention.
792                         # Definitely need to send out some more EMAIL.
793                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
794                         # TODO: email notice of broken hardware
795                         args = {}
796                         args['hostname'] = hostname
797                         args['log'] = conn.get_dmesg().read()
798                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
799                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
800
801                         loginbase = plc.siteId(hostname)
802                         emails = plc.getTechEmails(loginbase)
803                         m.send(emails) 
804                         conn.set_nodestate('disable')
805
806                 elif sequences[s] == "update_hardware_email":
807                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
808                         args = {}
809                         args['hostname'] = hostname
810                         args['bmlog'] = conn.get_bootmanager_log().read()
811                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
812                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
813
814                         loginbase = plc.siteId(hostname)
815                         emails = plc.getTechEmails(loginbase)
816                         m.send(emails) 
817                         conn.set_nodestate('disable')
818
819                 elif sequences[s] == "bad_dns_email":
820                         print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
821                         args = {}
822                         try:
823                                 node = api.GetNodes(hostname)[0]
824                                 net = api.GetInterfaces(node['interface_ids'])[0]
825                         except:
826                                 from nodecommon import email_exception
827                                 email_exception()
828                                 traceback.print_exc()
829                                 # TODO: api error. skip email, b/c all info is not available,
830                                 # flag_set will not be recorded.
831                                 return False
832                         nodenet_str = network_config_to_str(net)
833
834                         args['hostname'] = hostname
835                         args['network_config'] = nodenet_str
836                         args['interface_id'] = net['interface_id']
837                         m = PersistMessage(hostname, mailtxt.baddns[0] % args,
838                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
839
840                         loginbase = plc.siteId(hostname)
841                         emails = plc.getTechEmails(loginbase)
842                         m.send(emails) 
843                         conn.set_nodestate('disable')
844
845         if flag_set:
846                 pflags.setRecentFlag(s)
847                 pflags.save() 
848
849         return True
850         
851
852 # MAIN -------------------------------------------------------------------
853
854 def main():
855         import parser as parsermodule
856         parser = parsermodule.getParser()
857
858         parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
859                                                 force=None, quiet=False)
860         parser.add_option("", "--child", dest="child", action="store_true", 
861                                                 help="This is the child mode of this process.")
862         parser.add_option("", "--force", dest="force", metavar="boot_state",
863                                                 help="Force a boot state passed to BootManager.py.")
864         parser.add_option("", "--quiet", dest="quiet", action="store_true", 
865                                                 help="Suppress most output messages.")
866         parser.add_option("", "--verbose", dest="verbose", action="store_true", 
867                                                 help="Extra debug output messages.")
868         parser.add_option("", "--nonet", dest="nonet", action="store_true", 
869                                                 help="Do not set up the network; use existing log files to re-run a test pass.")
870         parser.add_option("", "--collect", dest="collect", action="store_true", 
871                                                 help="No action, just collect dmesg, and bm.log")
872         parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
873                                                 help="Do not perform the ordinary setup phase.")
874
875         parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
876         config = parsermodule.parse_args(parser)
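        # Typical invocations, assuming the 'nodesets' parser group provides the
        # --node/--nodelist options implied by config.node/config.nodelist below
        # (the hostnames are illustrative):
        #
        #   ./bootman.py --node planetlab-1.example.org
        #   ./bootman.py --nodelist nodes.txt --collect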
877
878         if config.nodelist:
879                 nodes = config.getListFromFile(config.nodelist)
880         elif config.node:
881                 nodes = [ config.node ]
882         else:
883                 parser.print_help()
884                 sys.exit(1)
885
886         for node in nodes:
887                 reboot(node, config)
888
889 if __name__ == "__main__":
890         main()