# svn-keywords
# [monitor.git] / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 from monitor import const
6 from monitor.database.info.model import *
7 from monitor.wrapper import plc
# Shared authenticated PLC API handle; used below for GetNodes/UpdateNode calls.
api = plc.getAuthAPI()
9
10 import sys
11 import os
12
13 from getsshkeys import SSHKnownHosts
14
15 import subprocess
16 import time
17 from pcucontrol.util import command as moncommands
18 from sets import Set
19
20 from pcucontrol.transports.ssh import pxssh as pxssh
21 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
22 from pcucontrol.transports.ssh import pexpect as pexpect
23 from monitor.model import *
24 from monitor.wrapper.emailTxt import mailtxt
25 from nodeconfig import network_config_to_str
26 import traceback
27 from monitor import config
28
29 import signal
class Sopen(subprocess.Popen):
	"""subprocess.Popen variant whose kill() lets the caller choose the signal.

	Defaults to the gentler SIGTERM; used for the ssh tunnel process in
	PlanetLabSession.__del__.
	"""
	def kill(self, signal = signal.SIGTERM):
		# NOTE: the parameter deliberately shadows the signal module; the
		# default value is bound at definition time, so this is harmless.
		os.kill(self.pid, signal)
33
34 #from Rpyc import SocketConnection, Async
35 from Rpyc import SocketConnection, Async
36 from Rpyc.Utils import *
# NOTE(review): module-level placeholder, never reassigned in the visible
# code — possibly a legacy "findbad" handle; confirm before removing.
fb = None
38
class NodeConnection:
	"""Operations on a single node, proxied over an Rpyc connection.

	All remote interaction goes through self.c.modules.* which forwards
	module-level calls (os, sys, BootManager, ...) to the node over the
	Rpyc socket established by PlanetLabSession.
	"""

	def __init__(self, connection, node, config):
		# connection: Rpyc SocketConnection to the node (via local ssh tunnel)
		# node:       hostname of the node
		# config:     option object (may be None; checked by callers)
		self.node = node
		self.c = connection
		self.config = config

	def get_boot_state(self):
		"""Classify the node's state from filesystem markers.

		/tmp/source exists only while the BootManager debug environment is
		unpacked ("dbg"); /vservers exists on an installed node ("boot");
		anything else is "unknown".
		"""
		if self.c.modules.os.path.exists('/tmp/source'):
			return "dbg"
		elif self.c.modules.os.path.exists('/vservers'): 
			return "boot"
		else:
			return "unknown"

	def get_dmesg(self):
		"""Dump dmesg on the node, download it, and return an open file object."""
		self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
		download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
		log = open("log/dmesg.%s.log" % self.node, 'r')
		return log

	def get_bootmanager_log(self):
		"""Download the BootManager log and return an open (decompressed) file object."""
		download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
		# NOTE(review): /tmp/bm.log is saved under a .gz name and zcat'ed,
		# so it is presumably gzip-compressed on the node — confirm.
		os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
		log = open("log/bm.%s.log" % self.node, 'r')
		return log

	def dump_plconf_file(self):
		"""Run the remote BootManager's ReadNodeConfiguration step and print bm.VARS."""
		c = self.c
		self.c.modules.sys.path.append("/tmp/source/")
		self.c.modules.os.chdir('/tmp/source')

		log = c.modules.BootManager.log('/tmp/new.log')
		bm = c.modules.BootManager.BootManager(log,'boot')

		BootManagerException = c.modules.Exceptions.BootManagerException
		InitializeBootManager = c.modules.BootManager.InitializeBootManager
		ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
		bm_continue = True

		InitializeBootManager.Run(bm.VARS, bm.LOG)
		try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
		except Exception, x:
			bm_continue = False
			print "   ERROR:", x
			print "   Possibly, unable to find valid configuration file"

		if bm_continue:
			for key in bm.VARS.keys():
				print key, " == ", bm.VARS[key]
		else:
			print "   Unable to read Node Configuration"
		

	def compare_and_repair_nodekeys(self):
		"""Compare the node's on-disk NODE_KEY with PLC's record; repair on mismatch.

		Pushes the node's key to PLC (not the other way around) via
		api.UpdateNode. Returns True when keys match or the update
		succeeds, False when the update fails, and falls through (None)
		when the node configuration cannot be read.
		"""
		c = self.c
		self.c.modules.sys.path.append("/tmp/source/")
		self.c.modules.os.chdir('/tmp/source')

		log = c.modules.BootManager.log('/tmp/new.log')
		bm = c.modules.BootManager.BootManager(log,'boot')

		BootManagerException = c.modules.Exceptions.BootManagerException
		InitializeBootManager = c.modules.BootManager.InitializeBootManager
		ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
		bm_continue = True

		plcnode = api.GetNodes({'hostname': self.node}, None)[0]

		InitializeBootManager.Run(bm.VARS, bm.LOG)
		try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
		except Exception, x:
			bm_continue = False
			print "exception"
			print x
			print "   Possibly, unable to find valid configuration file"

		if bm_continue:
			print "   NODE: %s" % bm.VARS['NODE_KEY']
			print "   PLC : %s" % plcnode['key']

			if bm.VARS['NODE_KEY'] == plcnode['key']:
				return True
			else:
				if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
					print "   Successfully updated NODE_KEY with PLC"
					return True
				else:
					return False
				
			#for key in bm.VARS.keys():
			#	print key, " == ", bm.VARS[key]
		else:
			print "   Unable to retrieve NODE_KEY"

	def bootmanager_running(self):
		"""Return True when the BootManager's /tmp/BM_RUNNING marker exists on the node."""
		if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
			return True
		else:
			return False

	def set_nodestate(self, state='boot'):
		"""Set the node's boot_state at PLC; returns the UpdateNode result."""
		return api.UpdateNode(self.node, {'boot_state' : state})

	def restart_node(self, state='boot'):
		"""Set boot_state at PLC, then reboot the node.

		First attempt within the one-day 'gentlekill' flag window: kill all
		slice processes and schedule a clean `shutdown -r +1`.  If a recent
		gentle kill was already flagged, force a reboot through sysrq
		sync/umount/boot instead.
		"""
		api.UpdateNode(self.node, {'boot_state' : state})

		pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
		if not pflags.getRecentFlag('gentlekill'):
			print "   Killing all slice processes... : %s" %  self.node
			cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
			self.c.modules.os.system(cmd_slicekill)
			cmd = """ shutdown -r +1 & """
			print "   Restarting %s : %s" % ( self.node, cmd)
			self.c.modules.os.system(cmd)

			pflags.setRecentFlag('gentlekill')
			pflags.save()
		else:
			print "   Restarting with sysrq 'sub' %s" % self.node
			cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
			self.c.modules.os.system(cmd)

		return

	def restart_bootmanager(self, forceState):
		"""Launch BootManager.py with the given forced state, unless one is already running.

		The /tmp/BM_RUNNING marker file is created before and removed after
		the run so bootmanager_running() can detect an active instance.
		"""
		self.c.modules.os.chdir('/tmp/source')
		if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
			print "   BootManager is already running: try again soon..."
		else:
			print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
			cmd = "( touch /tmp/BM_RUNNING ;  " + \
			      "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
			      "  rm -f /tmp/BM_RUNNING " + \
			      ") &" 
			cmd = cmd % forceState
			self.c.modules.os.system(cmd)

		return 
178
179
180 import random
class PlanetLabSession:
	"""Establish and hold an ssh tunnel to a node's Rpyc server.

	setup_host() copies the Rpyc tree to the node, (re)starts a forking
	Rpyc server there, and opens a local port forward to it; callers then
	obtain a NodeConnection via get_connection().
	"""
	# Class-wide local-port counter; randomized per process so concurrent
	# monitor runs are unlikely to collide, incremented per session.
	globalport = 22000 + int(random.random()*1000)

	def __init__(self, node, nosetup, verbose):
		# node:    hostname to connect to
		# nosetup: skip the rsync/server-start/tunnel setup entirely
		# verbose: echo the commands being run
		self.verbose = verbose
		self.node = node
		self.port = None
		self.nosetup = nosetup
		self.command = None
		self.setup_host()

	def get_connection(self, config):
		"""Return a NodeConnection speaking Rpyc through the local tunnel port."""
		return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
	
	def setup_host(self):
		"""Copy Rpyc to the node, restart its Rpyc server, and open the ssh tunnel.

		Raises Exception when the rsync fails twice (bad ssh host key) or
		when the tunnel process exits / never reports READY.
		"""
		self.port = PlanetLabSession.globalport
		PlanetLabSession.globalport = PlanetLabSession.globalport + 1

		args = {}
		args['port'] = self.port
		args['user'] = 'root'
		args['hostname'] = self.node
		args['monitordir'] = config.MONITOR_SCRIPT_ROOT
		ssh_port = 22

		if self.nosetup:
			print "Skipping setup"
			return 

		# COPY Rpyc files to host
		cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
		if self.verbose: print cmd
		# TODO: Add timeout
		timeout = 120
		localos = moncommands.CMD()

		ret = localos.system(cmd, timeout)
		print ret
		if ret != 0:
			# A failed rsync is assumed to mean a stale ssh host key:
			# refresh known_hosts for this node and retry once.
			print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
			#print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
			k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
			ret = localos.system(cmd, timeout)
			print ret
			if ret != 0:
				print "\tFAILED TWICE"
				#sys.exit(1)
				raise Exception("Failed twice trying to login with updated ssh host key")

		t1 = time.time()
		# KILL any already running servers, then start a fresh forking
		# Rpyc server on the node (listens on 18812 by default).
		ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
		(ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
		#cmd = """ssh %(user)s@%(hostname)s """ + \
		#        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
		#cmd = cmd % args
		#if self.verbose: print cmd
		## TODO: Add timeout
		#print localos.system(cmd,timeout)

		## START a new rpyc server.
		#cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
		#        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
		#cmd = cmd % args
		#if self.verbose: print cmd
		#print localos.system(cmd,timeout)
		print ssh.ret

		# TODO: Add timeout
		# This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
		# and the following options seems to work well.  LocalCommand prints
		# READY on stdout once the forward is up; ExitOnForwardFailure makes
		# a failed forward terminate the process instead of hanging.
		cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
			  """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
			  """-o ConnectTimeout=120 """ + \
			  """-n -N -L %(port)s:localhost:18812 """ + \
			  """%(user)s@%(hostname)s"""
		cmd = cmd % args
		if self.verbose: print cmd
		self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
		# TODO: the read() here may block indefinitely.  Need a better
		# approach therefore, that includes a timeout.
		#ret = self.command.stdout.read(5)
		ret = moncommands.read_t(self.command.stdout, 5)

		t2 = time.time()
		if 'READY' in ret:
			# NOTE: There is still a slight race for machines that are slow...
			# give the remote server twice the time the tunnel took to come up.
			self.timeout = 2*(t2-t1)
			print "Sleeping for %s sec" % self.timeout
			time.sleep(self.timeout)
			return

		if self.command.returncode is not None:
			print "Failed to establish tunnel!"
			raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

		raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

	def __del__(self):
		# Tear down the tunnel process (Sopen.kill sends SIGTERM by default).
		if self.command:
			if self.verbose: print "Killing SSH session %s" % self.port
			self.command.kill()
292
293
def steps_to_list(steps):
	"""Return the pattern labels from a list of (id, label) pairs.

	Used to build the regex list handed to pexpect's expect(), keeping it
	parallel to the step ids consumed by index_to_id().
	"""
	# Comprehension replaces the manual accumulator loop and stops the
	# loop variable from shadowing the builtin id().
	return [label for (step_id, label) in steps]
299
def index_to_id(steps,index):
	"""Map a pexpect.expect() match index back to its step id.

	An index past the end of *steps* (i.e. the appended EOF pattern)
	yields the sentinel "done".
	"""
	return steps[index][0] if index < len(steps) else "done"
305
306 def reboot(hostname, config=None, forced_action=None):
307
308         # NOTE: Nothing works if the bootcd is REALLY old.
309         #       So, this is the first step.
310         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
311         print fbnode.keys()
312         if fbnode['observed_category'] == "OLDBOOTCD":
313                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
314                 args = {}
315                 args['hostname_list'] = "    %s" % hostname
316
317                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
318                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
319
320                 loginbase = plc.siteId(hostname)
321                 emails = plc.getTechEmails(loginbase)
322                 m.send(emails) 
323
324                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
325                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
326                 return True
327
328         node = hostname
329         print "Creating session for %s" % node
330         # update known_hosts file (in case the node has rebooted since last run)
331         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
332         try:
333                 k = SSHKnownHosts(); k.update(node); k.write(); del k
334         except:
335                 from monitor.common import email_exception
336                 email_exception()
337                 print traceback.print_exc()
338                 return False
339
340         try:
341                 if config == None:
342                         session = PlanetLabSession(node, False, True)
343                 else:
344                         session = PlanetLabSession(node, config.nosetup, config.verbose)
345         except Exception, e:
346                 msg = "ERROR setting up session for %s" % hostname
347                 print msg
348                 print traceback.print_exc()
349                 from monitor.common import email_exception
350                 email_exception(msg)
351                 print e
352                 return False
353
354         try:
355                 conn = session.get_connection(config)
356         except EOFError:
357                 # NOTE: sometimes the wait in setup_host() is not long enough.  
358                 # So, here we try to wait a little longer before giving up entirely.
359                 try:
360                         time.sleep(session.timeout*4)
361                         conn = session.get_connection(config)
362                 except:
363                         print traceback.print_exc()
364                         from monitor.common import email_exception
365                         email_exception()
366                         return False
367
368         if forced_action == "reboot":
369                 conn.restart_node('rins')
370                 return True
371
372         boot_state = conn.get_boot_state()
373         if boot_state == "boot":
374                 print "...Boot state of %s already completed : skipping..." % node
375                 return True
376         elif boot_state == "unknown":
377                 print "...Unknown bootstate for %s : skipping..."% node
378                 return False
379         else:
380                 pass
381
382         if conn.bootmanager_running():
383                 print "...BootManager is currently running.  Skipping host %s" % node
384                 return True
385
386         #if config != None:
387         #       if config.force:
388         #               conn.restart_bootmanager(config.force)
389         #               return True
390
391         # Read persistent flags, tagged on one week intervals.
392         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
393                 
394
395         if config and not config.quiet: print "...downloading dmesg from %s" % node
396         dmesg = conn.get_dmesg()
397         child = fdpexpect.fdspawn(dmesg)
398
399         sequence = []
400         while True:
401                 steps = [
402                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
403                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
404                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
405
406                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
407
408                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
409                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
410
411                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
412                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
413
414                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
415                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
416
417                         ('floppytimeout','floppy0: floppy timeout called'),
418                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
419
420                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
421                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
422
423                         # floppy0: floppy timeout called
424                         # end_request: I/O error, dev fd0, sector 0
425
426                         # Buffer I/O error on device dm-2, logical block 8888896
427                         # ata1: status=0x51 { DriveReady SeekComplete Error }
428                         # ata1: error=0x40 { UncorrectableError }
429                         # SCSI error : <0 0 0 0> return code = 0x8000002
430                         # sda: Current: sense key: Medium Error
431                         #       Additional sense: Unrecovered read error - auto reallocate failed
432
433                         # SCSI error : <0 2 0 0> return code = 0x40001
434                         # end_request: I/O error, dev sda, sector 572489600
435                 ]
436                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
437                 sequence.append(id)
438
439                 if id == "done":
440                         break
441
442         s = Set(sequence)
443         if config and not config.quiet: print "\tSET: ", s
444
445         if len(s) > 1:
446                 print "...Potential drive errors on %s" % node
447                 if len(s) == 2 and 'floppyerror' in s:
448                         print "...Should investigate.  Continuing with node."
449                 else:
450                         print "...Should investigate.  Skipping node."
451                         # TODO: send message related to these errors.
452                         args = {}
453                         args['hostname'] = hostname
454                         args['log'] = conn.get_dmesg().read()
455
456                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
457                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
458
459                         loginbase = plc.siteId(hostname)
460                         emails = plc.getTechEmails(loginbase)
461                         m.send(emails) 
462                         conn.set_nodestate('disable')
463                         return False
464
465         print "...Downloading bm.log from %s" % node
466         log = conn.get_bootmanager_log()
467         child = fdpexpect.fdspawn(log)
468
469         try:
470                 if config.collect: return True
471         except:
472                 pass
473
474         time.sleep(1)
475
476         if config and not config.quiet: print "...Scanning bm.log for errors"
477         action_id = "dbg"
478         sequence = []
479         while True:
480
481                 steps = [
482                         ('bminit'               , 'Initializing the BootManager.'),
483                         ('cfg'                  , 'Reading node configuration file.'),
484                         ('auth'                 , 'Authenticating node with PLC.'),
485                         ('getplc'               , 'Retrieving details of node from PLC.'),
486                         ('update'               , 'Updating node boot state at PLC.'),
487                         ('hardware'             , 'Checking if hardware requirements met.'),
488                         ('installinit'  , 'Install: Initializing.'),
489                         ('installdisk'  , 'Install: partitioning disks.'),
490                         ('installbootfs', 'Install: bootstrapfs tarball.'),
491                         ('installcfg'   , 'Install: Writing configuration files.'),
492                         ('installstop'  , 'Install: Shutting down installer.'),
493                         ('update2'              , 'Updating node boot state at PLC.'),
494                         ('installinit2' , 'Install: Initializing.'),
495                         ('validate'             , 'Validating node installation.'),
496                         ('rebuildinitrd', 'Rebuilding initrd'),
497                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
498                         ('update3'              , 'Updating node configuration.'),
499                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
500                         ('update4'              , 'Sending hardware configuration to PLC.'),
501                         ('debug'                , 'Starting debug mode'),
502                         ('bmexceptmount', 'BootManagerException during mount'),
503                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
504                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
505                         ('exception'    , 'Exception'),
506                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
507                         ('protoerror'   , 'XML RPC protocol error'),
508                         ('nodehostname' , 'Configured node hostname does not resolve'),
509                         ('implementerror', 'Implementation Error'),
510                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
511                         ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
512                         ('noinstall'    , 'notinstalled'),
513                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
514                         ('noblockdev'   , "No block devices detected."),
515                         ('dnserror'     , 'Name or service not known'),
516                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
517                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
518                         ('hardwarerequirefail' , 'Hardware requirements not met'),
519                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
520                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
521                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
522                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
523                         ('modulefail'   , 'Unable to get list of system modules'),
524                         ('writeerror'   , 'write error: No space left on device'),
525                         ('nospace'      , "No space left on device"),
526                         ('nonode'       , 'Failed to authenticate call: No such node'),
527                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
528                         ('bootcheckfail'     , 'BootCheckAuthentication'),
529                         ('bootupdatefail'   , 'BootUpdateNode'),
530                 ]
531                 list = steps_to_list(steps)
532                 index = child.expect( list + [ pexpect.EOF ])
533                 id = index_to_id(steps,index)
534                 sequence.append(id)
535
536                 if id == "exception":
537                         if config and not config.quiet: print "...Found An Exception!!!"
538                 elif index == len(list):
539                         #print "Reached EOF"
540                         break
541                 
542         s = "-".join(sequence)
543         print "   FOUND SEQUENCE: ", s
544
545         # NOTE: We get or set the flag based on the current sequence identifier.
546         #  By using the sequence identifier, we guarantee that there will be no
547         #  frequent loops.  I'm guessing there is a better way to track loops,
548         #  though.
549         #if not config.force and pflags.getRecentFlag(s):
550         #       pflags.setRecentFlag(s)
551         #       pflags.save() 
552         #       print "... flag is set or it has already run recently. Skipping %s" % node
553         #       return True
554
555         sequences = {}
556
557
558         # restart_bootmanager_boot
559         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
560                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
561                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
562
563                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
564
565                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
566                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
567                         "bminit-cfg-auth-getplc-update-debug-done",
568                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
569                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
570                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
571                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
572                         "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
573                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
574                         ]:
575                 sequences.update({n : "restart_bootmanager_boot"})
576
577         #       conn.restart_bootmanager('rins')
578         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
579                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
580                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
581                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
582                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
583                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
584                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
585                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
586                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
587                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
588                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
589                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
590                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
591                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
592                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
593                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
594                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
595                         # actual solution appears to involve removing the bad files, and
596                         # continually trying to boot the node.
597                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
598                         "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
599                         ]:
600                 sequences.update({n : "restart_bootmanager_rins"})
601
602         # repair_node_keys
603         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
604
605         #   conn.restart_node('rins')
606         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
607                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
608                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
609                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
610                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
611                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
612                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
613                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
614                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
615                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
616                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
617                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
618                         ]:
619                 sequences.update({n : "restart_node_rins"})
620
621         #       restart_node_boot
622         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
623                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
624                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
625                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
626                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
627                          "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
628                          ]:
629                 sequences.update({n: "restart_node_boot"})
630
631         # update_node_config_email
632         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
633                           "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
634                           "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
635                         ]:
636                 sequences.update({n : "update_node_config_email"})
637
638         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
639                            "bminit-cfg-update-exception-nodehostname-update-debug-done", 
640                         ]:
641                 sequences.update({n : "nodenetwork_email"})
642
643         # update_bootcd_email
644         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
645                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
646                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
647                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
648                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
649                         ]:
650                 sequences.update({n : "update_bootcd_email"})
651
652         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
653                         ]:
654                 sequences.update({n: "suspect_error_email"})
655
656         # update_hardware_email
657         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
658         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
659
660         # broken_hardware_email
661         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
662
663         # bad_dns_email
664         for n in [ 
665          "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
666                 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
667                 ]:
668                 sequences.update( { n : "bad_dns_email"})
669
670         flag_set = True
671
672         
673         if s not in sequences:
674                 print "   HOST %s" % hostname
675                 print "   UNKNOWN SEQUENCE: %s" % s
676
677                 args = {}
678                 args['hostname'] = hostname
679                 args['sequence'] = s
680                 args['bmlog'] = conn.get_bootmanager_log().read()
681                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
682                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
683                 m.reset()
684                 m.send([config.cc_email]) 
685
686                 conn.restart_bootmanager('boot')
687
688                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
689                 # This way, we can check it again after we've fixed it.
690                 flag_set = False
691
692         else:
693
694                 if   sequences[s] == "restart_bootmanager_boot":
695                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
696                         conn.restart_bootmanager('boot')
697                 elif sequences[s] == "restart_bootmanager_rins":
698                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
699                         conn.restart_bootmanager('rins')
700                 elif sequences[s] == "restart_node_rins":
701                         conn.restart_node('rins')
702                 elif sequences[s] == "restart_node_boot":
703                         conn.restart_node('boot')
704                 elif sequences[s] == "repair_node_keys":
705                         if conn.compare_and_repair_nodekeys():
706                                 # the keys either are in sync or were forced in sync.
707                                 # so try to reboot the node again.
708                                 conn.restart_bootmanager('rins')
709                                 pass
710                         else:
711                                 # there was some failure to synchronize the keys.
712                                 print "...Unable to repair node keys on %s" % node
713
714                 elif sequences[s] == "suspect_error_email":
715                         args = {}
716                         args['hostname'] = hostname
717                         args['sequence'] = s
718                         args['bmlog'] = conn.get_bootmanager_log().read()
719                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
720                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
721                         m.reset()
722                         m.send([config.cc_email]) 
723
724                         conn.restart_bootmanager('boot')
725
726                 elif sequences[s] == "update_node_config_email":
727                         print "...Sending message to UPDATE NODE CONFIG"
728                         args = {}
729                         args['hostname'] = hostname
730                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
731                                                                 True, db='nodeid_persistmessages')
732                         loginbase = plc.siteId(hostname)
733                         emails = plc.getTechEmails(loginbase)
734                         m.send(emails) 
735                         conn.dump_plconf_file()
736                         conn.set_nodestate('disable')
737
738                 elif sequences[s] == "nodenetwork_email":
739                         print "...Sending message to LOOK AT NODE NETWORK"
740                         args = {}
741                         args['hostname'] = hostname
742                         args['bmlog'] = conn.get_bootmanager_log().read()
743                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
744                                                                 True, db='nodenet_persistmessages')
745                         loginbase = plc.siteId(hostname)
746                         emails = plc.getTechEmails(loginbase)
747                         m.send(emails) 
748                         conn.dump_plconf_file()
749                         conn.set_nodestate('disable')
750
751                 elif sequences[s] == "update_bootcd_email":
752                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
753                         import getconf
754                         args = {}
755                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
756                         args['hostname_list'] = "%s" % hostname
757
758                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
759                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
760
761                         loginbase = plc.siteId(hostname)
762                         emails = plc.getTechEmails(loginbase)
763                         m.send(emails) 
764
765                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
766                         conn.set_nodestate('disable')
767
768                 elif sequences[s] == "broken_hardware_email":
769                         # MAKE An ACTION record that this host has failed hardware.  May
770                         # require either an exception "/minhw" or other manual intervention.
771                         # Definitely need to send out some more EMAIL.
772                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
773                         # TODO: email notice of broken hardware
774                         args = {}
775                         args['hostname'] = hostname
776                         args['log'] = conn.get_dmesg().read()
777                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
778                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
779
780                         loginbase = plc.siteId(hostname)
781                         emails = plc.getTechEmails(loginbase)
782                         m.send(emails) 
783                         conn.set_nodestate('disable')
784
785                 elif sequences[s] == "update_hardware_email":
786                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
787                         args = {}
788                         args['hostname'] = hostname
789                         args['bmlog'] = conn.get_bootmanager_log().read()
790                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
791                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
792
793                         loginbase = plc.siteId(hostname)
794                         emails = plc.getTechEmails(loginbase)
795                         m.send(emails) 
796                         conn.set_nodestate('disable')
797
798                 elif sequences[s] == "bad_dns_email":
799                         print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
800                         args = {}
801                         try:
802                                 node = api.GetNodes(hostname)[0]
803                                 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
804                         except:
805                                 from monitor.common import email_exception
806                                 email_exception()
807                                 print traceback.print_exc()
808                                 # TODO: api error. skip email, b/c all info is not available,
809                                 # flag_set will not be recorded.
810                                 return False
811                         nodenet_str = network_config_to_str(net)
812
813                         args['hostname'] = hostname
814                         args['network_config'] = nodenet_str
815                         args['nodenetwork_id'] = net['nodenetwork_id']
816                         m = PersistMessage(hostname, mailtxt.baddns[0] % args,
817                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
818
819                         loginbase = plc.siteId(hostname)
820                         emails = plc.getTechEmails(loginbase)
821                         m.send(emails) 
822                         conn.set_nodestate('disable')
823
824         if flag_set:
825                 pflags.setRecentFlag(s)
826                 pflags.save() 
827
828         return True
829         
830
831 # MAIN -------------------------------------------------------------------
832
def main():
	"""Command-line entry point: parse options and reboot each requested node.

	The node list comes either from --nodelist (a file of hostnames) or
	--node (a single hostname); if neither is given, print usage and exit
	with status 1.  Each node is handed to reboot() together with the
	parsed option object.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	# List every option dest here so unset flags read as False rather than
	# optparse's implicit None ('nonet' was previously missing from this set).
	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
						force=None, quiet=False, nonet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")

	# Layer the shared 'nodesets'/'defaults' option groups on top, then parse.
	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	# Build the list of target hostnames; require at least one node.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		reboot(node, config)
867
# Entry point when executed as a script (not when imported as a module).
if __name__ == "__main__":
	main()