merge from improvements on the 1.0 branch:
[monitor.git] / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 from monitor import const
6 from monitor.database.info.model import *
7 from monitor.wrapper import plc
8 api = plc.getAuthAPI()
9
10 import sys
11 import os
12
13 from getsshkeys import SSHKnownHosts
14
15 import subprocess
16 import time
17 from monitor.util import command as moncommands
18 from sets import Set
19
20 from pcucontrol.transports.ssh import pxssh as pxssh
21 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
22 from pcucontrol.transports.ssh import pexpect as pexpect
23 from monitor.model import *
24 from monitor.wrapper.emailTxt import mailtxt
25 from nodeconfig import network_config_to_str
26 import traceback
27 from monitor import config
28
29 import signal
class Sopen(subprocess.Popen):
        """subprocess.Popen variant whose kill() can deliver an arbitrary signal.

        Stock Popen (on this vintage of Python) lacks a kill/send_signal
        method, so this sends the signal directly to the child's pid.
        """
        def kill(self, signal = signal.SIGTERM):
                # NOTE(review): the parameter name shadows the 'signal' module,
                # but the default is evaluated at def time, so SIGTERM resolves
                # correctly; inside the body 'signal' is the numeric signal to send.
                os.kill(self.pid, signal)
33
34 #from Rpyc import SocketConnection, Async
35 from Rpyc import SocketConnection, Async
36 from Rpyc.Utils import *
37 fb = None
38
class NodeConnection:
        """Operations on a remote node, driven over an established Rpyc connection.

        Every `self.c.modules.X` access is a remote proxy: the call executes on
        the node itself, not locally.  `node` is the node's hostname and
        `config` is the (optional) command-line config object (only its
        `quiet` flag is read here).
        """
        def __init__(self, connection, node, config):
                self.node = node        # hostname, used for log filenames and PLC calls
                self.c = connection     # Rpyc SocketConnection to the node
                self.config = config    # may be None

        def get_boot_state(self):
                """Classify the node's current state by probing its filesystem.

                Returns "dbg" (BootManager source unpacked in /tmp/source),
                "boot" (a deployed node with /vservers), or "unknown".
                """
                if self.c.modules.os.path.exists('/tmp/source'):
                        return "dbg"
                elif self.c.modules.os.path.exists('/vservers'): 
                        return "boot"
                else:
                        return "unknown"

        def get_dmesg(self):
                """Dump dmesg on the node, download it, and return an open local file.

                Caller is responsible for closing the returned file object.
                """
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
                # 'download' comes from Rpyc.Utils (star-imported at module level).
                download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
                log = open("log/dmesg.%s.log" % self.node, 'r')
                return log

        def get_bootmanager_log(self):
                """Download the node's gzipped BootManager log and return it decompressed.

                Returns an open local file object on the unpacked log; caller closes it.
                """
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
                # Decompress locally; zcat leaves the .gz in place.
                os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log

        def dump_plconf_file(self):
                """Run BootManager's config-reading steps remotely and print the result.

                Instantiates BootManager inside the node's /tmp/source tree, runs
                InitializeBootManager + ReadNodeConfiguration, and (unless
                config.quiet) prints every VARS key/value it recovered.
                """
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                # Remote classes resolved through the proxy; BootManagerException
                # is bound but unused here (kept for parity with the sibling method).
                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "   ERROR:", x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue and self.config and not self.config.quiet:
                        for key in bm.VARS.keys():
                                print key, " == ", bm.VARS[key]
                else:
                        # NOTE(review): this message also fires when the config WAS
                        # read but quiet-mode suppressed the dump above — slightly
                        # misleading, left as-is.
                        if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
                

        def compare_and_repair_nodekeys(self):
                """Compare the node's on-disk NODE_KEY with PLC's record; sync on mismatch.

                Reads the key via BootManager's ReadNodeConfiguration on the node,
                fetches PLC's record, and pushes the node's key to PLC when they
                differ.  Returns True when the keys match or the update succeeded,
                False on update failure, None when the key could not be read.
                """
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                # PLC-side view of this node (first match on hostname).
                plcnode = api.GetNodes({'hostname': self.node}, None)[0]

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "exception"
                        print x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue:
                        print "   NODE: %s" % bm.VARS['NODE_KEY']
                        print "   PLC : %s" % plcnode['key']

                        if bm.VARS['NODE_KEY'] == plcnode['key']:
                                return True
                        else:
                                # The node's key wins: overwrite PLC's record with it.
                                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                                        print "   Successfully updated NODE_KEY with PLC"
                                        return True
                                else:
                                        return False
                                
                        #for key in bm.VARS.keys():
                        #       print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to retrieve NODE_KEY"

        def bootmanager_running(self):
                """Return True iff BootManager's /tmp/BM_RUNNING marker exists on the node."""
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        return True
                else:
                        return False

        def set_nodestate(self, state='boot'):
                """Set the node's boot_state at PLC; returns the UpdateNode result."""
                return api.UpdateNode(self.node, {'boot_state' : state})

        def restart_node(self, state='boot'):
                """Reboot the node into `state`, escalating if a recent attempt exists.

                First attempt (no recent 'gentlekill' flag): kill all slice
                processes, then a gentle `shutdown -r +1`, and record the flag
                (1-day window).  If the flag is already set, force a reboot via
                sysrq s/u/b (sync, remount-ro, reboot).
                """
                api.UpdateNode(self.node, {'boot_state' : state})

                pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
                if not pflags.getRecentFlag('gentlekill'):
                        print "   Killing all slice processes... : %s" %  self.node
                        cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
                        self.c.modules.os.system(cmd_slicekill)
                        cmd = """ shutdown -r +1 & """
                        print "   Restarting %s : %s" % ( self.node, cmd)
                        self.c.modules.os.system(cmd)

                        pflags.setRecentFlag('gentlekill')
                        pflags.save()
                else:
                        print "   Restarting with sysrq 'sub' %s" % self.node
                        cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
                        self.c.modules.os.system(cmd)

                return

        def restart_bootmanager(self, forceState):
                """Launch BootManager.py on the node with the given forced state.

                No-op (with a message) when /tmp/BM_RUNNING shows an instance is
                already active; otherwise starts it in the background behind the
                BM_RUNNING marker file.
                """
                self.c.modules.os.chdir('/tmp/source')
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        print "   BootManager is already running: try again soon..."
                else:
                        print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                              "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                              "  rm -f /tmp/BM_RUNNING " + \
                              ") &" 
                        cmd = cmd % forceState
                        self.c.modules.os.system(cmd)

                return 
178
179
180 import random
class PlanetLabSession:
        """Manage an ssh tunnel to a node's Rpyc server.

        setup_host() copies the Rpyc tree to the node, (re)starts a forking
        Rpyc server there, and opens a local port-forward so that
        get_connection() can hand back a NodeConnection over localhost.
        """
        # Shared counter so concurrent sessions get distinct local ports;
        # randomized base reduces collisions across monitor processes.
        globalport = 22000 + int(random.random()*1000)

        def __init__(self, node, nosetup, verbose):
                self.verbose = verbose
                self.node = node        # hostname
                self.port = None        # local forwarded port, assigned in setup_host()
                self.nosetup = nosetup  # skip rsync/server start when True
                self.command = None     # the Sopen holding the ssh tunnel process
                self.setup_host()

        def get_connection(self, config):
                """Return a NodeConnection speaking Rpyc through the local tunnel."""
                return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        
        def setup_host(self):
                """Provision the node and establish the ssh tunnel.

                Raises Exception when login fails twice (after refreshing the
                known_hosts entry) or when the tunnel cannot be established.
                """
                # Claim a unique local port for this session.
                self.port = PlanetLabSession.globalport
                PlanetLabSession.globalport = PlanetLabSession.globalport + 1

                args = {}
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
                args['monitordir'] = config.MONITOR_SCRIPT_ROOT
                ssh_port = 22

                if self.nosetup:
                        print "Skipping setup"
                        return 

                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
                # TODO: Add timeout
                timeout = 120
                localos = moncommands.CMD()

                ret = localos.system(cmd, timeout)
                print ret
                if ret != 0:
                        # First failure is assumed to be a stale/missing host key:
                        # refresh known_hosts and retry once.
                        print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
                        #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
                        ret = localos.system(cmd, timeout)
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
                                raise Exception("Failed twice trying to login with updated ssh host key")

                t1 = time.time()
                # KILL any already running servers, then start a fresh Rpyc
                # forking server on the node (all in one remote shell heredoc).
                ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
                (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
                #cmd = """ssh %(user)s@%(hostname)s """ + \
                #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
                #cmd = cmd % args
                #if self.verbose: print cmd
                ## TODO: Add timeout
                #print localos.system(cmd,timeout)

                ## START a new rpyc server.
                #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
                #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
                #cmd = cmd % args
                #if self.verbose: print cmd
                #print localos.system(cmd,timeout)
                print ssh.ret

                # TODO: Add timeout
                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
                # and the following options seems to work well.
                # LocalCommand prints "READY" on our side once the forward is up,
                # which is what we poll for below.
                cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
                      """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
                      """-o ConnectTimeout=120 """ + \
                      """-n -N -L %(port)s:localhost:18812 """ + \
                      """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # TODO: the read() here may block indefinitely.  Need a better
                # approach therefore, that includes a timeout.
                #ret = self.command.stdout.read(5)
                ret = moncommands.read_t(self.command.stdout, 5)

                t2 = time.time()
                if 'READY' in ret:
                        # NOTE: There is still a slight race for machines that are slow...
                        # Wait twice as long as the setup took, as a heuristic for
                        # how slow this particular node is.
                        self.timeout = 2*(t2-t1)
                        print "Sleeping for %s sec" % self.timeout
                        time.sleep(self.timeout)
                        return

                if self.command.returncode is not None:
                        print "Failed to establish tunnel!"
                        raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

                raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        def __del__(self):
                # Tear down the tunnel process when the session is garbage-collected.
                if self.command:
                        if self.verbose: print "Killing SSH session %s" % self.port
                        self.command.kill()
292
293
def steps_to_list(steps):
        """Return just the pattern strings from a list of (id, pattern) steps.

        `steps` is a sequence of 2-tuples as fed to pexpect: the first element
        is the short step identifier, the second the regex/label to match.
        Returns the labels in order, suitable for child.expect().
        """
        # Comprehension instead of a manual append loop; the throwaway name no
        # longer shadows the builtin id().
        return [label for _step_id, label in steps]
299
def index_to_id(steps, index):
        """Map a pexpect match index back to its step identifier.

        `index` is what child.expect() returned against steps_to_list(steps)
        plus a trailing EOF pattern; an index past the end of `steps` (the EOF
        slot) yields the sentinel "done".
        """
        # Guard clause for the EOF sentinel, then a direct tuple lookup.
        if index >= len(steps):
                return "done"
        return steps[index][0]
305
306 def reboot(hostname, config=None, forced_action=None):
307
308         # NOTE: Nothing works if the bootcd is REALLY old.
309         #       So, this is the first step.
310         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
311         if fbnode['category'] == "OLDBOOTCD":
312                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
313                 args = {}
314                 args['hostname_list'] = "    %s" % hostname
315
316                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
317                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
318
319                 loginbase = plc.siteId(hostname)
320                 emails = plc.getTechEmails(loginbase)
321                 m.send(emails) 
322
323                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
324                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
325                 return True
326
327         node = hostname
328         print "Creating session for %s" % node
329         # update known_hosts file (in case the node has rebooted since last run)
330         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
331         try:
332                 k = SSHKnownHosts(); k.update(node); k.write(); del k
333         except:
334                 print traceback.print_exc()
335                 return False
336
337         try:
338                 if config == None:
339                         session = PlanetLabSession(node, False, True)
340                 else:
341                         session = PlanetLabSession(node, config.nosetup, config.verbose)
342         except Exception, e:
343                 print "ERROR setting up session for %s" % hostname
344                 print traceback.print_exc()
345                 print e
346                 return False
347
348         try:
349                 conn = session.get_connection(config)
350         except EOFError:
351                 # NOTE: sometimes the wait in setup_host() is not long enough.  
352                 # So, here we try to wait a little longer before giving up entirely.
353                 try:
354                         time.sleep(session.timeout*4)
355                         conn = session.get_connection(config)
356                 except:
357                         print traceback.print_exc()
358                         return False
359
360         if forced_action == "reboot":
361                 conn.restart_node('rins')
362                 return True
363
364         boot_state = conn.get_boot_state()
365         if boot_state == "boot":
366                 print "...Boot state of %s already completed : skipping..." % node
367                 return True
368         elif boot_state == "unknown":
369                 print "...Unknown bootstate for %s : skipping..."% node
370                 return False
371         else:
372                 pass
373
374         if conn.bootmanager_running():
375                 print "...BootManager is currently running.  Skipping host %s" % node
376                 return True
377
378         #if config != None:
379         #       if config.force:
380         #               conn.restart_bootmanager(config.force)
381         #               return True
382
383         # Read persistent flags, tagged on one week intervals.
384         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
385                 
386
387         if config and not config.quiet: print "...downloading dmesg from %s" % node
388         dmesg = conn.get_dmesg()
389         child = fdpexpect.fdspawn(dmesg)
390
391         sequence = []
392         while True:
393                 steps = [
394                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
395                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
396                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
397
398                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
399
400                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
401                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
402
403                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
404                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
405
406                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
407                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
408
409                         ('floppytimeout','floppy0: floppy timeout called'),
410                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
411
412                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
413                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
414
415                         # floppy0: floppy timeout called
416                         # end_request: I/O error, dev fd0, sector 0
417
418                         # Buffer I/O error on device dm-2, logical block 8888896
419                         # ata1: status=0x51 { DriveReady SeekComplete Error }
420                         # ata1: error=0x40 { UncorrectableError }
421                         # SCSI error : <0 0 0 0> return code = 0x8000002
422                         # sda: Current: sense key: Medium Error
423                         #       Additional sense: Unrecovered read error - auto reallocate failed
424
425                         # SCSI error : <0 2 0 0> return code = 0x40001
426                         # end_request: I/O error, dev sda, sector 572489600
427                 ]
428                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
429                 sequence.append(id)
430
431                 if id == "done":
432                         break
433
434         s = Set(sequence)
435         if config and not config.quiet: print "\tSET: ", s
436
437         if len(s) > 1:
438                 print "...Potential drive errors on %s" % node
439                 if len(s) == 2 and 'floppyerror' in s:
440                         print "...Should investigate.  Continuing with node."
441                 else:
442                         print "...Should investigate.  Skipping node."
443                         # TODO: send message related to these errors.
444                         args = {}
445                         args['hostname'] = hostname
446                         args['log'] = conn.get_dmesg().read()
447
448                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
449                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
450
451                         loginbase = plc.siteId(hostname)
452                         emails = plc.getTechEmails(loginbase)
453                         m.send(emails) 
454                         conn.set_nodestate('disable')
455                         return False
456
457         print "...Downloading bm.log from %s" % node
458         log = conn.get_bootmanager_log()
459         child = fdpexpect.fdspawn(log)
460
461         try:
462                 if config.collect: return True
463         except:
464                 pass
465
466         time.sleep(1)
467
468         if config and not config.quiet: print "...Scanning bm.log for errors"
469         action_id = "dbg"
470         sequence = []
471         while True:
472
473                 steps = [
474                         ('bminit'               , 'Initializing the BootManager.'),
475                         ('cfg'                  , 'Reading node configuration file.'),
476                         ('auth'                 , 'Authenticating node with PLC.'),
477                         ('getplc'               , 'Retrieving details of node from PLC.'),
478                         ('update'               , 'Updating node boot state at PLC.'),
479                         ('hardware'             , 'Checking if hardware requirements met.'),
480                         ('installinit'  , 'Install: Initializing.'),
481                         ('installdisk'  , 'Install: partitioning disks.'),
482                         ('installbootfs', 'Install: bootstrapfs tarball.'),
483                         ('installcfg'   , 'Install: Writing configuration files.'),
484                         ('installstop'  , 'Install: Shutting down installer.'),
485                         ('update2'              , 'Updating node boot state at PLC.'),
486                         ('installinit2' , 'Install: Initializing.'),
487                         ('validate'             , 'Validating node installation.'),
488                         ('rebuildinitrd', 'Rebuilding initrd'),
489                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
490                         ('update3'              , 'Updating node configuration.'),
491                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
492                         ('update4'              , 'Sending hardware configuration to PLC.'),
493                         ('debug'                , 'Starting debug mode'),
494                         ('bmexceptmount', 'BootManagerException during mount'),
495                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
496                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
497                         ('exception'    , 'Exception'),
498                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
499                         ('protoerror'   , 'XML RPC protocol error'),
500                         ('nodehostname' , 'Configured node hostname does not resolve'),
501                         ('implementerror', 'Implementation Error'),
502                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
503                         ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
504                         ('noinstall'    , 'notinstalled'),
505                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
506                         ('noblockdev'   , "No block devices detected."),
507                         ('dnserror'     , 'Name or service not known'),
508                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
509                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
510                         ('hardwarerequirefail' , 'Hardware requirements not met'),
511                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
512                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
513                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
514                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
515                         ('modulefail'   , 'Unable to get list of system modules'),
516                         ('writeerror'   , 'write error: No space left on device'),
517                         ('nospace'      , "No space left on device"),
518                         ('nonode'       , 'Failed to authenticate call: No such node'),
519                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
520                         ('bootcheckfail'     , 'BootCheckAuthentication'),
521                         ('bootupdatefail'   , 'BootUpdateNode'),
522                 ]
523                 list = steps_to_list(steps)
524                 index = child.expect( list + [ pexpect.EOF ])
525                 id = index_to_id(steps,index)
526                 sequence.append(id)
527
528                 if id == "exception":
529                         if config and not config.quiet: print "...Found An Exception!!!"
530                 elif index == len(list):
531                         #print "Reached EOF"
532                         break
533                 
534         s = "-".join(sequence)
535         print "   FOUND SEQUENCE: ", s
536
537         # NOTE: We get or set the flag based on the current sequence identifier.
538         #  By using the sequence identifier, we guarantee that there will be no
539         #  frequent loops.  I'm guessing there is a better way to track loops,
540         #  though.
541         #if not config.force and pflags.getRecentFlag(s):
542         #       pflags.setRecentFlag(s)
543         #       pflags.save() 
544         #       print "... flag is set or it has already run recently. Skipping %s" % node
545         #       return True
546
547         sequences = {}
548
549
550         # restart_bootmanager_boot
551         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
552                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
553                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
554
555                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
556
557                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
558                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
559                         "bminit-cfg-auth-getplc-update-debug-done",
560                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
561                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
562                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
563                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
564                         "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
565                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
566                         ]:
567                 sequences.update({n : "restart_bootmanager_boot"})
568
569         #       conn.restart_bootmanager('rins')
570         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
571                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
572                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
573                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
574                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
575                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
576                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
577                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
578                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
579                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
580                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
581                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
582                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
583                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
584                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
585                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
586                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
587                         # actual solution appears to involve removing the bad files, and
588                         # continually trying to boot the node.
589                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
590                         ]:
591                 sequences.update({n : "restart_bootmanager_rins"})
592
593         # repair_node_keys
594         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
595
596         #   conn.restart_node('rins')
597         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
598                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
599                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
600                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
601                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
602                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
603                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
604                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
605                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
606                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
607                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
608                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
609                         ]:
610                 sequences.update({n : "restart_node_rins"})
611
612         #       restart_node_boot
613         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
614                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
615                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
616                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
617                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
618                          "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
619                          ]:
620                 sequences.update({n: "restart_node_boot"})
621
622         # update_node_config_email
623         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
624                           "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
625                           "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
626                         ]:
627                 sequences.update({n : "update_node_config_email"})
628
629         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
630                            "bminit-cfg-update-exception-nodehostname-update-debug-done", 
631                         ]:
632                 sequences.update({n : "nodenetwork_email"})
633
634         # update_bootcd_email
635         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
636                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
637                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
638                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
639                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
640                         ]:
641                 sequences.update({n : "update_bootcd_email"})
642
643         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
644                         ]:
645                 sequences.update({n: "suspect_error_email"})
646
647         # update_hardware_email
648         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
649         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
650
651         # broken_hardware_email
652         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
653
654         # bad_dns_email
655         for n in [ 
656          "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
657                 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
658                 ]:
659                 sequences.update( { n : "bad_dns_email"})
660
661         flag_set = True
662
663         
664         if s not in sequences:
665                 print "   HOST %s" % hostname
666                 print "   UNKNOWN SEQUENCE: %s" % s
667
668                 args = {}
669                 args['hostname'] = hostname
670                 args['sequence'] = s
671                 args['bmlog'] = conn.get_bootmanager_log().read()
672                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
673                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
674                 m.reset()
675                 m.send([config.cc_email]) 
676
677                 conn.restart_bootmanager('boot')
678
679                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
680                 # This way, we can check it again after we've fixed it.
681                 flag_set = False
682
683         else:
684
685                 if   sequences[s] == "restart_bootmanager_boot":
686                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
687                         conn.restart_bootmanager('boot')
688                 elif sequences[s] == "restart_bootmanager_rins":
689                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
690                         conn.restart_bootmanager('rins')
691                 elif sequences[s] == "restart_node_rins":
692                         conn.restart_node('rins')
693                 elif sequences[s] == "restart_node_boot":
694                         conn.restart_node('boot')
695                 elif sequences[s] == "repair_node_keys":
696                         if conn.compare_and_repair_nodekeys():
697                                 # the keys either are in sync or were forced in sync.
698                                 # so try to reboot the node again.
699                                 conn.restart_bootmanager('rins')
700                                 pass
701                         else:
702                                 # there was some failure to synchronize the keys.
703                                 print "...Unable to repair node keys on %s" % node
704
705                 elif sequences[s] == "suspect_error_email":
706                         args = {}
707                         args['hostname'] = hostname
708                         args['sequence'] = s
709                         args['bmlog'] = conn.get_bootmanager_log().read()
710                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
711                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
712                         m.reset()
713                         m.send([config.cc_email]) 
714
715                         conn.restart_bootmanager('boot')
716
717                 elif sequences[s] == "update_node_config_email":
718                         print "...Sending message to UPDATE NODE CONFIG"
719                         args = {}
720                         args['hostname'] = hostname
721                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
722                                                                 True, db='nodeid_persistmessages')
723                         loginbase = plc.siteId(hostname)
724                         emails = plc.getTechEmails(loginbase)
725                         m.send(emails) 
726                         conn.dump_plconf_file()
727                         conn.set_nodestate('disable')
728
729                 elif sequences[s] == "nodenetwork_email":
730                         print "...Sending message to LOOK AT NODE NETWORK"
731                         args = {}
732                         args['hostname'] = hostname
733                         args['bmlog'] = conn.get_bootmanager_log().read()
734                         m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
735                                                                 True, db='nodenet_persistmessages')
736                         loginbase = plc.siteId(hostname)
737                         emails = plc.getTechEmails(loginbase)
738                         m.send(emails) 
739                         conn.dump_plconf_file()
740                         conn.set_nodestate('disable')
741
742                 elif sequences[s] == "update_bootcd_email":
743                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
744                         import getconf
745                         args = {}
746                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
747                         args['hostname_list'] = "%s" % hostname
748
749                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
750                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
751
752                         loginbase = plc.siteId(hostname)
753                         emails = plc.getTechEmails(loginbase)
754                         m.send(emails) 
755
756                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
757                         conn.set_nodestate('disable')
758
759                 elif sequences[s] == "broken_hardware_email":
760                         # MAKE An ACTION record that this host has failed hardware.  May
761                         # require either an exception "/minhw" or other manual intervention.
762                         # Definitely need to send out some more EMAIL.
763                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
764                         # TODO: email notice of broken hardware
765                         args = {}
766                         args['hostname'] = hostname
767                         args['log'] = conn.get_dmesg().read()
768                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
769                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
770
771                         loginbase = plc.siteId(hostname)
772                         emails = plc.getTechEmails(loginbase)
773                         m.send(emails) 
774                         conn.set_nodestate('disable')
775
776                 elif sequences[s] == "update_hardware_email":
777                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
778                         args = {}
779                         args['hostname'] = hostname
780                         args['bmlog'] = conn.get_bootmanager_log().read()
781                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
782                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
783
784                         loginbase = plc.siteId(hostname)
785                         emails = plc.getTechEmails(loginbase)
786                         m.send(emails) 
787                         conn.set_nodestate('disable')
788
789                 elif sequences[s] == "bad_dns_email":
790                         print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
791                         args = {}
792                         try:
793                                 node = api.GetNodes(hostname)[0]
794                                 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
795                         except:
796                                 print traceback.print_exc()
797                                 # TODO: api error. skip email, b/c all info is not available,
798                                 # flag_set will not be recorded.
799                                 return False
800                         nodenet_str = network_config_to_str(net)
801
802                         args['hostname'] = hostname
803                         args['network_config'] = nodenet_str
804                         args['nodenetwork_id'] = net['nodenetwork_id']
805                         m = PersistMessage(hostname, mailtxt.baddns[0] % args,
806                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
807
808                         loginbase = plc.siteId(hostname)
809                         emails = plc.getTechEmails(loginbase)
810                         m.send(emails) 
811                         conn.set_nodestate('disable')
812
813         if flag_set:
814                 pflags.setRecentFlag(s)
815                 pflags.save() 
816
817         return True
818         
819
820 # MAIN -------------------------------------------------------------------
821
def main():
	"""Command-line entry point: parse options and reboot the given nodes.

	Accepts a list of hostnames from a file (--nodelist) or a single
	host (--node); for each hostname, calls reboot(node, config).
	Prints usage and exits with status 1 when no nodes are specified.
	"""
	from monitor import parser as parsermodule
	parser = parsermodule.getParser()

	# Default every boolean flag explicitly so later code can test them
	# without attribute errors.  'nonet' is included too: optparse would
	# otherwise leave it as None instead of False when the flag is absent
	# (both are falsy, so this is behavior-compatible).
	parser.set_defaults(child=False, collect=False, nosetup=False, nonet=False,
						verbose=False, force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")

	# Layer the shared 'nodesets'/'defaults' option groups on top of ours.
	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	# Build the list of hostnames to act on.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		reboot(node, config)
856
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
	main()