# bootman.py -- from the PlanetLab "monitor.git" repository
# (gitweb listing header removed; see also nagios/plc2nagios.py in the same repo)
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import plc
6 import auth
7 api = plc.PLC(auth.auth, auth.plc)
8
9 import sys
10 import os
11 import policy
12
13 from getsshkeys import SSHKnownHosts
14
15 import subprocess
16 import time
17 import soltesz
18 from sets import Set
19
20 import ssh.pxssh as pxssh
21 import ssh.fdpexpect as fdpexpect
22 import ssh.pexpect as pexpect
23 from unified_model import *
24 from emailTxt import mailtxt
25
26 import signal
class Sopen(subprocess.Popen):
        # subprocess.Popen subclass that adds a kill() helper (Popen only grew a
        # built-in kill() in Python 2.6).  Note the parameter deliberately
        # shadows the 'signal' module inside the method body; the module itself
        # is only reachable here through the default value expression.
        def kill(self, signal = signal.SIGTERM):
                # Deliver the signal straight to the child pid; does not wait().
                os.kill(self.pid, signal)
30
31 #from Rpyc import SocketConnection, Async
32 from Rpyc import SocketConnection, Async
33 from Rpyc.Utils import *
34
def get_fbnode(node):
        """Return the field-value dictionary for *node* from the cached
        'findbad' database."""
        findbad_db = soltesz.dbLoad("findbad")
        return findbad_db['nodes'][node]['values']
39
class NodeConnection:
        """Remote-control handle for a single PlanetLab node, backed by Rpyc.

        Every ``self.c.modules.X`` attribute access is an Rpyc proxy: the call
        executes in the Python interpreter running on the *remote* node, not
        locally.  ``download`` (from ``Rpyc.Utils``, imported via ``*``) copies
        a remote file to a local path.
        """

        def __init__(self, connection, node, config):
                # connection -- an Rpyc SocketConnection to the node
                # node       -- hostname; used in log paths and PLC API calls
                # config     -- options object; only .quiet is read here (may be None)
                self.node = node
                self.c = connection
                self.config = config

        def get_boot_state(self):
                # Heuristic classification: /tmp/source exists while the unpacked
                # BootManager environment (debug boot) is present; /vservers
                # exists on a normally-booted, installed node.
                if self.c.modules.os.path.exists('/tmp/source'):
                        return "dbg"
                elif self.c.modules.os.path.exists('/vservers'): 
                        return "boot"
                else:
                        return "unknown"

        def get_dmesg(self):
                # Capture the remote kernel ring buffer to a file, download it,
                # and return an open read handle on the local copy (caller is
                # responsible for closing it).
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
                download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
                log = open("log/dmesg.%s.log" % self.node, 'r')
                return log

        def get_bootmanager_log(self):
                # /tmp/bm.log on the node is gzip-compressed; decompress locally
                # with zcat and return a handle on the plain-text copy.
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
                os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log

        def dump_plconf_file(self):
                """Run BootManager's initialization and node-configuration steps
                remotely and print the resulting VARS dictionary (unless quiet)."""
                c = self.c
                # Make the unpacked BootManager source importable on the node.
                c.modules.sys.path.append("/tmp/source/")
                c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "   ERROR:", x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue and self.config and not self.config.quiet:
                        for key in bm.VARS.keys():
                                print key, " == ", bm.VARS[key]
                else:
                        if self.config and not self.config.quiet: print "   Unable to read Node Configuration"


        def compare_and_repair_nodekeys(self):
                """Compare the node's on-disk NODE_KEY with PLC's record, pushing
                the node's key to PLC when they differ.

                Returns True on match or successful update, False when the PLC
                update fails; falls through (returning None) when the node
                configuration could not be read."""
                c = self.c
                c.modules.sys.path.append("/tmp/source/")
                c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                plcnode = api.GetNodes({'hostname': self.node}, None)[0]

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "exception"
                        print x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue:
                        print "   NODE: %s" % bm.VARS['NODE_KEY']
                        print "   PLC : %s" % plcnode['key']

                        if bm.VARS['NODE_KEY'] == plcnode['key']:
                                return True
                        else:
                                # The node's on-disk key wins: record it at PLC.
                                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                                        print "   Successfully updated NODE_KEY with PLC"
                                        return True
                                else:
                                        return False

                        #for key in bm.VARS.keys():
                        #       print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to retrieve NODE_KEY"

        def bootmanager_running(self):
                # /tmp/BM_RUNNING is the flag file managed by restart_bootmanager().
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        return True
                else:
                        return False

        def set_nodestate(self, state='boot'):
                # Record the desired boot_state at PLC; returns the API result.
                return api.UpdateNode(self.node, {'boot_state' : state})

        def restart_node(self, state='boot'):
                """Set the PLC boot_state, then reboot the node: gently the first
                time within 24h (kill slice processes, scheduled shutdown), and
                via sysrq 's'/'u'/'b' (sync, remount read-only, reboot) on a
                repeat attempt."""
                api.UpdateNode(self.node, {'boot_state' : state})

                # The 'gentlekill' flag expires after one day (1*60*60*24 s), so
                # the harsh path is only taken for a recently-retried node.
                pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
                if not pflags.getRecentFlag('gentlekill'):
                        print "   Killing all slice processes... : %s" %  self.node
                        cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
                        self.c.modules.os.system(cmd_slicekill)
                        cmd = """ shutdown -r +1 & """
                        print "   Restarting %s : %s" % ( self.node, cmd)
                        self.c.modules.os.system(cmd)

                        pflags.setRecentFlag('gentlekill')
                        pflags.save()
                else:
                        print "   Restarting with sysrq 'sub' %s" % self.node
                        cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
                        self.c.modules.os.system(cmd)

                return

        def restart_bootmanager(self, forceState):
                """Launch 'BootManager.py <forceState>' on the node in the
                background, unless one is already running (guarded by the
                /tmp/BM_RUNNING flag file)."""
                self.c.modules.os.chdir('/tmp/source')
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        print "   BootManager is already running: try again soon..."
                else:
                        print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
                        # The flag file brackets the whole run, so that
                        # bootmanager_running() reflects its full lifetime.
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                              "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                                  "  rm -f /tmp/BM_RUNNING " + \
                                  ") &" 
                        cmd = cmd % forceState
                        self.c.modules.os.system(cmd)

                return 
179
180
181 import random
class PlanetLabSession:
        """Sets up everything needed to speak Rpyc to a node: rsyncs the Rpyc
        package to the host, (re)starts the remote forking server, and keeps an
        ssh tunnel forwarding a local port to the server's port 18812."""

        # Base local port for tunnels, randomized so concurrent runs of this
        # script are unlikely to collide; incremented once per session.
        globalport = 22000 + int(random.random()*1000)

        def __init__(self, node, nosetup, verbose):
                # node    -- hostname to connect to
                # nosetup -- skip the rsync/remote-server setup entirely
                # verbose -- echo the shell commands being executed
                self.verbose = verbose
                self.node = node
                self.port = None        # local end of the ssh tunnel
                self.nosetup = nosetup
                self.command = None     # Sopen handle on the tunnel process
                self.setup_host()

        def get_connection(self, config):
                # Connect through the local tunnel endpoint opened by setup_host().
                return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
        
        def setup_host(self):
                """Push Rpyc to the node, restart its server, open the tunnel.

                Raises Exception when the rsync fails twice (even after a
                known_hosts refresh) or when the ssh tunnel cannot be
                established."""
                # Claim a unique local port for this session.
                self.port = PlanetLabSession.globalport
                PlanetLabSession.globalport = PlanetLabSession.globalport + 1

                args = {}
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
                args['monitordir'] = "/home/soltesz/monitor"
                ssh_port = 22

                if self.nosetup:
                        # NOTE(review): self.timeout is never set on this path, yet
                        # callers (e.g. reboot()) read session.timeout -- confirm.
                        print "Skipping setup"
                        return 

                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
                # TODO: Add timeout
                timeout = 120
                localos = soltesz.CMD()

                ret = localos.system(cmd, timeout)
                print ret
                if ret != 0:
                        # rsync failed -- most likely a stale or unknown host key.
                        # Refresh known_hosts for this node and retry once.
                        print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
                        #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
                        ret = localos.system(cmd, timeout)
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
                                raise Exception("Failed twice trying to login with updated ssh host key")

                t1 = time.time()
                # KILL any already running servers, then start a fresh Rpyc
                # forking server on the node (the heredoc runs remotely via ssh).
                ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port)
                (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
                #cmd = """ssh %(user)s@%(hostname)s """ + \
                #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
                #cmd = cmd % args
                #if self.verbose: print cmd
                ## TODO: Add timeout
                #print localos.system(cmd,timeout)

                ## START a new rpyc server.
                #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
                #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
                #cmd = cmd % args
                #if self.verbose: print cmd
                #print localos.system(cmd,timeout)
                print ssh.ret

                # TODO: Add timeout
                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
                # and the following options seems to work well.
                # LocalCommand prints READY on *our* side once the forward is up;
                # that is what the read below waits for.
                cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
                          """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
                          """-o ConnectTimeout=120 """ + \
                          """-n -N -L %(port)s:localhost:18812 """ + \
                          """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # TODO: the read() here may block indefinitely.  Need a better
                # approach therefore, that includes a timeout.
                #ret = self.command.stdout.read(5)
                ret = soltesz.read_t(self.command.stdout, 5)

                t2 = time.time()
                if 'READY' in ret:
                        # NOTE: There is still a slight race for machines that are slow...
                        # Give the remote server roughly twice the setup time to start.
                        self.timeout = 2*(t2-t1)
                        print "Sleeping for %s sec" % self.timeout
                        time.sleep(self.timeout)
                        return

                if self.command.returncode is not None:
                        # The tunnel process already exited: the forward failed outright.
                        print "Failed to establish tunnel!"
                        raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

                raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        def __del__(self):
                # Tear the tunnel down when the session is garbage-collected.
                if self.command:
                        if self.verbose: print "Killing SSH session %s" % self.port
                        self.command.kill()
293
294
def steps_to_list(steps):
        """Return just the pattern strings from a list of (id, pattern) pairs,
        preserving order -- the shape pexpect's expect() wants."""
        return [pattern for (_step_id, pattern) in steps]
300
def index_to_id(steps, index):
        """Map a pexpect match index back to its step id.

        An index past the end of *steps* corresponds to the extra EOF pattern
        appended by the caller, so it maps to the sentinel "done"."""
        if index >= len(steps):
                return "done"
        return steps[index][0]
306
307 def reboot(hostname, config=None, forced_action=None):
308
309         # NOTE: Nothing works if the bootcd is REALLY old.
310         #       So, this is the first step.
311         fbnode = get_fbnode(hostname)
312         if fbnode['category'] == "OLDBOOTCD":
313                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
314                 args = {}
315                 args['hostname_list'] = "    %s" % hostname
316
317                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
318                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
319
320                 loginbase = plc.siteId(hostname)
321                 m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
322
323                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
324                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
325                 return True
326
327         node = hostname
328         print "Creating session for %s" % node
329         # update known_hosts file (in case the node has rebooted since last run)
330         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
331         try:
332                 k = SSHKnownHosts(); k.update(node); k.write(); del k
333         except:
334                 import traceback; print traceback.print_exc()
335                 return False
336
337         try:
338                 if config == None:
339                         session = PlanetLabSession(node, False, True)
340                 else:
341                         session = PlanetLabSession(node, config.nosetup, config.verbose)
342         except Exception, e:
343                 print "ERROR setting up session for %s" % hostname
344                 import traceback; print traceback.print_exc()
345                 print e
346                 return False
347
348         try:
349                 conn = session.get_connection(config)
350         except EOFError:
351                 # NOTE: sometimes the wait in setup_host() is not long enough.  
352                 # So, here we try to wait a little longer before giving up entirely.
353                 try:
354                         time.sleep(session.timeout*4)
355                         conn = session.get_connection(config)
356                 except:
357                         import traceback; print traceback.print_exc()
358                         return False
359                         
360
361         if forced_action == "reboot":
362                 conn.restart_node('rins')
363                 return True
364
365         boot_state = conn.get_boot_state()
366         if boot_state == "boot":
367                 print "...Boot state of %s already completed : skipping..." % node
368                 return True
369         elif boot_state == "unknown":
370                 print "...Unknown bootstate for %s : skipping..."% node
371                 return False
372         else:
373                 pass
374
375         if conn.bootmanager_running():
376                 print "...BootManager is currently running.  Skipping host %s" % node
377                 return True
378
379         #if config != None:
380         #       if config.force:
381         #               conn.restart_bootmanager(config.force)
382         #               return True
383
384         # Read persistent flags, tagged on one week intervals.
385         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
386                 
387
388         if config and not config.quiet: print "...downloading dmesg from %s" % node
389         dmesg = conn.get_dmesg()
390         child = fdpexpect.fdspawn(dmesg)
391
392         sequence = []
393         while True:
394                 steps = [
395                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
396                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
397                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
398
399                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
400                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
401                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
402                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
403                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
404                         ('floppytimeout','floppy0: floppy timeout called'),
405                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
406
407                         # floppy0: floppy timeout called
408                         # end_request: I/O error, dev fd0, sector 0
409
410                         #Buffer I/O error on device dm-2, logical block 8888896
411                         #ata1: status=0x51 { DriveReady SeekComplete Error }
412                         #ata1: error=0x40 { UncorrectableError }
413                         #SCSI error : <0 0 0 0> return code = 0x8000002
414                         #sda: Current: sense key: Medium Error
415                         #       Additional sense: Unrecovered read error - auto reallocate failed
416
417                         #SCSI error : <0 2 0 0> return code = 0x40001
418                         #end_request: I/O error, dev sda, sector 572489600
419                 ]
420                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
421                 sequence.append(id)
422
423                 if id == "done":
424                         break
425
426         s = Set(sequence)
427         if config and not config.quiet: print "\tSET: ", s
428
429         if len(s) > 1:
430                 print "...Potential drive errors on %s" % node
431                 if len(s) == 2 and 'floppyerror' in s:
432                         print "...Should investigate.  Continuing with node."
433                 else:
434                         print "...Should investigate.  Skipping node."
435                         # TODO: send message related to these errors.
436                         args = {}
437                         args['hostname'] = hostname
438                         args['log'] = conn.get_dmesg().read()
439
440                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
441                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
442
443                         loginbase = plc.siteId(hostname)
444                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
445                         conn.set_nodestate('diag')
446                         return False
447
448         print "...Downloading bm.log from %s" % node
449         log = conn.get_bootmanager_log()
450         child = fdpexpect.fdspawn(log)
451
452         try:
453                 if config.collect: return True
454         except:
455                 pass
456
457         time.sleep(1)
458
459         if config and not config.quiet: print "...Scanning bm.log for errors"
460         action_id = "dbg"
461         sequence = []
462         while True:
463
464                 steps = [
465                         ('bminit'               , 'Initializing the BootManager.'),
466                         ('cfg'                  , 'Reading node configuration file.'),
467                         ('auth'                 , 'Authenticating node with PLC.'),
468                         ('getplc'               , 'Retrieving details of node from PLC.'),
469                         ('update'               , 'Updating node boot state at PLC.'),
470                         ('hardware'             , 'Checking if hardware requirements met.'),
471                         ('installinit'  , 'Install: Initializing.'),
472                         ('installdisk'  , 'Install: partitioning disks.'),
473                         ('installbootfs', 'Install: bootstrapfs tarball.'),
474                         ('installcfg'   , 'Install: Writing configuration files.'),
475                         ('installstop'  , 'Install: Shutting down installer.'),
476                         ('update2'              , 'Updating node boot state at PLC.'),
477                         ('installinit2' , 'Install: Initializing.'),
478                         ('validate'             , 'Validating node installation.'),
479                         ('rebuildinitrd', 'Rebuilding initrd'),
480                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
481                         ('update3'              , 'Updating node configuration.'),
482                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
483                         ('update4'              , 'Sending hardware configuration to PLC.'),
484                         ('debug'                , 'Starting debug mode'),
485                         ('bmexceptmount', 'BootManagerException during mount'),
486                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
487                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
488                         ('exception'    , 'Exception'),
489                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
490                         ('protoerror'   , 'XML RPC protocol error'),
491                         ('nodehostname' , 'Configured node hostname does not resolve'),
492                         ('implementerror', 'Implementation Error'),
493                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
494                         ('noinstall'    , 'notinstalled'),
495                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
496                         ('noblockdev'   , "No block devices detected."),
497                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
498                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
499                         ('hardwarerequirefail' , 'Hardware requirements not met'),
500                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
501                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
502                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
503                         ('modulefail'   , 'Unable to get list of system modules'),
504                         ('writeerror'   , 'write error: No space left on device'),
505                         ('nospace'      , "No space left on device"),
506                         ('nonode'       , 'Failed to authenticate call: No such node'),
507                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
508                         ('bootcheckfail'     , 'BootCheckAuthentication'),
509                         ('bootupdatefail'   , 'BootUpdateNode'),
510                 ]
511                 list = steps_to_list(steps)
512                 index = child.expect( list + [ pexpect.EOF ])
513                 id = index_to_id(steps,index)
514                 sequence.append(id)
515
516                 if id == "exception":
517                         if config and not config.quiet: print "...Found An Exception!!!"
518                 elif index == len(list):
519                         #print "Reached EOF"
520                         break
521                 
522         s = "-".join(sequence)
523         print "   FOUND SEQUENCE: ", s
524
525         # NOTE: We get or set the flag based on the current sequence identifier.
526         #  By using the sequence identifier, we guarantee that there will be no
527         #  frequent loops.  I'm guessing there is a better way to track loops,
528         #  though.
529         if not config.force and pflags.getRecentFlag(s):
530                 pflags.setRecentFlag(s)
531                 pflags.save() 
532                 print "... flag is set or it has already run recently. Skipping %s" % node
533                 return True
534
535         sequences = {}
536
537
538         # restart_bootmanager_boot
539         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
540                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
541                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
542                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
543                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
544                         "bminit-cfg-auth-getplc-update-debug-done",
545                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
546                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
547                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
548                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
549                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
550                         ]:
551                 sequences.update({n : "restart_bootmanager_boot"})
552
553         #       conn.restart_bootmanager('rins')
554         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
555                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
556                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
557                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
558                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
559                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
560                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
561                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
562                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
563                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
564                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
565                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
566                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
567                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
568                         ]:
569                 sequences.update({n : "restart_bootmanager_rins"})
570
571         # repair_node_keys
572         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
573
574         #   conn.restart_node('rins')
575         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
576                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
577                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
578                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
579                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
580                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
581                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
582                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
583                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
584                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
585                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
586                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
587                         ]:
588                 sequences.update({n : "restart_node_rins"})
589
590         #       restart_node_boot
591         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
592                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
593                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
594                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
595                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
596                          ]:
597                 sequences.update({n: "restart_node_boot"})
598
599         # update_node_config_email
600         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
601                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
602                         ]:
603                 sequences.update({n : "update_node_config_email"})
604
605         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
606                 sequences.update({n : "nodenetwork_email"})
607
608         # update_bootcd_email
609         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
610                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
611                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
612                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
613                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
614                         ]:
615                 sequences.update({n : "update_bootcd_email"})
616
617         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
618                         ]:
619                 sequences.update({n: "suspect_error_email"})
620
621         # update_hardware_email
622         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
623         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
624
625         # broken_hardware_email
626         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
627
628         flag_set = True
629
630         
631         if s not in sequences:
632                 print "   HOST %s" % hostname
633                 print "   UNKNOWN SEQUENCE: %s" % s
634
635                 args = {}
636                 args['hostname'] = hostname
637                 args['sequence'] = s
638                 args['bmlog'] = conn.get_bootmanager_log().read()
639                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
640                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
641                 m.reset()
642                 m.send(['monitor-list@lists.planet-lab.org'])
643
644                 conn.restart_bootmanager('boot')
645
646                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
647                 # This way, we can check it again after we've fixed it.
648                 flag_set = False
649
650         else:
651
652                 if   sequences[s] == "restart_bootmanager_boot":
653                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
654                         conn.restart_bootmanager('boot')
655                 elif sequences[s] == "restart_bootmanager_rins":
656                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
657                         conn.restart_bootmanager('rins')
658                 elif sequences[s] == "restart_node_rins":
659                         conn.restart_node('rins')
660                 elif sequences[s] == "restart_node_boot":
661                         conn.restart_node('boot')
662                 elif sequences[s] == "repair_node_keys":
663                         if conn.compare_and_repair_nodekeys():
664                                 # the keys either are in sync or were forced in sync.
665                                 # so try to reboot the node again.
666                                 conn.restart_bootmanager('rins')
667                                 pass
668                         else:
669                                 # there was some failure to synchronize the keys.
670                                 print "...Unable to repair node keys on %s" % node
671
672                 elif sequences[s] == "suspect_error_email":
673                         args = {}
674                         args['hostname'] = hostname
675                         args['sequence'] = s
676                         args['bmlog'] = conn.get_bootmanager_log().read()
677                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
678                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
679                         m.reset()
680                         m.send(['monitor-list@lists.planet-lab.org'])
681
682                         conn.restart_bootmanager('boot')
683
684                 elif sequences[s] == "update_node_config_email":
685                         print "...Sending message to UPDATE NODE CONFIG"
686                         args = {}
687                         args['hostname'] = hostname
688                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
689                                                                 True, db='nodeid_persistmessages')
690                         loginbase = plc.siteId(hostname)
691                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
692                         conn.dump_plconf_file()
693                         conn.set_nodestate('diag')
694
695                 elif sequences[s] == "nodenetwork_email":
696                         print "...Sending message to LOOK AT NODE NETWORK"
697                         args = {}
698                         args['hostname'] = hostname
699                         args['bmlog'] = conn.get_bootmanager_log().read()
700                         m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
701                                                                 True, db='nodenet_persistmessages')
702                         loginbase = plc.siteId(hostname)
703                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
704                         conn.dump_plconf_file()
705                         conn.set_nodestate('diag')
706
707                 elif sequences[s] == "update_bootcd_email":
708                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
709                         import getconf
710                         args = {}
711                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
712                         args['hostname_list'] = "%s" % hostname
713
714                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
715                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
716
717                         loginbase = plc.siteId(hostname)
718                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
719
720                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
721                         conn.set_nodestate('disable')
722
723                 elif sequences[s] == "broken_hardware_email":
724                         # MAKE An ACTION record that this host has failed hardware.  May
725                         # require either an exception "/minhw" or other manual intervention.
726                         # Definitely need to send out some more EMAIL.
727                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
728                         # TODO: email notice of broken hardware
729                         args = {}
730                         args['hostname'] = hostname
731                         args['log'] = conn.get_dmesg().read()
732                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
733                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
734
735                         loginbase = plc.siteId(hostname)
736                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
737                         conn.set_nodestate('disable')
738
739                 elif sequences[s] == "update_hardware_email":
740                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
741                         args = {}
742                         args['hostname'] = hostname
743                         args['bmlog'] = conn.get_bootmanager_log().read()
744                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
745                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
746
747                         loginbase = plc.siteId(hostname)
748                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
749                         conn.set_nodestate('disable')
750
751         if flag_set:
752                 pflags.setRecentFlag(s)
753                 pflags.save() 
754
755         return True
756         
757
758 # MAIN -------------------------------------------------------------------
759
def main():
	"""Command-line entry point.

	Parses options, builds the list of target nodes (from --node or
	--nodelist), and attempts to bring each one out of debug mode by
	calling reboot().  Exits with status 1 if no node source is given.
	"""
	from config import config
	from optparse import OptionParser
	parser = OptionParser()
	parser.set_defaults(node=None, nodelist=None, child=False, collect=False, nosetup=False, verbose=False, force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	# NOTE: fixed typo in help text ("orginary" -> "ordinary")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")
	parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
						help="A single node name to try to bring out of debug mode.")
	parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
						help="A list of nodes to bring out of debug mode.")
	# config wraps the parser; the local name deliberately shadows the module.
	config = config(parser)
	config.parse_args()

	# Determine which nodes to act on: a file of names, or a single node.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		# Neither --node nor --nodelist given: nothing to do.
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		reboot(node, config)
794
# Run as a script: process the nodes given on the command line.
if __name__ == "__main__":
	main()