Added a check for bad DNS on the node that prevents the BootManager from booting.
[monitor.git] / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import plc
6 import auth
7 api = plc.PLC(auth.auth, auth.plc)
8
9 import sys
10 import os
11 import policy
12
13 from getsshkeys import SSHKnownHosts
14
15 import subprocess
16 import time
17 import database
18 import moncommands
19 from sets import Set
20
21 import ssh.pxssh as pxssh
22 import ssh.fdpexpect as fdpexpect
23 import ssh.pexpect as pexpect
24 from unified_model import *
25 from emailTxt import mailtxt
26 from nodeconfig import network_config_to_str
27 import traceback
28 import monitorconfig
29
30 import signal
class Sopen(subprocess.Popen):
        """subprocess.Popen variant whose kill() delivers a chosen signal
        (SIGTERM by default) to the child process."""
        def kill(self, signal = signal.SIGTERM):
                # NOTE: the parameter shadows the `signal` module, but the
                # default was already bound to signal.SIGTERM at def time,
                # so the shadowing is harmless here.
                os.kill(self.pid, signal)
34
35 #from Rpyc import SocketConnection, Async
36 from Rpyc import SocketConnection, Async
37 from Rpyc.Utils import *
38
def get_fbnode(node):
        """Return the 'values' record for *node* from the persisted
        "findbad" results database."""
        findbad = database.dbLoad("findbad")
        return findbad['nodes'][node]['values']
43
class NodeConnection:
        """Operations on a remote node, driven through an Rpyc connection.

        All remote work goes through `self.c.modules.<name>`, which proxies
        module calls (os, sys, BootManager, ...) to the node itself."""

        def __init__(self, connection, node, config):
                # connection: an Rpyc SocketConnection to the node
                # node:       hostname, used for PLC API calls and log names
                # config:     option object (may be None); .quiet is consulted
                self.node = node
                self.c = connection
                self.config = config

        def get_boot_state(self):
                """Classify the node's current state from filesystem markers:
                'dbg' (BootManager source unpacked), 'boot' (vservers present),
                or 'unknown'."""
                if self.c.modules.os.path.exists('/tmp/source'):
                        return "dbg"
                elif self.c.modules.os.path.exists('/vservers'):
                        return "boot"
                else:
                        return "unknown"

        def get_dmesg(self):
                """Dump the remote kernel ring buffer to a file, download it
                locally, and return it as an open file object."""
                self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
                # download() comes from Rpyc.Utils (star-imported at file top)
                download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
                log = open("log/dmesg.%s.log" % self.node, 'r')
                return log

        def get_bootmanager_log(self):
                """Download the BootManager log and return it as an open file
                object."""
                # NOTE(review): remote /tmp/bm.log is treated as gzip data
                # despite the plain name -- confirm against BootManager.
                download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
                os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
                log = open("log/bm.%s.log" % self.node, 'r')
                return log

        def dump_plconf_file(self):
                """Run the BootManager initialization/config-read steps on the
                node and print the resulting VARS (unless config.quiet)."""
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "   ERROR:", x
                        print "   Possibly, unable to find valid configuration file"

                # NOTE(review): when bm_continue is True but config.quiet is
                # set, this falls into the else-branch and prints nothing --
                # the "Unable to read" message is also quiet-gated below.
                if bm_continue and self.config and not self.config.quiet:
                        for key in bm.VARS.keys():
                                print key, " == ", bm.VARS[key]
                else:
                        if self.config and not self.config.quiet: print "   Unable to read Node Configuration"


        def compare_and_repair_nodekeys(self):
                """Compare the node's on-disk NODE_KEY with the key PLC holds;
                when they differ, push the node's key up to PLC.

                Returns True when the keys match or the update succeeds, False
                when the update fails, and None (implicitly) when the node
                configuration could not be read."""
                c = self.c
                self.c.modules.sys.path.append("/tmp/source/")
                self.c.modules.os.chdir('/tmp/source')

                log = c.modules.BootManager.log('/tmp/new.log')
                bm = c.modules.BootManager.BootManager(log,'boot')

                BootManagerException = c.modules.Exceptions.BootManagerException
                InitializeBootManager = c.modules.BootManager.InitializeBootManager
                ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
                bm_continue = True

                # PLC-side view of this node (for the 'key' field)
                plcnode = api.GetNodes({'hostname': self.node}, None)[0]

                InitializeBootManager.Run(bm.VARS, bm.LOG)
                try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
                except Exception, x:
                        bm_continue = False
                        print "exception"
                        print x
                        print "   Possibly, unable to find valid configuration file"

                if bm_continue:
                        print "   NODE: %s" % bm.VARS['NODE_KEY']
                        print "   PLC : %s" % plcnode['key']

                        if bm.VARS['NODE_KEY'] == plcnode['key']:
                                return True
                        else:
                                # node's key wins: record it at PLC
                                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                                        print "   Successfully updated NODE_KEY with PLC"
                                        return True
                                else:
                                        return False

                        #for key in bm.VARS.keys():
                        #       print key, " == ", bm.VARS[key]
                else:
                        print "   Unable to retrieve NODE_KEY"

        def bootmanager_running(self):
                """True when a BootManager run is in progress on the node
                (signalled by the /tmp/BM_RUNNING lock file)."""
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        return True
                else:
                        return False

        def set_nodestate(self, state='boot'):
                """Set the node's boot_state at PLC; returns the API result."""
                return api.UpdateNode(self.node, {'boot_state' : state})

        def restart_node(self, state='boot'):
                """Set the node's boot_state at PLC, then reboot it.

                The first attempt within 24h is 'gentle': kill all slice
                processes, then schedule a normal `shutdown -r`.  A repeat
                attempt within the window escalates to a sysrq s/u/b
                (sync, remount read-only, hard reboot)."""
                api.UpdateNode(self.node, {'boot_state' : state})

                # 24h-scoped flag records that a gentle kill was already tried
                pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
                if not pflags.getRecentFlag('gentlekill'):
                        print "   Killing all slice processes... : %s" %  self.node
                        cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
                        self.c.modules.os.system(cmd_slicekill)
                        cmd = """ shutdown -r +1 & """
                        print "   Restarting %s : %s" % ( self.node, cmd)
                        self.c.modules.os.system(cmd)

                        pflags.setRecentFlag('gentlekill')
                        pflags.save()
                else:
                        print "   Restarting with sysrq 'sub' %s" % self.node
                        cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
                        self.c.modules.os.system(cmd)

                return

        def restart_bootmanager(self, forceState):
                """Launch `BootManager.py <forceState>` on the node in the
                background, unless a run is already in progress (as indicated
                by the /tmp/BM_RUNNING lock file, created/removed here)."""
                self.c.modules.os.chdir('/tmp/source')
                if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
                        print "   BootManager is already running: try again soon..."
                else:
                        print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
                        # only the middle fragment carries the %s placeholder
                        cmd = "( touch /tmp/BM_RUNNING ;  " + \
                              "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                                  "  rm -f /tmp/BM_RUNNING " + \
                                  ") &"
                        cmd = cmd % forceState
                        self.c.modules.os.system(cmd)

                return
183
184
185 import random
class PlanetLabSession:
        """A session to one node: copies the Rpyc sources over, (re)starts the
        remote Rpyc server, and keeps an ssh port-forward open so that
        get_connection() can attach a NodeConnection through localhost."""
        # Base local port for tunnels; randomized to reduce collisions
        # between concurrent monitor runs.  Incremented per session.
        globalport = 22000 + int(random.random()*1000)

        def __init__(self, node, nosetup, verbose):
                self.verbose = verbose
                self.node = node
                self.port = None       # local tunnel port, set in setup_host()
                self.nosetup = nosetup # skip rsync/server setup when True
                self.command = None    # Sopen handle for the ssh tunnel
                self.setup_host()

        def get_connection(self, config):
                """Return a NodeConnection that talks to the node through the
                local tunnel endpoint established by setup_host()."""
                return NodeConnection(SocketConnection("localhost", self.port), self.node, config)

        def setup_host(self):
                """Prepare the node: rsync Rpyc sources, restart the remote
                forking server, then open a local forward
                (self.port -> node:18812).

                Raises Exception when login fails twice or the tunnel cannot
                be established."""
                self.port = PlanetLabSession.globalport
                PlanetLabSession.globalport = PlanetLabSession.globalport + 1

                args = {}
                args['port'] = self.port
                args['user'] = 'root'
                args['hostname'] = self.node
                args['monitordir'] = monitorconfig.MONITOR_SCRIPT_ROOT
                ssh_port = 22

                if self.nosetup:
                        print "Skipping setup"
                        return

                # COPY Rpyc files to host
                cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
                if self.verbose: print cmd
                # TODO: Add timeout
                timeout = 120
                localos = moncommands.CMD()

                ret = localos.system(cmd, timeout)
                print ret
                if ret != 0:
                        # most likely a stale/unknown host key: refresh the
                        # known_hosts entry and retry once
                        print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
                        #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
                        k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
                        ret = localos.system(cmd, timeout)
                        print ret
                        if ret != 0:
                                print "\tFAILED TWICE"
                                #sys.exit(1)
                                raise Exception("Failed twice trying to login with updated ssh host key")

                # t1..t2 measures setup time; used below to size the settle sleep
                t1 = time.time()
                # KILL any already running servers.
                ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
                (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
                #cmd = """ssh %(user)s@%(hostname)s """ + \
                #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
                #cmd = cmd % args
                #if self.verbose: print cmd
                ## TODO: Add timeout
                #print localos.system(cmd,timeout)

                ## START a new rpyc server.
                #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
                #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
                #cmd = cmd % args
                #if self.verbose: print cmd
                #print localos.system(cmd,timeout)
                print ssh.ret

                # TODO: Add timeout
                # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
                # and the following options seems to work well.
                # LocalCommand prints "READY" locally once the forward is up;
                # ExitOnForwardFailure makes a failed forward kill the client.
                cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
                          """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
                          """-o ConnectTimeout=120 """ + \
                          """-n -N -L %(port)s:localhost:18812 """ + \
                          """%(user)s@%(hostname)s"""
                cmd = cmd % args
                if self.verbose: print cmd
                self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
                # TODO: the read() here may block indefinitely.  Need a better
                # approach therefore, that includes a timeout.
                #ret = self.command.stdout.read(5)
                ret = moncommands.read_t(self.command.stdout, 5)

                t2 = time.time()
                if 'READY' in ret:
                        # NOTE: There is still a slight race for machines that are slow...
                        self.timeout = 2*(t2-t1)
                        print "Sleeping for %s sec" % self.timeout
                        time.sleep(self.timeout)
                        return

                if self.command.returncode is not None:
                        print "Failed to establish tunnel!"
                        raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

                raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

        def __del__(self):
                # tear down the tunnel process when the session is collected
                if self.command:
                        if self.verbose: print "Killing SSH session %s" % self.port
                        self.command.kill()
297
298
def steps_to_list(steps):
        """Extract the pattern half of each (id, pattern) step pair,
        preserving order."""
        return [pattern for (_step_id, pattern) in steps]
304
def index_to_id(steps, index):
        """Translate an expect() match index back into its step id.

        An index past the end of *steps* (i.e. the extra EOF entry the
        caller appends to the pattern list) maps to the sentinel "done".
        """
        if index >= len(steps):
                return "done"
        return steps[index][0]
310
311 def reboot(hostname, config=None, forced_action=None):
312
313         # NOTE: Nothing works if the bootcd is REALLY old.
314         #       So, this is the first step.
315         fbnode = get_fbnode(hostname)
316         if fbnode['category'] == "OLDBOOTCD":
317                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
318                 args = {}
319                 args['hostname_list'] = "    %s" % hostname
320
321                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
322                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
323
324                 loginbase = plc.siteId(hostname)
325                 m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
326
327                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
328                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
329                 return True
330
331         node = hostname
332         print "Creating session for %s" % node
333         # update known_hosts file (in case the node has rebooted since last run)
334         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
335         try:
336                 k = SSHKnownHosts(); k.update(node); k.write(); del k
337         except:
338                 print traceback.print_exc()
339                 return False
340
341         try:
342                 if config == None:
343                         session = PlanetLabSession(node, False, True)
344                 else:
345                         session = PlanetLabSession(node, config.nosetup, config.verbose)
346         except Exception, e:
347                 print "ERROR setting up session for %s" % hostname
348                 print traceback.print_exc()
349                 print e
350                 return False
351
352         try:
353                 conn = session.get_connection(config)
354         except EOFError:
355                 # NOTE: sometimes the wait in setup_host() is not long enough.  
356                 # So, here we try to wait a little longer before giving up entirely.
357                 try:
358                         time.sleep(session.timeout*4)
359                         conn = session.get_connection(config)
360                 except:
361                         print traceback.print_exc()
362                         return False
363                         
364
365         if forced_action == "reboot":
366                 conn.restart_node('rins')
367                 return True
368
369         boot_state = conn.get_boot_state()
370         if boot_state == "boot":
371                 print "...Boot state of %s already completed : skipping..." % node
372                 return True
373         elif boot_state == "unknown":
374                 print "...Unknown bootstate for %s : skipping..."% node
375                 return False
376         else:
377                 pass
378
379         if conn.bootmanager_running():
380                 print "...BootManager is currently running.  Skipping host %s" % node
381                 return True
382
383         #if config != None:
384         #       if config.force:
385         #               conn.restart_bootmanager(config.force)
386         #               return True
387
388         # Read persistent flags, tagged on one week intervals.
389         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
390                 
391
392         if config and not config.quiet: print "...downloading dmesg from %s" % node
393         dmesg = conn.get_dmesg()
394         child = fdpexpect.fdspawn(dmesg)
395
396         sequence = []
397         while True:
398                 steps = [
399                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
400                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
401                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
402
403                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
404                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
405                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
406                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
407                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
408                         ('floppytimeout','floppy0: floppy timeout called'),
409                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
410
411                         # floppy0: floppy timeout called
412                         # end_request: I/O error, dev fd0, sector 0
413
414                         #Buffer I/O error on device dm-2, logical block 8888896
415                         #ata1: status=0x51 { DriveReady SeekComplete Error }
416                         #ata1: error=0x40 { UncorrectableError }
417                         #SCSI error : <0 0 0 0> return code = 0x8000002
418                         #sda: Current: sense key: Medium Error
419                         #       Additional sense: Unrecovered read error - auto reallocate failed
420
421                         #SCSI error : <0 2 0 0> return code = 0x40001
422                         #end_request: I/O error, dev sda, sector 572489600
423                 ]
424                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
425                 sequence.append(id)
426
427                 if id == "done":
428                         break
429
430         s = Set(sequence)
431         if config and not config.quiet: print "\tSET: ", s
432
433         if len(s) > 1:
434                 print "...Potential drive errors on %s" % node
435                 if len(s) == 2 and 'floppyerror' in s:
436                         print "...Should investigate.  Continuing with node."
437                 else:
438                         print "...Should investigate.  Skipping node."
439                         # TODO: send message related to these errors.
440                         args = {}
441                         args['hostname'] = hostname
442                         args['log'] = conn.get_dmesg().read()
443
444                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
445                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
446
447                         loginbase = plc.siteId(hostname)
448                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
449                         conn.set_nodestate('diag')
450                         return False
451
452         print "...Downloading bm.log from %s" % node
453         log = conn.get_bootmanager_log()
454         child = fdpexpect.fdspawn(log)
455
456         try:
457                 if config.collect: return True
458         except:
459                 pass
460
461         time.sleep(1)
462
463         if config and not config.quiet: print "...Scanning bm.log for errors"
464         action_id = "dbg"
465         sequence = []
466         while True:
467
468                 steps = [
469                         ('bminit'               , 'Initializing the BootManager.'),
470                         ('cfg'                  , 'Reading node configuration file.'),
471                         ('auth'                 , 'Authenticating node with PLC.'),
472                         ('getplc'               , 'Retrieving details of node from PLC.'),
473                         ('update'               , 'Updating node boot state at PLC.'),
474                         ('hardware'             , 'Checking if hardware requirements met.'),
475                         ('installinit'  , 'Install: Initializing.'),
476                         ('installdisk'  , 'Install: partitioning disks.'),
477                         ('installbootfs', 'Install: bootstrapfs tarball.'),
478                         ('installcfg'   , 'Install: Writing configuration files.'),
479                         ('installstop'  , 'Install: Shutting down installer.'),
480                         ('update2'              , 'Updating node boot state at PLC.'),
481                         ('installinit2' , 'Install: Initializing.'),
482                         ('validate'             , 'Validating node installation.'),
483                         ('rebuildinitrd', 'Rebuilding initrd'),
484                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
485                         ('update3'              , 'Updating node configuration.'),
486                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
487                         ('update4'              , 'Sending hardware configuration to PLC.'),
488                         ('debug'                , 'Starting debug mode'),
489                         ('bmexceptmount', 'BootManagerException during mount'),
490                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
491                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
492                         ('exception'    , 'Exception'),
493                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
494                         ('protoerror'   , 'XML RPC protocol error'),
495                         ('nodehostname' , 'Configured node hostname does not resolve'),
496                         ('implementerror', 'Implementation Error'),
497                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
498                         ('noinstall'    , 'notinstalled'),
499                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
500                         ('noblockdev'   , "No block devices detected."),
501                         ('dnserror'     , 'Name or service not known'),
502                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
503                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
504                         ('hardwarerequirefail' , 'Hardware requirements not met'),
505                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
506                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
507                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
508                         ('modulefail'   , 'Unable to get list of system modules'),
509                         ('writeerror'   , 'write error: No space left on device'),
510                         ('nospace'      , "No space left on device"),
511                         ('nonode'       , 'Failed to authenticate call: No such node'),
512                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
513                         ('bootcheckfail'     , 'BootCheckAuthentication'),
514                         ('bootupdatefail'   , 'BootUpdateNode'),
515                 ]
516                 list = steps_to_list(steps)
517                 index = child.expect( list + [ pexpect.EOF ])
518                 id = index_to_id(steps,index)
519                 sequence.append(id)
520
521                 if id == "exception":
522                         if config and not config.quiet: print "...Found An Exception!!!"
523                 elif index == len(list):
524                         #print "Reached EOF"
525                         break
526                 
527         s = "-".join(sequence)
528         print "   FOUND SEQUENCE: ", s
529
530         # NOTE: We get or set the flag based on the current sequence identifier.
531         #  By using the sequence identifier, we guarantee that there will be no
532         #  frequent loops.  I'm guessing there is a better way to track loops,
533         #  though.
534         if not config.force and pflags.getRecentFlag(s):
535                 pflags.setRecentFlag(s)
536                 pflags.save() 
537                 print "... flag is set or it has already run recently. Skipping %s" % node
538                 return True
539
540         sequences = {}
541
542
543         # restart_bootmanager_boot
544         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
545                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
546                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
547
548                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
549
550                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
551                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
552                         "bminit-cfg-auth-getplc-update-debug-done",
553                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
554                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
555                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
556                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
557                         "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
558                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
559                         ]:
560                 sequences.update({n : "restart_bootmanager_boot"})
561
562         #       conn.restart_bootmanager('rins')
563         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
564                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
565                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
566                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
567                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
568                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
569                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
570                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
571                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
572                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
573                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
574                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
575                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
576                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
577                         ]:
578                 sequences.update({n : "restart_bootmanager_rins"})
579
580         # repair_node_keys
581         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
582
583         #   conn.restart_node('rins')
584         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
585                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
586                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
587                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
588                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
589                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
590                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
591                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
592                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
593                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
594                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
595                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
596                         ]:
597                 sequences.update({n : "restart_node_rins"})
598
599         #       restart_node_boot
600         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
601                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
602                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
603                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
604                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
605                          ]:
606                 sequences.update({n: "restart_node_boot"})
607
608         # update_node_config_email
609         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
610                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
611                         ]:
612                 sequences.update({n : "update_node_config_email"})
613
614         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
615                 sequences.update({n : "nodenetwork_email"})
616
617         # update_bootcd_email
618         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
619                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
620                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
621                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
622                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
623                         ]:
624                 sequences.update({n : "update_bootcd_email"})
625
626         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
627                         ]:
628                 sequences.update({n: "suspect_error_email"})
629
630         # update_hardware_email
631         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
632         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
633
634         # broken_hardware_email
635         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
636
637         # bad_dns_email
638         sequences.update({"bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done" : "bad_dns_email"})
639
640         flag_set = True
641
642         
643         if s not in sequences:
644                 print "   HOST %s" % hostname
645                 print "   UNKNOWN SEQUENCE: %s" % s
646
647                 args = {}
648                 args['hostname'] = hostname
649                 args['sequence'] = s
650                 args['bmlog'] = conn.get_bootmanager_log().read()
651                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
652                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
653                 m.reset()
654                 m.send(['monitor-list@lists.planet-lab.org'])
655
656                 conn.restart_bootmanager('boot')
657
658                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
659                 # This way, we can check it again after we've fixed it.
660                 flag_set = False
661
662         else:
663
664                 if   sequences[s] == "restart_bootmanager_boot":
665                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
666                         conn.restart_bootmanager('boot')
667                 elif sequences[s] == "restart_bootmanager_rins":
668                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
669                         conn.restart_bootmanager('rins')
670                 elif sequences[s] == "restart_node_rins":
671                         conn.restart_node('rins')
672                 elif sequences[s] == "restart_node_boot":
673                         conn.restart_node('boot')
674                 elif sequences[s] == "repair_node_keys":
675                         if conn.compare_and_repair_nodekeys():
676                                 # the keys either are in sync or were forced in sync.
677                                 # so try to reboot the node again.
678                                 conn.restart_bootmanager('rins')
679                                 pass
680                         else:
681                                 # there was some failure to synchronize the keys.
682                                 print "...Unable to repair node keys on %s" % node
683
684                 elif sequences[s] == "suspect_error_email":
685                         args = {}
686                         args['hostname'] = hostname
687                         args['sequence'] = s
688                         args['bmlog'] = conn.get_bootmanager_log().read()
689                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
690                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
691                         m.reset()
692                         m.send(['monitor-list@lists.planet-lab.org'])
693
694                         conn.restart_bootmanager('boot')
695
696                 elif sequences[s] == "update_node_config_email":
697                         print "...Sending message to UPDATE NODE CONFIG"
698                         args = {}
699                         args['hostname'] = hostname
700                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
701                                                                 True, db='nodeid_persistmessages')
702                         loginbase = plc.siteId(hostname)
703                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
704                         conn.dump_plconf_file()
705                         conn.set_nodestate('diag')
706
707                 elif sequences[s] == "nodenetwork_email":
708                         print "...Sending message to LOOK AT NODE NETWORK"
709                         args = {}
710                         args['hostname'] = hostname
711                         args['bmlog'] = conn.get_bootmanager_log().read()
712                         m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args, 
713                                                                 True, db='nodenet_persistmessages')
714                         loginbase = plc.siteId(hostname)
715                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
716                         conn.dump_plconf_file()
717                         conn.set_nodestate('diag')
718
719                 elif sequences[s] == "update_bootcd_email":
720                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
721                         import getconf
722                         args = {}
723                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
724                         args['hostname_list'] = "%s" % hostname
725
726                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
727                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
728
729                         loginbase = plc.siteId(hostname)
730                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
731
732                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
733                         conn.set_nodestate('disable')
734
735                 elif sequences[s] == "broken_hardware_email":
736                         # MAKE An ACTION record that this host has failed hardware.  May
737                         # require either an exception "/minhw" or other manual intervention.
738                         # Definitely need to send out some more EMAIL.
739                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
740                         # TODO: email notice of broken hardware
741                         args = {}
742                         args['hostname'] = hostname
743                         args['log'] = conn.get_dmesg().read()
744                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
745                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
746
747                         loginbase = plc.siteId(hostname)
748                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
749                         conn.set_nodestate('disable')
750
751                 elif sequences[s] == "update_hardware_email":
752                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
753                         args = {}
754                         args['hostname'] = hostname
755                         args['bmlog'] = conn.get_bootmanager_log().read()
756                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
757                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
758
759                         loginbase = plc.siteId(hostname)
760                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
761                         conn.set_nodestate('disable')
762
763                 elif sequences[s] == "bad_dns_email":
764                         print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
765                         args = {}
766                         try:
767                                 node = api.GetNodes(hostname)[0]
768                                 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
769                         except:
770                                 print traceback.print_exc()
771                                 # TODO: api error. skip email, b/c all info is not available,
772                                 # flag_set will not be recorded.
773                                 return False
774                         nodenet_str = network_config_to_str(net)
775
776                         args['hostname'] = hostname
777                         args['network_config'] = nodenet_str
778                         args['nodenetwork_id'] = net['nodenetwork_id']
779                         m = PersistMessage(hostname, mailtxt.baddns[0] % args,
780                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
781
782                         loginbase = plc.siteId(hostname)
783                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
784                         conn.set_nodestate('disable')
785
786         if flag_set:
787                 pflags.setRecentFlag(s)
788                 pflags.save() 
789
790         return True
791         
792
793 # MAIN -------------------------------------------------------------------
794
def main():
	"""Entry point: parse command-line options and try to bring each
	requested node out of debug mode via reboot().

	Accepts either --node for a single hostname or --nodelist for a file
	of hostnames; prints usage and exits with status 1 when neither is
	supplied.
	"""
	from config import config
	from optparse import OptionParser
	parser = OptionParser()
	# NOTE: 'nonet' was previously missing from the defaults, leaving
	# config.nonet as None (not False) when the flag is absent; include it
	# so it behaves like every other boolean option.
	parser.set_defaults(node=None, nodelist=None, child=False, collect=False,
						nosetup=False, verbose=False, force=None, quiet=False,
						nonet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")
	parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
						help="A single node name to try to bring out of debug mode.")
	parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
						help="A list of nodes to bring out of debug mode.")
	config = config(parser)
	config.parse_args()

	# Build the work list from whichever source was given; otherwise show
	# usage and bail out.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		reboot(node, config)
831
# Script entry point: run option parsing and per-node debug recovery.
if __name__ == "__main__":
	main()