bootman.py

   1 #!/usr/bin/python
   2
   3 # Attempt to reboot a node in debug state.
   4
   5 import plc
   6 import auth
   7 api = plc.PLC(auth.auth, auth.plc)
   8
   9 import sys
  10 import os
  11 import policy
  12
  13 from getsshkeys import SSHKnownHosts
  14
  15 import subprocess
  16 import time
  17 import soltesz
  18 from sets import Set
  19
  20 import ssh.pxssh as pxssh
  21 import ssh.fdpexpect as fdpexpect
  22 import ssh.pexpect as pexpect
  23 from unified_model import *
  24 from emailTxt import mailtxt
  25
  26 import signal
  27 class Sopen(subprocess.Popen):
  28         def kill(self, signal = signal.SIGTERM):
  29                 os.kill(self.pid, signal)
  30
  31 #from Rpyc import SocketConnection, Async
  32 from Rpyc import SocketConnection, Async
  33 from Rpyc.Utils import *
  34
  35 def get_fbnode(node):
  36         fb = soltesz.dbLoad("findbad")
  37         fbnode = fb['nodes'][node]['values']
  38         return fbnode
  39
  40 class NodeConnection:
  41         def __init__(self, connection, node, config):
  42                 self.node = node
  43                 self.c = connection
  44                 self.config = config
  45
  46         def get_boot_state(self):
  47                 if self.c.modules.os.path.exists('/tmp/source'):
  48                         return "dbg"
  49                 elif self.c.modules.os.path.exists('/vservers'):
  50                         return "boot"
  51                 else:
  52                         return "unknown"
  53
  54         def get_dmesg(self):
  55                 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
  56                 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
  57                 log = open("log/dmesg.%s.log" % self.node, 'r')
  58                 return log
  59
  60         def get_bootmanager_log(self):
  61                 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
  62                 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
  63                 log = open("log/bm.%s.log" % self.node, 'r')
  64                 return log
  65
  66         def dump_plconf_file(self):
  67                 c = self.c
  68                 c.modules.sys.path.append("/tmp/source/")
  69                 c.modules.os.chdir('/tmp/source')
  70
  71                 log = c.modules.BootManager.log('/tmp/new.log')
  72                 bm = c.modules.BootManager.BootManager(log,'boot')
  73
  74                 BootManagerException = c.modules.Exceptions.BootManagerException
  75                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
  76                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
  77                 bm_continue = True
  78
  79                 InitializeBootManager.Run(bm.VARS, bm.LOG)
  80                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
  81                 except Exception, x:
  82                         bm_continue = False
  83                         print "   ERROR:", x
  84                         print "   Possibly, unable to find valid configuration file"
  85
  86                 if bm_continue and self.config and not self.config.quiet:
  87                         for key in bm.VARS.keys():
  88                                 print key, " == ", bm.VARS[key]
  89                 else:
  90                         if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
  91
  92
  93         def compare_and_repair_nodekeys(self):
  94                 c = self.c
  95                 c.modules.sys.path.append("/tmp/source/")
  96                 c.modules.os.chdir('/tmp/source')
  97
  98                 log = c.modules.BootManager.log('/tmp/new.log')
  99                 bm = c.modules.BootManager.BootManager(log,'boot')
 100
 101                 BootManagerException = c.modules.Exceptions.BootManagerException
 102                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
 103                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
 104                 bm_continue = True
 105
 106                 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
 107
 108                 InitializeBootManager.Run(bm.VARS, bm.LOG)
 109                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
 110                 except Exception, x:
 111                         bm_continue = False
 112                         print "exception"
 113                         print x
 114                         print "   Possibly, unable to find valid configuration file"
 115
 116                 if bm_continue:
 117                         print "   NODE: %s" % bm.VARS['NODE_KEY']
 118                         print "   PLC : %s" % plcnode['key']
 119
 120                         if bm.VARS['NODE_KEY'] == plcnode['key']:
 121                                 return True
 122                         else:
 123                                 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
 124                                         print "   Successfully updated NODE_KEY with PLC"
 125                                         return True
 126                                 else:
 127                                         return False
 128
 129                         #for key in bm.VARS.keys():
 130                         #       print key, " == ", bm.VARS[key]
 131                 else:
 132                         print "   Unable to retrieve NODE_KEY"
 133
 134         def bootmanager_running(self):
 135                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
 136                         return True
 137                 else:
 138                         return False
 139
 140         def set_nodestate(self, state='boot'):
 141                 return api.UpdateNode(self.node, {'boot_state' : state})
 142
 143         def restart_node(self, state='boot'):
 144                 api.UpdateNode(self.node, {'boot_state' : state})
 145
 146                 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
 147                 if not pflags.getRecentFlag('gentlekill'):
 148                         print "   Killing all slice processes... : %s" %  self.node
 149                         cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
 150                         self.c.modules.os.system(cmd_slicekill)
 151                         cmd = """ shutdown -r +1 & """
 152                         print "   Restarting %s : %s" % ( self.node, cmd)
 153                         self.c.modules.os.system(cmd)
 154
 155                         pflags.setRecentFlag('gentlekill')
 156                         pflags.save()
 157                 else:
 158                         print "   Restarting with sysrq 'sub' %s" % self.node
 159                         cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
 160                         self.c.modules.os.system(cmd)
 161
 162                 return
 163
 164         def restart_bootmanager(self, forceState):
 165
 166                 self.c.modules.os.chdir('/tmp/source')
 167                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
 168                         print "   BootManager is already running: try again soon..."
 169                 else:
 170                         print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
 171                         cmd = "( touch /tmp/BM_RUNNING ;  " + \
 172                               "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
 173                                   "  rm -f /tmp/BM_RUNNING " + \
 174                                   ") &"
 175                         cmd = cmd % forceState
 176                         self.c.modules.os.system(cmd)
 177
 178                 return
 179
 180
 181 import random
 182 class PlanetLabSession:
 183         globalport = 22000 + int(random.random()*1000)
 184
 185         def __init__(self, node, nosetup, verbose):
 186                 self.verbose = verbose
 187                 self.node = node
 188                 self.port = None
 189                 self.nosetup = nosetup
 190                 self.command = None
 191                 self.setup_host()
 192
 193         def get_connection(self, config):
 194                 return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
 195
 196         def setup_host(self):
 197                 self.port = PlanetLabSession.globalport
 198                 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
 199
 200                 args = {}
 201                 args['port'] = self.port
 202                 args['user'] = 'root'
 203                 args['hostname'] = self.node
 204                 args['monitordir'] = "/home/soltesz/monitor"
 205                 ssh_port = 22
 206
 207                 if self.nosetup:
 208                         print "Skipping setup"
 209                         return
 210
 211                 # COPY Rpyc files to host
 212                 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
 213                 if self.verbose: print cmd
 214                 # TODO: Add timeout
 215                 timeout = 120
 216                 localos = soltesz.CMD()
 217
 218                 ret = localos.system(cmd, timeout)
 219                 print ret
 220                 if ret != 0:
 221                         print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
 222                         #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
 223                         k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
 224                         ret = localos.system(cmd, timeout)
 225                         print ret
 226                         if ret != 0:
 227                                 print "\tFAILED TWICE"
 228                                 #sys.exit(1)
 229                                 raise Exception("Failed twice trying to login with updated ssh host key")
 230
 231                 t1 = time.time()
 232                 # KILL any already running servers.
 233                 ssh = soltesz.SSH(args['user'], args['hostname'], ssh_port)
 234                 (ov,ev) = ssh.run_noexcept2("""<<\EOF
 235             rm -f out.log
 236             echo "kill server" >> out.log
 237             ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
 238             echo "export" >> out.log
 239             export PYTHONPATH=$HOME  ;
 240             echo "start server" >> out.log
 241             python Rpyc/Servers/forking_server.py &> server.log &
 242             echo "done" >> out.log
 243 EOF""")
 244                 #cmd = """ssh %(user)s@%(hostname)s """ + \
 245                 #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
 246                 #cmd = cmd % args
 247                 #if self.verbose: print cmd
 248                 ## TODO: Add timeout
 249                 #print localos.system(cmd,timeout)
 250
 251                 ## START a new rpyc server.
 252                 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
 253                 #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
 254                 #cmd = cmd % args
 255                 #if self.verbose: print cmd
 256                 #print localos.system(cmd,timeout)
 257                 print ssh.ret
 258
 259                 # TODO: Add timeout
 260                 # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
 261                 # and the following options seems to work well.
 262                 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
 263                           """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
 264                           """-o ConnectTimeout=120 """ + \
 265                           """-n -N -L %(port)s:localhost:18812 """ + \
 266                           """%(user)s@%(hostname)s"""
 267                 cmd = cmd % args
 268                 if self.verbose: print cmd
 269                 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
 270                 # TODO: the read() here may block indefinitely.  Need a better
 271                 # approach therefore, that includes a timeout.
 272                 #ret = self.command.stdout.read(5)
 273                 ret = soltesz.read_t(self.command.stdout, 5)
 274
 275                 t2 = time.time()
 276                 if 'READY' in ret:
 277                         # NOTE: There is still a slight race for machines that are slow...
 278                         self.timeout = 2*(t2-t1)
 279                         print "Sleeping for %s sec" % self.timeout
 280                         time.sleep(self.timeout)
 281                         return
 282
 283                 if self.command.returncode is not None:
 284                         print "Failed to establish tunnel!"
 285                         raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
 286
 287                 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
 288
 289         def __del__(self):
 290                 if self.command:
 291                         if self.verbose: print "Killing SSH session %s" % self.port
 292                         self.command.kill()
 293
 294
 295 def steps_to_list(steps):
 296         ret_list = []
 297         for (id,label) in steps:
 298                 ret_list.append(label)
 299         return ret_list
 300
 301 def index_to_id(steps,index):
 302         if index < len(steps):
 303                 return steps[index][0]
 304         else:
 305                 return "done"
 306
 307 def reboot(hostname, config=None, forced_action=None):
 308
 309         # NOTE: Nothing works if the bootcd is REALLY old.
 310         #       So, this is the first step.
 311         fbnode = get_fbnode(hostname)
 312         if fbnode['category'] == "OLDBOOTCD":
 313                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
 314                 args = {}
 315                 args['hostname_list'] = "    %s" % hostname
 316
 317                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
 318                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
 319
 320                 loginbase = plc.siteId(hostname)
 321                 m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
 322
 323                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
 324                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
 325                 return True
 326
 327         node = hostname
 328         print "Creating session for %s" % node
 329         # update known_hosts file (in case the node has rebooted since last run)
 330         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
 331         try:
 332                 k = SSHKnownHosts(); k.update(node); k.write(); del k
 333         except:
 334                 import traceback; print traceback.print_exc()
 335                 return False
 336
 337         try:
 338                 if config == None:
 339                         session = PlanetLabSession(node, False, True)
 340                 else:
 341                         session = PlanetLabSession(node, config.nosetup, config.verbose)
 342         except Exception, e:
 343                 print "ERROR setting up session for %s" % hostname
 344                 import traceback; print traceback.print_exc()
 345                 print e
 346                 return False
 347
 348         try:
 349                 conn = session.get_connection(config)
 350         except EOFError:
 351                 # NOTE: sometimes the wait in setup_host() is not long enough.
 352                 # So, here we try to wait a little longer before giving up entirely.
 353                 try:
 354                         time.sleep(session.timeout*4)
 355                         conn = session.get_connection(config)
 356                 except:
 357                         import traceback; print traceback.print_exc()
 358                         return False
 359
 360
 361         if forced_action == "reboot":
 362                 conn.restart_node('rins')
 363                 return True
 364
 365         boot_state = conn.get_boot_state()
 366         if boot_state == "boot":
 367                 print "...Boot state of %s already completed : skipping..." % node
 368                 return True
 369         elif boot_state == "unknown":
 370                 print "...Unknown bootstate for %s : skipping..."% node
 371                 return False
 372         else:
 373                 pass
 374
 375         if conn.bootmanager_running():
 376                 print "...BootManager is currently running.  Skipping host %s" % node
 377                 return True
 378
 379         #if config != None:
 380         #       if config.force:
 381         #               conn.restart_bootmanager(config.force)
 382         #               return True
 383
 384         # Read persistent flags, tagged on one week intervals.
 385         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
 386
 387
 388         if config and not config.quiet: print "...downloading dmesg from %s" % node
 389         dmesg = conn.get_dmesg()
 390         child = fdpexpect.fdspawn(dmesg)
 391
 392         sequence = []
 393         while True:
 394                 steps = [
 395                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
 396                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
 397                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
 398
 399                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
 400                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
 401                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
 402                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
 403                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
 404                         ('floppytimeout','floppy0: floppy timeout called'),
 405                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
 406
 407                         # floppy0: floppy timeout called
 408                         # end_request: I/O error, dev fd0, sector 0
 409
 410                         #Buffer I/O error on device dm-2, logical block 8888896
 411                         #ata1: status=0x51 { DriveReady SeekComplete Error }
 412                         #ata1: error=0x40 { UncorrectableError }
 413                         #SCSI error : <0 0 0 0> return code = 0x8000002
 414                         #sda: Current: sense key: Medium Error
 415                         #       Additional sense: Unrecovered read error - auto reallocate failed
 416
 417                         #SCSI error : <0 2 0 0> return code = 0x40001
 418                         #end_request: I/O error, dev sda, sector 572489600
 419                 ]
 420                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
 421                 sequence.append(id)
 422
 423                 if id == "done":
 424                         break
 425
 426         s = Set(sequence)
 427         if config and not config.quiet: print "\tSET: ", s
 428
 429         if len(s) > 1:
 430                 print "...Potential drive errors on %s" % node
 431                 if len(s) == 2 and 'floppyerror' in s:
 432                         print "...Should investigate.  Continuing with node."
 433                 else:
 434                         print "...Should investigate.  Skipping node."
 435                         # TODO: send message related to these errors.
 436                         args = {}
 437                         args['hostname'] = hostname
 438                         args['log'] = conn.get_dmesg().read()
 439
 440                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
 441                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 442
 443                         loginbase = plc.siteId(hostname)
 444                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
 445                         conn.set_nodestate('diag')
 446                         return False
 447
 448         print "...Downloading bm.log from %s" % node
 449         log = conn.get_bootmanager_log()
 450         child = fdpexpect.fdspawn(log)
 451
 452         try:
 453                 if config.collect: return True
 454         except:
 455                 pass
 456
 457         time.sleep(1)
 458
 459         if config and not config.quiet: print "...Scanning bm.log for errors"
 460         action_id = "dbg"
 461         sequence = []
 462         while True:
 463
 464                 steps = [
 465                         ('bminit'               , 'Initializing the BootManager.'),
 466                         ('cfg'                  , 'Reading node configuration file.'),
 467                         ('auth'                 , 'Authenticating node with PLC.'),
 468                         ('getplc'               , 'Retrieving details of node from PLC.'),
 469                         ('update'               , 'Updating node boot state at PLC.'),
 470                         ('hardware'             , 'Checking if hardware requirements met.'),
 471                         ('installinit'  , 'Install: Initializing.'),
 472                         ('installdisk'  , 'Install: partitioning disks.'),
 473                         ('installbootfs', 'Install: bootstrapfs tarball.'),
 474                         ('installcfg'   , 'Install: Writing configuration files.'),
 475                         ('installstop'  , 'Install: Shutting down installer.'),
 476                         ('update2'              , 'Updating node boot state at PLC.'),
 477                         ('installinit2' , 'Install: Initializing.'),
 478                         ('validate'             , 'Validating node installation.'),
 479                         ('rebuildinitrd', 'Rebuilding initrd'),
 480                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
 481                         ('update3'              , 'Updating node configuration.'),
 482                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
 483                         ('update4'              , 'Sending hardware configuration to PLC.'),
 484                         ('debug'                , 'Starting debug mode'),
 485                         ('bmexceptmount', 'BootManagerException during mount'),
 486                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
 487                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
 488                         ('exception'    , 'Exception'),
 489                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
 490                         ('protoerror'   , 'XML RPC protocol error'),
 491                         ('nodehostname' , 'Configured node hostname does not resolve'),
 492                         ('implementerror', 'Implementation Error'),
 493                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
 494                         ('noinstall'    , 'notinstalled'),
 495                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
 496                         ('noblockdev'   , "No block devices detected."),
 497                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
 498                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
 499                         ('hardwarerequirefail' , 'Hardware requirements not met'),
 500                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
 501                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
 502                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
 503                         ('modulefail'   , 'Unable to get list of system modules'),
 504                         ('writeerror'   , 'write error: No space left on device'),
 505                         ('nospace'      , "No space left on device"),
 506                         ('nonode'       , 'Failed to authenticate call: No such node'),
 507                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
 508                         ('bootcheckfail'     , 'BootCheckAuthentication'),
 509                         ('bootupdatefail'   , 'BootUpdateNode'),
 510                 ]
 511                 list = steps_to_list(steps)
 512                 index = child.expect( list + [ pexpect.EOF ])
 513                 id = index_to_id(steps,index)
 514                 sequence.append(id)
 515
 516                 if id == "exception":
 517                         if config and not config.quiet: print "...Found An Exception!!!"
 518                 elif index == len(list):
 519                         #print "Reached EOF"
 520                         break
 521
 522         s = "-".join(sequence)
 523         print "   FOUND SEQUENCE: ", s
 524
 525         # NOTE: We get or set the flag based on the current sequence identifier.
 526         #  By using the sequence identifier, we guarantee that there will be no
 527         #  frequent loops.  I'm guessing there is a better way to track loops,
 528         #  though.
 529         if not config.force and pflags.getRecentFlag(s):
 530                 pflags.setRecentFlag(s)
 531                 pflags.save()
 532                 print "... flag is set or it has already run recently. Skipping %s" % node
 533                 return True
 534
 535         sequences = {}
 536
 537
 538         # restart_bootmanager_boot
 539         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
 540                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
 541                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
 542                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
 543                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
 544                         "bminit-cfg-auth-getplc-update-debug-done",
 545                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
 546                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
 547                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
 548                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
 549                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
 550                         ]:
 551                 sequences.update({n : "restart_bootmanager_boot"})
 552
 553         #       conn.restart_bootmanager('rins')
 554         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
 555                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
 556                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
 557                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
 558                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
 559                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
 560                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
 561                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
 562                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
 563                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
 564                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
 565                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
 566                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
 567                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
 568                         ]:
 569                 sequences.update({n : "restart_bootmanager_rins"})
 570
 571         # repair_node_keys
 572         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
 573
 574         #   conn.restart_node('rins')
 575         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
 576                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
 577                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
 578                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
 579                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
 580                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
 581                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
 582                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
 583                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
 584                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
 585                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
 586                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
 587                         ]:
 588                 sequences.update({n : "restart_node_rins"})
 589
 590         #       restart_node_boot
 591         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
 592                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
 593                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
 594                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
 595                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
 596                          ]:
 597                 sequences.update({n: "restart_node_boot"})
 598
 599         # update_node_config_email
 600         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
 601                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
 602                         ]:
 603                 sequences.update({n : "update_node_config_email"})
 604
 605         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", ]:
 606                 sequences.update({n : "nodenetwork_email"})
 607
 608         # update_bootcd_email
 609         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
 610                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
 611                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
 612                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
 613                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
 614                         ]:
 615                 sequences.update({n : "update_bootcd_email"})
 616
 617         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
 618                         ]:
 619                 sequences.update({n: "suspect_error_email"})
 620
 621         # update_hardware_email
 622         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
 623         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
 624
 625         # broken_hardware_email
 626         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
 627
 628         flag_set = True
 629
 630
 631         if s not in sequences:
 632                 print "   HOST %s" % hostname
 633                 print "   UNKNOWN SEQUENCE: %s" % s
 634
 635                 args = {}
 636                 args['hostname'] = hostname
 637                 args['sequence'] = s
 638                 args['bmlog'] = conn.get_bootmanager_log().read()
 639                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
 640                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
 641                 m.reset()
 642                 m.send(['monitor-list@lists.planet-lab.org'])
 643
 644                 conn.restart_bootmanager('boot')
 645
 646                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
 647                 # This way, we can check it again after we've fixed it.
 648                 flag_set = False
 649
 650         else:
 651
 652                 if   sequences[s] == "restart_bootmanager_boot":
 653                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
 654                         conn.restart_bootmanager('boot')
 655                 elif sequences[s] == "restart_bootmanager_rins":
 656                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
 657                         conn.restart_bootmanager('rins')
 658                 elif sequences[s] == "restart_node_rins":
 659                         conn.restart_node('rins')
 660                 elif sequences[s] == "restart_node_boot":
 661                         conn.restart_node('boot')
 662                 elif sequences[s] == "repair_node_keys":
 663                         if conn.compare_and_repair_nodekeys():
 664                                 # the keys either are in sync or were forced in sync.
 665                                 # so try to reboot the node again.
 666                                 conn.restart_bootmanager('rins')
 667                                 pass
 668                         else:
 669                                 # there was some failure to synchronize the keys.
 670                                 print "...Unable to repair node keys on %s" % node
 671
 672                 elif sequences[s] == "suspect_error_email":
 673                         args = {}
 674                         args['hostname'] = hostname
 675                         args['sequence'] = s
 676                         args['bmlog'] = conn.get_bootmanager_log().read()
 677                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
 678                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
 679                         m.reset()
 680                         m.send(['monitor-list@lists.planet-lab.org'])
 681
 682                         conn.restart_bootmanager('boot')
 683
 684                 elif sequences[s] == "update_node_config_email":
 685                         print "...Sending message to UPDATE NODE CONFIG"
 686                         args = {}
 687                         args['hostname'] = hostname
 688                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args,
 689                                                                 True, db='nodeid_persistmessages')
 690                         loginbase = plc.siteId(hostname)
 691                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
 692                         conn.dump_plconf_file()
 693                         conn.set_nodestate('diag')
 694
 695                 elif sequences[s] == "nodenetwork_email":
 696                         print "...Sending message to LOOK AT NODE NETWORK"
 697                         args = {}
 698                         args['hostname'] = hostname
 699                         args['bmlog'] = conn.get_bootmanager_log().read()
 700                         m = PersistMessage(hostname,  mailtxt.plnode_network[0] % args,  mailtxt.plnode_cfg[1] % args,
 701                                                                 True, db='nodenet_persistmessages')
 702                         loginbase = plc.siteId(hostname)
 703                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
 704                         conn.dump_plconf_file()
 705                         conn.set_nodestate('diag')
 706
 707                 elif sequences[s] == "update_bootcd_email":
 708                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
 709                         import getconf
 710                         args = {}
 711                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
 712                         args['hostname_list'] = "%s" % hostname
 713
 714                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
 715                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
 716
 717                         loginbase = plc.siteId(hostname)
 718                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
 719
 720                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
 721                         conn.set_nodestate('disable')
 722
 723                 elif sequences[s] == "broken_hardware_email":
 724                         # MAKE An ACTION record that this host has failed hardware.  May
 725                         # require either an exception "/minhw" or other manual intervention.
 726                         # Definitely need to send out some more EMAIL.
 727                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
 728                         # TODO: email notice of broken hardware
 729                         args = {}
 730                         args['hostname'] = hostname
 731                         args['log'] = conn.get_dmesg().read()
 732                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
 733                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 734
 735                         loginbase = plc.siteId(hostname)
 736                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
 737                         conn.set_nodestate('disable')
 738
 739                 elif sequences[s] == "update_hardware_email":
 740                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
 741                         args = {}
 742                         args['hostname'] = hostname
 743                         args['bmlog'] = conn.get_bootmanager_log().read()
 744                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
 745                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
 746
 747                         loginbase = plc.siteId(hostname)
 748                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
 749                         conn.set_nodestate('disable')
 750
 751         if flag_set:
 752                 pflags.setRecentFlag(s)
 753                 pflags.save()
 754
 755         return True
 756
 757
 758 # MAIN -------------------------------------------------------------------
 759
 760 def main():
 761         from config import config
 762         from optparse import OptionParser
 763         parser = OptionParser()
 764         parser.set_defaults(node=None, nodelist=None, child=False, collect=False, nosetup=False, verbose=False, force=None, quiet=False)
 765         parser.add_option("", "--child", dest="child", action="store_true",
 766                                                 help="This is the child mode of this process.")
 767         parser.add_option("", "--force", dest="force", metavar="boot_state",
 768                                                 help="Force a boot state passed to BootManager.py.")
 769         parser.add_option("", "--quiet", dest="quiet", action="store_true",
 770                                                 help="Extra quiet output messages.")
 771         parser.add_option("", "--verbose", dest="verbose", action="store_true",
 772                                                 help="Extra debug output messages.")
 773         parser.add_option("", "--collect", dest="collect", action="store_true",
 774                                                 help="No action, just collect dmesg, and bm.log")
 775         parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 776                                                 help="Do not perform the orginary setup phase.")
 777         parser.add_option("", "--node", dest="node", metavar="nodename.edu",
 778                                                 help="A single node name to try to bring out of debug mode.")
 779         parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt",
 780                                                 help="A list of nodes to bring out of debug mode.")
 781         config = config(parser)
 782         config.parse_args()
 783
 784         if config.nodelist:
 785                 nodes = config.getListFromFile(config.nodelist)
 786         elif config.node:
 787                 nodes = [ config.node ]
 788         else:
 789                 parser.print_help()
 790                 sys.exit(1)
 791
 792         for node in nodes:
 793                 reboot(node, config)
 794
 795 if __name__ == "__main__":
 796         main()