Reduce false exceptions that should be ignored or handled better in the code.
[monitor.git] / bootman.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5 import plc
6 api = plc.getAuthAPI()
7
8 import sys
9 import os
10 import const
11
12 from getsshkeys import SSHKnownHosts
13
14 import subprocess
15 import time
16 import database
17 import moncommands
18 from sets import Set
19
20 import ssh.pxssh as pxssh
21 import ssh.fdpexpect as fdpexpect
22 import ssh.pexpect as pexpect
23 from unified_model import *
24 from emailTxt import mailtxt
25 from nodeconfig import network_config_to_str
26 import traceback
27 import config
28
class ExceptionDoubleSSHError(Exception):
    """ssh/rsync to a node failed twice, even after refreshing its host key."""
    pass
30
31 import signal
class Sopen(subprocess.Popen):
    """Popen subclass that can deliver an arbitrary signal to the child."""
    def kill(self, signal = signal.SIGTERM):
        # The default is bound to signal.SIGTERM at def time; inside the
        # method, `signal` is the signal number to send, not the module.
        os.kill(self.pid, signal)
35
36 #from Rpyc import SocketConnection, Async
37 from Rpyc import SocketConnection, Async
38 from Rpyc.Utils import *
39 fb = None
40
def get_fbnode(node):
    """Return the 'values' record for *node* from the findbad database.

    The database is loaded once on first use and cached in the module-level
    `fb` global; later calls reuse the cached copy.
    """
    global fb
    if fb is None:
        fb = database.dbLoad("findbad")
    record = fb['nodes'][node]
    return record['values']
47
class NodeConnection:
    """Remote-control handle for one node, built on an Rpyc connection.

    Every `self.c.modules.X` attribute access proxies module X on the node
    itself, so e.g. `self.c.modules.os.system(...)` runs ON the remote host,
    while plain `os.system(...)` runs locally.
    """

    def __init__(self, connection, node, config):
        # connection : Rpyc SocketConnection to the node (through the tunnel)
        # node       : hostname; used for log filenames and PLC API calls
        # config     : options object (may be None); only .quiet is read here
        self.node = node
        self.c = connection
        self.config = config

    def get_boot_state(self):
        """Classify the node: "dbg" (bootcd environment present), "boot"
        (normal /vservers filesystem), or "unknown"."""
        # /tmp/source only exists inside the BootManager/bootcd environment.
        if self.c.modules.os.path.exists('/tmp/source'):
            return "dbg"
        elif self.c.modules.os.path.exists('/vservers'): 
            return "boot"
        else:
            return "unknown"

    def get_dmesg(self):
        """Snapshot the remote dmesg, download it, and return an open local
        read handle (caller is responsible for closing it)."""
        self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
        download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
        log = open("log/dmesg.%s.log" % self.node, 'r')
        return log

    def get_bootmanager_log(self):
        """Download /tmp/bm.log from the node and return an open local read
        handle to the copy."""
        download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
        # NOTE(review): despite the local .gz name the file is copied, not
        # decompressed — presumably the remote log is plain text; confirm.
        #os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
        os.system("cp log/bm.%s.log.gz log/bm.%s.log" % (self.node, self.node))
        log = open("log/bm.%s.log" % self.node, 'r')
        return log

    def dump_plconf_file(self):
        """Run BootManager's ReadNodeConfiguration step on the node and print
        the resulting VARS (the node's configuration), unless config.quiet."""
        c = self.c
        # BootManager lives in /tmp/source on the bootcd; make it importable
        # and make relative paths inside it resolve.
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "   ERROR:", x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue and self.config and not self.config.quiet:
            for key in bm.VARS.keys():
                print key, " == ", bm.VARS[key]
        else:
            # NOTE(review): this branch also fires when VARS *were* read but
            # config is None/quiet, so the message can be misleading.
            if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
        

    def compare_and_repair_nodekeys(self):
        """Compare the node's on-disk NODE_KEY with PLC's record, pushing the
        node's key up to PLC when they differ.

        Returns True on match or successful repair, False if UpdateNode
        fails, and falls through (None) when the node configuration cannot
        be read.
        """
        c = self.c
        self.c.modules.sys.path.append("/tmp/source/")
        self.c.modules.os.chdir('/tmp/source')

        log = c.modules.BootManager.log('/tmp/new.log')
        bm = c.modules.BootManager.BootManager(log,'boot')

        BootManagerException = c.modules.Exceptions.BootManagerException
        InitializeBootManager = c.modules.BootManager.InitializeBootManager
        ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
        bm_continue = True

        # PLC's current idea of this node (its stored key in particular).
        plcnode = api.GetNodes({'hostname': self.node}, None)[0]

        InitializeBootManager.Run(bm.VARS, bm.LOG)
        try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
        except Exception, x:
            bm_continue = False
            print "exception"
            print x
            print "   Possibly, unable to find valid configuration file"

        if bm_continue:
            print "   NODE: %s" % bm.VARS['NODE_KEY']
            print "   PLC : %s" % plcnode['key']

            if bm.VARS['NODE_KEY'] == plcnode['key']:
                return True
            else:
                # Treat the node's on-disk key as authoritative and push it.
                if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
                    print "   Successfully updated NODE_KEY with PLC"
                    return True
                else:
                    return False
                
            #for key in bm.VARS.keys():
            #       print key, " == ", bm.VARS[key]
        else:
            print "   Unable to retrieve NODE_KEY"

    def bootmanager_running(self):
        """True if the /tmp/BM_RUNNING marker (set by restart_bootmanager's
        wrapper command) exists on the node."""
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            return True
        else:
            return False

    def set_nodestate(self, state='boot'):
        """Set the node's boot_state at PLC; returns the UpdateNode result."""
        return api.UpdateNode(self.node, {'boot_state' : state})

    def restart_node(self, state='boot'):
        """Set boot_state at PLC and reboot the machine.

        The first attempt inside the persist-flag window (1 day, presumably
        in seconds — confirm PersistFlags units) is "gentle": kill all slice
        processes and schedule `shutdown -r`.  If a gentle kill was already
        tried recently, force a reboot via sysrq s/u/b instead.
        """
        api.UpdateNode(self.node, {'boot_state' : state})

        pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
        if not pflags.getRecentFlag('gentlekill'):
            print "   Killing all slice processes... : %s" %  self.node
            # SIGKILL every vserver context (xid) found under /proc/virtual.
            cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
            self.c.modules.os.system(cmd_slicekill)
            cmd = """ shutdown -r +1 & """
            print "   Restarting %s : %s" % ( self.node, cmd)
            self.c.modules.os.system(cmd)

            pflags.setRecentFlag('gentlekill')
            pflags.save()
        else:
            # sysrq: 's' sync disks, 'u' remount read-only, 'b' reboot —
            # bypasses a wedged init/shutdown path.
            print "   Restarting with sysrq 'sub' %s" % self.node
            cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
            self.c.modules.os.system(cmd)

        return

    def restart_bootmanager(self, forceState):
        """Launch `BootManager.py <forceState>` on the node in the background,
        unless an instance is already running (/tmp/BM_RUNNING marker)."""
        self.c.modules.os.chdir('/tmp/source')
        if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
            print "   BootManager is already running: try again soon..."
        else:
            print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
            # The marker file guards against concurrent runs and is removed
            # once BootManager exits.
            cmd = "( touch /tmp/BM_RUNNING ;  " + \
                  "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
                  "  rm -f /tmp/BM_RUNNING " + \
                  ") &" 
            cmd = cmd % forceState
            self.c.modules.os.system(cmd)

        return 
188
189
190 import random
class PlanetLabSession:
    """Sets up an Rpyc server on a node plus a local ssh tunnel to it.

    setup_host() rsyncs the Rpyc sources to the node, (re)starts a forking
    Rpyc server there, and opens an ssh port-forward from a unique local
    port to the remote server; get_connection() then builds a NodeConnection
    over that tunnel.
    """
    # Class-wide counter for local tunnel ports; the random base makes
    # clashes between concurrent monitor processes unlikely.  Incremented
    # once per session in setup_host().
    globalport = 22000 + int(random.random()*1000)

    def __init__(self, node, nosetup, verbose):
        # node    : hostname to manage
        # nosetup : skip host setup (assume server/tunnel already exist)
        # verbose : echo the commands being run
        self.verbose = verbose
        self.node = node
        self.port = None
        self.nosetup = nosetup
        self.command = None   # Sopen handle for the ssh tunnel process
        self.setup_host()

    def get_connection(self, config):
        """Return a NodeConnection speaking Rpyc through the local tunnel."""
        return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
    
    def setup_host(self):
        """Prepare the node and open the ssh tunnel.

        Raises ExceptionDoubleSSHError if rsync over ssh fails even after
        refreshing the node's known_hosts entry, and a generic Exception if
        the tunnel cannot be established or never reports READY.
        """
        # Claim a unique local port for this session's tunnel.
        self.port = PlanetLabSession.globalport
        PlanetLabSession.globalport = PlanetLabSession.globalport + 1

        args = {}
        args['port'] = self.port
        args['user'] = 'root'
        args['hostname'] = self.node
        args['monitordir'] = config.MONITOR_SCRIPT_ROOT
        ssh_port = 22

        if self.nosetup:
            print "Skipping setup"
            return 

        # COPY Rpyc files to host
        cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
        if self.verbose: print cmd
        # TODO: Add timeout
        timeout = 120
        localos = moncommands.CMD()

        ret = localos.system(cmd, timeout)
        print ret
        if ret != 0:
            # First failure is assumed to be a stale ssh host key (the node
            # may have been reinstalled); refresh known_hosts and retry once.
            print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
            #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
            k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
            ret = localos.system(cmd, timeout)
            print ret
            if ret != 0:
                print "\tFAILED TWICE"
                #sys.exit(1)
                raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")

        t1 = time.time()
        # KILL any already running servers, then start a fresh forking Rpyc
        # server on the node (it listens for the tunnel opened below).
        ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
        (ov,ev) = ssh.run_noexcept2("""<<\EOF
            rm -f out.log
            echo "kill server" >> out.log
            ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ; 
            echo "export" >> out.log
            export PYTHONPATH=$HOME  ;
            echo "start server" >> out.log
            python Rpyc/Servers/forking_server.py &> server.log &
            echo "done" >> out.log
EOF""")
        #cmd = """ssh %(user)s@%(hostname)s """ + \
        #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
        #cmd = cmd % args
        #if self.verbose: print cmd
        ## TODO: Add timeout
        #print localos.system(cmd,timeout)

        ## START a new rpyc server.
        #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
        #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
        #cmd = cmd % args
        #if self.verbose: print cmd
        #print localos.system(cmd,timeout)
        print ssh.ret

        # TODO: Add timeout
        # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
        # and the following options seems to work well.
        # LocalCommand prints "READY" on our side once the connection is up,
        # and -L forwards the chosen local port to the node's Rpyc server.
        cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
              """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
              """-o ConnectTimeout=120 """ + \
              """-n -N -L %(port)s:localhost:18812 """ + \
              """%(user)s@%(hostname)s"""
        cmd = cmd % args
        if self.verbose: print cmd
        self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
        # TODO: the read() here may block indefinitely.  Need a better
        # approach therefore, that includes a timeout.
        #ret = self.command.stdout.read(5)
        ret = moncommands.read_t(self.command.stdout, 5)

        t2 = time.time()
        if 'READY' in ret:
            # NOTE: There is still a slight race for machines that are slow...
            # Give the remote server twice the time setup took so far.
            self.timeout = 2*(t2-t1)
            print "Sleeping for %s sec" % self.timeout
            time.sleep(self.timeout)
            return

        if self.command.returncode is not None:
            print "Failed to establish tunnel!"
            raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

        raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

    def __del__(self):
        # Tear down the tunnel process when the session is collected.
        if self.command:
            if self.verbose: print "Killing SSH session %s" % self.port
            self.command.kill()
302
303
def steps_to_list(steps):
    """Return the pattern (second element) of each (id, pattern) pair, in
    order, suitable for handing to pexpect's expect().

    Rewritten as a comprehension; the original's manual append loop also
    shadowed the builtin `id`.
    """
    return [pattern for _step_id, pattern in steps]
309
def index_to_id(steps,index):
    """Map an expect() match index back to its step id.

    An index past the end of *steps* corresponds to the extra pexpect.EOF
    entry callers append to the pattern list, and yields the sentinel
    "done".
    """
    if index >= len(steps):
        return "done"
    return steps[index][0]
315
316 def reboot(hostname, config=None, forced_action=None):
317
318         # NOTE: Nothing works if the bootcd is REALLY old.
319         #       So, this is the first step.
320         fbnode = get_fbnode(hostname)
321         if fbnode['category'] == "OLDBOOTCD":
322                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
323                 args = {}
324                 args['hostname_list'] = "    %s" % hostname
325
326                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
327                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
328
329                 loginbase = plc.siteId(hostname)
330                 emails = plc.getTechEmails(loginbase)
331                 m.send(emails) 
332
333                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
334                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
335                 return True
336
337         node = hostname
338         print "Creating session for %s" % node
339         # update known_hosts file (in case the node has rebooted since last run)
340         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
341         try:
342                 k = SSHKnownHosts(); k.update(node); k.write(); del k
343         except:
344                 from nodecommon import email_exception
345                 email_exception()
346                 print traceback.print_exc()
347                 return False
348
349         try:
350                 if config == None:
351                         session = PlanetLabSession(node, False, True)
352                 else:
353                         session = PlanetLabSession(node, config.nosetup, config.verbose)
354         except ExceptionDoubleSSHError, e:
355                 msg = "ERROR setting up session for %s" % hostname
356                 print msg
357                 return False
358         except Exception, e:
359                 msg = "ERROR setting up session for %s" % hostname
360                 print msg
361                 print traceback.print_exc()
362                 from nodecommon import email_exception
363                 email_exception(msg)
364                 print e
365                 return False
366
367         try:
368                 conn = session.get_connection(config)
369         except EOFError:
370                 # NOTE: sometimes the wait in setup_host() is not long enough.  
371                 # So, here we try to wait a little longer before giving up entirely.
372                 try:
373                         time.sleep(session.timeout*4)
374                         conn = session.get_connection(config)
375                 except EOFError:
376                         # failed twice... no need to report this really, it's just in a
377                         # weird state...
378                         return False
379                 except:
380                         print traceback.print_exc()
381                         from nodecommon import email_exception
382                         email_exception(node)
383                         return False
384
385         if forced_action == "reboot":
386                 conn.restart_node('rins')
387                 return True
388
389         boot_state = conn.get_boot_state()
390         if boot_state == "boot":
391                 print "...Boot state of %s already completed : skipping..." % node
392                 return True
393         elif boot_state == "unknown":
394                 print "...Unknown bootstate for %s : skipping..."% node
395                 return False
396         else:
397                 pass
398
399         if conn.bootmanager_running():
400                 print "...BootManager is currently running.  Skipping host %s" % node
401                 return True
402
403         #if config != None:
404         #       if config.force:
405         #               conn.restart_bootmanager(config.force)
406         #               return True
407
408         # Read persistent flags, tagged on one week intervals.
409         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
410                 
411
412         if config and not config.quiet: print "...downloading dmesg from %s" % node
413         dmesg = conn.get_dmesg()
414         child = fdpexpect.fdspawn(dmesg)
415
416         sequence = []
417         while True:
418                 steps = [
419                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
420                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
421                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
422
423                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
424
425                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
426                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
427
428                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
429                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
430
431                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
432                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
433
434                         ('floppytimeout','floppy0: floppy timeout called'),
435                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
436
437                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
438                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
439
440                         # floppy0: floppy timeout called
441                         # end_request: I/O error, dev fd0, sector 0
442
443                         # Buffer I/O error on device dm-2, logical block 8888896
444                         # ata1: status=0x51 { DriveReady SeekComplete Error }
445                         # ata1: error=0x40 { UncorrectableError }
446                         # SCSI error : <0 0 0 0> return code = 0x8000002
447                         # sda: Current: sense key: Medium Error
448                         #       Additional sense: Unrecovered read error - auto reallocate failed
449
450                         # SCSI error : <0 2 0 0> return code = 0x40001
451                         # end_request: I/O error, dev sda, sector 572489600
452                 ]
453                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
454                 sequence.append(id)
455
456                 if id == "done":
457                         break
458
459         s = Set(sequence)
460         if config and not config.quiet: print "\tSET: ", s
461
462         if len(s) > 1:
463                 print "...Potential drive errors on %s" % node
464                 if len(s) == 2 and 'floppyerror' in s:
465                         print "...Should investigate.  Continuing with node."
466                 else:
467                         print "...Should investigate.  Skipping node."
468                         # TODO: send message related to these errors.
469                         args = {}
470                         args['hostname'] = hostname
471                         args['log'] = conn.get_dmesg().read()
472
473                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
474                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
475
476                         loginbase = plc.siteId(hostname)
477                         emails = plc.getTechEmails(loginbase)
478                         m.send(emails) 
479                         conn.set_nodestate('disable')
480                         return False
481
482         print "...Downloading bm.log from %s" % node
483         log = conn.get_bootmanager_log()
484         child = fdpexpect.fdspawn(log)
485
486         try:
487                 if config.collect: return True
488         except:
489                 pass
490
491         time.sleep(1)
492
493         if config and not config.quiet: print "...Scanning bm.log for errors"
494         action_id = "dbg"
495         sequence = []
496         while True:
497
498                 steps = [
499                         ('bminit'               , 'Initializing the BootManager.'),
500                         ('cfg'                  , 'Reading node configuration file.'),
501                         ('auth'                 , 'Authenticating node with PLC.'),
502                         ('getplc'               , 'Retrieving details of node from PLC.'),
503                         ('update'               , 'Updating node boot state at PLC.'),
504                         ('hardware'             , 'Checking if hardware requirements met.'),
505                         ('installinit'  , 'Install: Initializing.'),
506                         ('installdisk'  , 'Install: partitioning disks.'),
507                         ('installbootfs', 'Install: bootstrapfs tarball.'),
508                         ('installcfg'   , 'Install: Writing configuration files.'),
509                         ('installstop'  , 'Install: Shutting down installer.'),
510                         ('update2'              , 'Updating node boot state at PLC.'),
511                         ('installinit2' , 'Install: Initializing.'),
512                         ('validate'             , 'Validating node installation.'),
513                         ('rebuildinitrd', 'Rebuilding initrd'),
514                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
515                         ('update3'              , 'Updating node configuration.'),
516                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
517                         ('update4'              , 'Sending hardware configuration to PLC.'),
518                         ('debug'                , 'Starting debug mode'),
519                         ('bmexceptmount', 'BootManagerException during mount'),
520                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
521                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
522                         ('exception'    , 'Exception'),
523                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
524                         ('protoerror'   , 'XML RPC protocol error'),
525                         ('nodehostname' , 'Configured node hostname does not resolve'),
526                         ('implementerror', 'Implementation Error'),
527                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
528                         ('noinstall'    , 'notinstalled'),
529                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
530                         ('noblockdev'   , "No block devices detected."),
531                         ('dnserror'     , 'Name or service not known'),
532                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
533                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
534                         ('hardwarerequirefail' , 'Hardware requirements not met'),
535                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
536                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
537                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
538                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
539                         ('modulefail'   , 'Unable to get list of system modules'),
540                         ('writeerror'   , 'write error: No space left on device'),
541                         ('nospace'      , "No space left on device"),
542                         ('nonode'       , 'Failed to authenticate call: No such node'),
543                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
544                         ('bootcheckfail'     , 'BootCheckAuthentication'),
545                         ('bootupdatefail'   , 'BootUpdateNode'),
546                 ]
547                 list = steps_to_list(steps)
548                 index = child.expect( list + [ pexpect.EOF ])
549                 id = index_to_id(steps,index)
550                 sequence.append(id)
551
552                 if id == "exception":
553                         if config and not config.quiet: print "...Found An Exception!!!"
554                 elif index == len(list):
555                         #print "Reached EOF"
556                         break
557                 
558         s = "-".join(sequence)
559         print "   FOUND SEQUENCE: ", s
560
561         # NOTE: We get or set the flag based on the current sequence identifier.
562         #  By using the sequence identifier, we guarantee that there will be no
563         #  frequent loops.  I'm guessing there is a better way to track loops,
564         #  though.
565         #if not config.force and pflags.getRecentFlag(s):
566         #       pflags.setRecentFlag(s)
567         #       pflags.save() 
568         #       print "... flag is set or it has already run recently. Skipping %s" % node
569         #       return True
570
571         sequences = {}
572
573
574         # restart_bootmanager_boot
575         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
576                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
577                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
578
579                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
580
581                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
582                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
583                         "bminit-cfg-auth-getplc-update-debug-done",
584                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
585                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
586                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
587                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
588                         "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
589                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
590                         ]:
591                 sequences.update({n : "restart_bootmanager_boot"})
592
593         #       conn.restart_bootmanager('rins')
594         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
595                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
596                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
597                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
598                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
599                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
600                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
601                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
602                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
603                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
604                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
605                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
606                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
607                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
608                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
609                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
610                         # actual solution appears to involve removing the bad files, and
611                         # continually trying to boot the node.
612                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
613                         "bminit-cfg-auth-getplc-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
614                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-bmexceptmount-exception-noinstall-update-debug-done",
615                         ]:
616                 sequences.update({n : "restart_bootmanager_rins"})
617
618         # repair_node_keys
619         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
620
621         #   conn.restart_node('rins')
622         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
623                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
624                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
625                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
626                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
627                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
628                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
629                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
630                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
631                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
632                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
633                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
634                         ]:
635                 sequences.update({n : "restart_node_rins"})
636
637         #       restart_node_boot
638         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
639                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
640                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
641                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
642                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
643                          "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
644                          ]:
645                 sequences.update({n: "restart_node_boot"})
646
647         # update_node_config_email
648         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
649                           "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
650                           "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
651                         ]:
652                 sequences.update({n : "update_node_config_email"})
653
654         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done", 
655                            "bminit-cfg-update-exception-nodehostname-update-debug-done", 
656                         ]:
657                 sequences.update({n : "nodenetwork_email"})
658
659         # update_bootcd_email
660         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
661                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
662                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
663                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
664                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
665                         ]:
666                 sequences.update({n : "update_bootcd_email"})
667
668         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
669                         ]:
670                 sequences.update({n: "suspect_error_email"})
671
672         # update_hardware_email
673         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
674         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
675
676         # broken_hardware_email
677         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
678
679         # bad_dns_email
680         for n in [ 
681          "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
682                 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
683                 ]:
684                 sequences.update( { n : "bad_dns_email"})
685
686         flag_set = True
687
688         
689         if s not in sequences:
690                 print "   HOST %s" % hostname
691                 print "   UNKNOWN SEQUENCE: %s" % s
692
693                 args = {}
694                 args['hostname'] = hostname
695                 args['sequence'] = s
696                 args['bmlog'] = conn.get_bootmanager_log().read()
697                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
698                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
699                 m.reset()
700                 m.send([config.cc_email]) 
701
702                 conn.restart_bootmanager('boot')
703
704                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
705                 # This way, we can check it again after we've fixed it.
706                 flag_set = False
707
708         else:
709
710                 if   sequences[s] == "restart_bootmanager_boot":
711                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
712                         conn.restart_bootmanager('boot')
713                 elif sequences[s] == "restart_bootmanager_rins":
714                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
715                         conn.restart_bootmanager('rins')
716                 elif sequences[s] == "restart_node_rins":
717                         conn.restart_node('rins')
718                 elif sequences[s] == "restart_node_boot":
719                         conn.restart_node('boot')
720                 elif sequences[s] == "repair_node_keys":
721                         if conn.compare_and_repair_nodekeys():
722                                 # the keys either are in sync or were forced in sync.
723                                 # so try to reboot the node again.
724                                 conn.restart_bootmanager('rins')
725                                 pass
726                         else:
727                                 # there was some failure to synchronize the keys.
728                                 print "...Unable to repair node keys on %s" % node
729
730                 elif sequences[s] == "suspect_error_email":
731                         args = {}
732                         args['hostname'] = hostname
733                         args['sequence'] = s
734                         args['bmlog'] = conn.get_bootmanager_log().read()
735                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
736                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
737                         m.reset()
738                         m.send([config.cc_email]) 
739
740                         conn.restart_bootmanager('boot')
741
742                 elif sequences[s] == "update_node_config_email":
743                         print "...Sending message to UPDATE NODE CONFIG"
744                         args = {}
745                         args['hostname'] = hostname
746                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
747                                                                 True, db='nodeid_persistmessages')
748                         loginbase = plc.siteId(hostname)
749                         emails = plc.getTechEmails(loginbase)
750                         m.send(emails) 
751                         conn.dump_plconf_file()
752                         conn.set_nodestate('disable')
753
754                 elif sequences[s] == "nodenetwork_email":
755                         print "...Sending message to LOOK AT NODE NETWORK"
756                         args = {}
757                         args['hostname'] = hostname
758                         args['bmlog'] = conn.get_bootmanager_log().read()
759                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
760                                                                 True, db='nodenet_persistmessages')
761                         loginbase = plc.siteId(hostname)
762                         emails = plc.getTechEmails(loginbase)
763                         m.send(emails) 
764                         conn.dump_plconf_file()
765                         conn.set_nodestate('disable')
766
767                 elif sequences[s] == "update_bootcd_email":
768                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
769                         import getconf
770                         args = {}
771                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
772                         args['hostname_list'] = "%s" % hostname
773
774                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
775                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
776
777                         loginbase = plc.siteId(hostname)
778                         emails = plc.getTechEmails(loginbase)
779                         m.send(emails) 
780
781                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
782                         conn.set_nodestate('disable')
783
784                 elif sequences[s] == "broken_hardware_email":
785                         # MAKE An ACTION record that this host has failed hardware.  May
786                         # require either an exception "/minhw" or other manual intervention.
787                         # Definitely need to send out some more EMAIL.
788                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
789                         # TODO: email notice of broken hardware
790                         args = {}
791                         args['hostname'] = hostname
792                         args['log'] = conn.get_dmesg().read()
793                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
794                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
795
796                         loginbase = plc.siteId(hostname)
797                         emails = plc.getTechEmails(loginbase)
798                         m.send(emails) 
799                         conn.set_nodestate('disable')
800
801                 elif sequences[s] == "update_hardware_email":
802                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
803                         args = {}
804                         args['hostname'] = hostname
805                         args['bmlog'] = conn.get_bootmanager_log().read()
806                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
807                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
808
809                         loginbase = plc.siteId(hostname)
810                         emails = plc.getTechEmails(loginbase)
811                         m.send(emails) 
812                         conn.set_nodestate('disable')
813
814                 elif sequences[s] == "bad_dns_email":
815                         print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
816                         args = {}
817                         try:
818                                 node = api.GetNodes(hostname)[0]
819                                 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
820                         except:
821                                 from nodecommon import email_exception
822                                 email_exception()
823                                 print traceback.print_exc()
824                                 # TODO: api error. skip email, b/c all info is not available,
825                                 # flag_set will not be recorded.
826                                 return False
827                         nodenet_str = network_config_to_str(net)
828
829                         args['hostname'] = hostname
830                         args['network_config'] = nodenet_str
831                         args['nodenetwork_id'] = net['nodenetwork_id']
832                         m = PersistMessage(hostname, mailtxt.baddns[0] % args,
833                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
834
835                         loginbase = plc.siteId(hostname)
836                         emails = plc.getTechEmails(loginbase)
837                         m.send(emails) 
838                         conn.set_nodestate('disable')
839
840         if flag_set:
841                 pflags.setRecentFlag(s)
842                 pflags.save() 
843
844         return True
845         
846
847 # MAIN -------------------------------------------------------------------
848
def main():
	"""Parse command-line options and attempt to reboot each requested node.

	Nodes are taken either from --nodelist (a file of hostnames, one per
	line) or from a single --node argument; with neither, the usage message
	is printed and the process exits with status 1.
	"""
	import parser as parsermodule
	parser = parsermodule.getParser()

	parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False, 
						force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nonet", dest="nonet", action="store_true", 
						help="Do not setup the network, use existing log files to re-run a test pass.")
	parser.add_option("", "--collect", dest="collect", action="store_true", 
						help="No action, just collect dmesg, and bm.log")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")

	# Layer the shared 'nodesets'/'defaults' option groups on top of the
	# script-specific options defined above.
	parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
	config = parsermodule.parse_args(parser)

	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		# A failure while handling one node should not abort the attempts
		# on the remaining nodes; log the exception and keep going.
		try:
			reboot(node, config)
		except Exception:
			traceback.print_exc()
883
# Script entry point: only run when invoked directly, not when imported.
if __name__ == "__main__":
	main()