bootman.py

   1 #!/usr/bin/python
   2
   3 # Attempt to reboot a node in debug state.
   4
   5 from monitor import const
   6 from monitor.database.info.model import *
   7 from monitor.wrapper import plc
   8 api = plc.getAuthAPI()
   9
  10 import sys
  11 import os
  12
  13 from getsshkeys import SSHKnownHosts
  14
  15 import subprocess
  16 import time
  17 from pcucontrol.util import command as moncommands
  18 from sets import Set
  19
  20 from pcucontrol.transports.ssh import pxssh as pxssh
  21 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
  22 from pcucontrol.transports.ssh import pexpect as pexpect
  23 from monitor.model import *
  24 from monitor.wrapper.emailTxt import mailtxt
  25 from nodeconfig import network_config_to_str
  26 import traceback
  27 from monitor import config
  28
  29 import signal
  30 class Sopen(subprocess.Popen):
  31         def kill(self, signal = signal.SIGTERM):
  32                 os.kill(self.pid, signal)
  33
  34 #from Rpyc import SocketConnection, Async
  35 from Rpyc import SocketConnection, Async
  36 from Rpyc.Utils import *
  37 fb = None
  38
  39 class NodeConnection:
  40         def __init__(self, connection, node, config):
  41                 self.node = node
  42                 self.c = connection
  43                 self.config = config
  44
  45         def get_boot_state(self):
  46                 if self.c.modules.os.path.exists('/tmp/source'):
  47                         return "dbg"
  48                 elif self.c.modules.os.path.exists('/vservers'):
  49                         return "boot"
  50                 else:
  51                         return "unknown"
  52
  53         def get_dmesg(self):
  54                 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
  55                 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
  56                 log = open("log/dmesg.%s.log" % self.node, 'r')
  57                 return log
  58
  59         def get_bootmanager_log(self):
  60                 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
  61                 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
  62                 log = open("log/bm.%s.log" % self.node, 'r')
  63                 return log
  64
  65         def dump_plconf_file(self):
  66                 c = self.c
  67                 self.c.modules.sys.path.append("/tmp/source/")
  68                 self.c.modules.os.chdir('/tmp/source')
  69
  70                 log = c.modules.BootManager.log('/tmp/new.log')
  71                 bm = c.modules.BootManager.BootManager(log,'boot')
  72
  73                 BootManagerException = c.modules.Exceptions.BootManagerException
  74                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
  75                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
  76                 bm_continue = True
  77
  78                 InitializeBootManager.Run(bm.VARS, bm.LOG)
  79                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
  80                 except Exception, x:
  81                         bm_continue = False
  82                         print "   ERROR:", x
  83                         print "   Possibly, unable to find valid configuration file"
  84
  85                 if bm_continue and self.config and not self.config.quiet:
  86                         for key in bm.VARS.keys():
  87                                 print key, " == ", bm.VARS[key]
  88                 else:
  89                         if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
  90
  91
  92         def compare_and_repair_nodekeys(self):
  93                 c = self.c
  94                 self.c.modules.sys.path.append("/tmp/source/")
  95                 self.c.modules.os.chdir('/tmp/source')
  96
  97                 log = c.modules.BootManager.log('/tmp/new.log')
  98                 bm = c.modules.BootManager.BootManager(log,'boot')
  99
 100                 BootManagerException = c.modules.Exceptions.BootManagerException
 101                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
 102                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
 103                 bm_continue = True
 104
 105                 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
 106
 107                 InitializeBootManager.Run(bm.VARS, bm.LOG)
 108                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
 109                 except Exception, x:
 110                         bm_continue = False
 111                         print "exception"
 112                         print x
 113                         print "   Possibly, unable to find valid configuration file"
 114
 115                 if bm_continue:
 116                         print "   NODE: %s" % bm.VARS['NODE_KEY']
 117                         print "   PLC : %s" % plcnode['key']
 118
 119                         if bm.VARS['NODE_KEY'] == plcnode['key']:
 120                                 return True
 121                         else:
 122                                 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
 123                                         print "   Successfully updated NODE_KEY with PLC"
 124                                         return True
 125                                 else:
 126                                         return False
 127
 128                         #for key in bm.VARS.keys():
 129                         #       print key, " == ", bm.VARS[key]
 130                 else:
 131                         print "   Unable to retrieve NODE_KEY"
 132
 133         def bootmanager_running(self):
 134                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
 135                         return True
 136                 else:
 137                         return False
 138
 139         def set_nodestate(self, state='boot'):
 140                 return api.UpdateNode(self.node, {'boot_state' : state})
 141
 142         def restart_node(self, state='boot'):
 143                 api.UpdateNode(self.node, {'boot_state' : state})
 144
 145                 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
 146                 if not pflags.getRecentFlag('gentlekill'):
 147                         print "   Killing all slice processes... : %s" %  self.node
 148                         cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
 149                         self.c.modules.os.system(cmd_slicekill)
 150                         cmd = """ shutdown -r +1 & """
 151                         print "   Restarting %s : %s" % ( self.node, cmd)
 152                         self.c.modules.os.system(cmd)
 153
 154                         pflags.setRecentFlag('gentlekill')
 155                         pflags.save()
 156                 else:
 157                         print "   Restarting with sysrq 'sub' %s" % self.node
 158                         cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
 159                         self.c.modules.os.system(cmd)
 160
 161                 return
 162
 163         def restart_bootmanager(self, forceState):
 164
 165                 self.c.modules.os.chdir('/tmp/source')
 166                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
 167                         print "   BootManager is already running: try again soon..."
 168                 else:
 169                         print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
 170                         cmd = "( touch /tmp/BM_RUNNING ;  " + \
 171                               "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
 172                                   "  rm -f /tmp/BM_RUNNING " + \
 173                                   ") &"
 174                         cmd = cmd % forceState
 175                         self.c.modules.os.system(cmd)
 176
 177                 return
 178
 179
 180 import random
 181 class PlanetLabSession:
 182         globalport = 22000 + int(random.random()*1000)
 183
 184         def __init__(self, node, nosetup, verbose):
 185                 self.verbose = verbose
 186                 self.node = node
 187                 self.port = None
 188                 self.nosetup = nosetup
 189                 self.command = None
 190                 self.setup_host()
 191
 192         def get_connection(self, config):
 193                 return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
 194
 195         def setup_host(self):
 196                 self.port = PlanetLabSession.globalport
 197                 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
 198
 199                 args = {}
 200                 args['port'] = self.port
 201                 args['user'] = 'root'
 202                 args['hostname'] = self.node
 203                 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
 204                 ssh_port = 22
 205
 206                 if self.nosetup:
 207                         print "Skipping setup"
 208                         return
 209
 210                 # COPY Rpyc files to host
 211                 cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
 212                 if self.verbose: print cmd
 213                 # TODO: Add timeout
 214                 timeout = 120
 215                 localos = moncommands.CMD()
 216
 217                 ret = localos.system(cmd, timeout)
 218                 print ret
 219                 if ret != 0:
 220                         print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
 221                         #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
 222                         k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
 223                         ret = localos.system(cmd, timeout)
 224                         print ret
 225                         if ret != 0:
 226                                 print "\tFAILED TWICE"
 227                                 #sys.exit(1)
 228                                 raise Exception("Failed twice trying to login with updated ssh host key")
 229
 230                 t1 = time.time()
 231                 # KILL any already running servers.
 232                 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
 233                 (ov,ev) = ssh.run_noexcept2("""<<\EOF
 234             rm -f out.log
 235             echo "kill server" >> out.log
 236             ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
 237             echo "export" >> out.log
 238             export PYTHONPATH=$HOME  ;
 239             echo "start server" >> out.log
 240             python Rpyc/Servers/forking_server.py &> server.log &
 241             echo "done" >> out.log
 242 EOF""")
 243                 #cmd = """ssh %(user)s@%(hostname)s """ + \
 244                 #        """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
 245                 #cmd = cmd % args
 246                 #if self.verbose: print cmd
 247                 ## TODO: Add timeout
 248                 #print localos.system(cmd,timeout)
 249
 250                 ## START a new rpyc server.
 251                 #cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
 252                 #        """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """
 253                 #cmd = cmd % args
 254                 #if self.verbose: print cmd
 255                 #print localos.system(cmd,timeout)
 256                 print ssh.ret
 257
 258                 # TODO: Add timeout
 259                 # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
 260                 # and the following options seems to work well.
 261                 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
 262                           """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
 263                           """-o ConnectTimeout=120 """ + \
 264                           """-n -N -L %(port)s:localhost:18812 """ + \
 265                           """%(user)s@%(hostname)s"""
 266                 cmd = cmd % args
 267                 if self.verbose: print cmd
 268                 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
 269                 # TODO: the read() here may block indefinitely.  Need a better
 270                 # approach therefore, that includes a timeout.
 271                 #ret = self.command.stdout.read(5)
 272                 ret = moncommands.read_t(self.command.stdout, 5)
 273
 274                 t2 = time.time()
 275                 if 'READY' in ret:
 276                         # NOTE: There is still a slight race for machines that are slow...
 277                         self.timeout = 2*(t2-t1)
 278                         print "Sleeping for %s sec" % self.timeout
 279                         time.sleep(self.timeout)
 280                         return
 281
 282                 if self.command.returncode is not None:
 283                         print "Failed to establish tunnel!"
 284                         raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
 285
 286                 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
 287
 288         def __del__(self):
 289                 if self.command:
 290                         if self.verbose: print "Killing SSH session %s" % self.port
 291                         self.command.kill()
 292
 293
 294 def steps_to_list(steps):
 295         ret_list = []
 296         for (id,label) in steps:
 297                 ret_list.append(label)
 298         return ret_list
 299
 300 def index_to_id(steps,index):
 301         if index < len(steps):
 302                 return steps[index][0]
 303         else:
 304                 return "done"
 305
 306 def reboot(hostname, config=None, forced_action=None):
 307
 308         # NOTE: Nothing works if the bootcd is REALLY old.
 309         #       So, this is the first step.
 310         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
 311         if fbnode['category'] == "OLDBOOTCD":
 312                 print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
 313                 args = {}
 314                 args['hostname_list'] = "    %s" % hostname
 315
 316                 m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
 317                                                         mailtxt.newbootcd_one[1] % args, True, db='bootcd_persistmessages')
 318
 319                 loginbase = plc.siteId(hostname)
 320                 emails = plc.getTechEmails(loginbase)
 321                 m.send(emails)
 322
 323                 print "\tDisabling %s due to out-of-date BOOTCD" % hostname
 324                 api.UpdateNode(hostname, {'boot_state' : 'disable'})
 325                 return True
 326
 327         node = hostname
 328         print "Creating session for %s" % node
 329         # update known_hosts file (in case the node has rebooted since last run)
 330         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
 331         try:
 332                 k = SSHKnownHosts(); k.update(node); k.write(); del k
 333         except:
 334                 from monitor.common import email_exception
 335                 email_exception()
 336                 print traceback.print_exc()
 337                 return False
 338
 339         try:
 340                 if config == None:
 341                         session = PlanetLabSession(node, False, True)
 342                 else:
 343                         session = PlanetLabSession(node, config.nosetup, config.verbose)
 344         except Exception, e:
 345                 msg = "ERROR setting up session for %s" % hostname
 346                 print msg
 347                 print traceback.print_exc()
 348                 from monitor.common import email_exception
 349                 email_exception(msg)
 350                 print e
 351                 return False
 352
 353         try:
 354                 conn = session.get_connection(config)
 355         except EOFError:
 356                 # NOTE: sometimes the wait in setup_host() is not long enough.
 357                 # So, here we try to wait a little longer before giving up entirely.
 358                 try:
 359                         time.sleep(session.timeout*4)
 360                         conn = session.get_connection(config)
 361                 except:
 362                         print traceback.print_exc()
 363                         from monitor.common import email_exception
 364                         email_exception()
 365                         return False
 366
 367         if forced_action == "reboot":
 368                 conn.restart_node('rins')
 369                 return True
 370
 371         boot_state = conn.get_boot_state()
 372         if boot_state == "boot":
 373                 print "...Boot state of %s already completed : skipping..." % node
 374                 return True
 375         elif boot_state == "unknown":
 376                 print "...Unknown bootstate for %s : skipping..."% node
 377                 return False
 378         else:
 379                 pass
 380
 381         if conn.bootmanager_running():
 382                 print "...BootManager is currently running.  Skipping host %s" % node
 383                 return True
 384
 385         #if config != None:
 386         #       if config.force:
 387         #               conn.restart_bootmanager(config.force)
 388         #               return True
 389
 390         # Read persistent flags, tagged on one week intervals.
 391         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
 392
 393
 394         if config and not config.quiet: print "...downloading dmesg from %s" % node
 395         dmesg = conn.get_dmesg()
 396         child = fdpexpect.fdspawn(dmesg)
 397
 398         sequence = []
 399         while True:
 400                 steps = [
 401                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
 402                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
 403                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION  byte \w+ = \w+'),
 404
 405                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
 406
 407                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
 408                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
 409
 410                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
 411                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
 412
 413                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
 414                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
 415
 416                         ('floppytimeout','floppy0: floppy timeout called'),
 417                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
 418
 419                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
 420                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
 421
 422                         # floppy0: floppy timeout called
 423                         # end_request: I/O error, dev fd0, sector 0
 424
 425                         # Buffer I/O error on device dm-2, logical block 8888896
 426                         # ata1: status=0x51 { DriveReady SeekComplete Error }
 427                         # ata1: error=0x40 { UncorrectableError }
 428                         # SCSI error : <0 0 0 0> return code = 0x8000002
 429                         # sda: Current: sense key: Medium Error
 430                         #       Additional sense: Unrecovered read error - auto reallocate failed
 431
 432                         # SCSI error : <0 2 0 0> return code = 0x40001
 433                         # end_request: I/O error, dev sda, sector 572489600
 434                 ]
 435                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
 436                 sequence.append(id)
 437
 438                 if id == "done":
 439                         break
 440
 441         s = Set(sequence)
 442         if config and not config.quiet: print "\tSET: ", s
 443
 444         if len(s) > 1:
 445                 print "...Potential drive errors on %s" % node
 446                 if len(s) == 2 and 'floppyerror' in s:
 447                         print "...Should investigate.  Continuing with node."
 448                 else:
 449                         print "...Should investigate.  Skipping node."
 450                         # TODO: send message related to these errors.
 451                         args = {}
 452                         args['hostname'] = hostname
 453                         args['log'] = conn.get_dmesg().read()
 454
 455                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
 456                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 457
 458                         loginbase = plc.siteId(hostname)
 459                         emails = plc.getTechEmails(loginbase)
 460                         m.send(emails)
 461                         conn.set_nodestate('disable')
 462                         return False
 463
 464         print "...Downloading bm.log from %s" % node
 465         log = conn.get_bootmanager_log()
 466         child = fdpexpect.fdspawn(log)
 467
 468         try:
 469                 if config.collect: return True
 470         except:
 471                 pass
 472
 473         time.sleep(1)
 474
 475         if config and not config.quiet: print "...Scanning bm.log for errors"
 476         action_id = "dbg"
 477         sequence = []
 478         while True:
 479
 480                 steps = [
 481                         ('bminit'               , 'Initializing the BootManager.'),
 482                         ('cfg'                  , 'Reading node configuration file.'),
 483                         ('auth'                 , 'Authenticating node with PLC.'),
 484                         ('getplc'               , 'Retrieving details of node from PLC.'),
 485                         ('update'               , 'Updating node boot state at PLC.'),
 486                         ('hardware'             , 'Checking if hardware requirements met.'),
 487                         ('installinit'  , 'Install: Initializing.'),
 488                         ('installdisk'  , 'Install: partitioning disks.'),
 489                         ('installbootfs', 'Install: bootstrapfs tarball.'),
 490                         ('installcfg'   , 'Install: Writing configuration files.'),
 491                         ('installstop'  , 'Install: Shutting down installer.'),
 492                         ('update2'              , 'Updating node boot state at PLC.'),
 493                         ('installinit2' , 'Install: Initializing.'),
 494                         ('validate'             , 'Validating node installation.'),
 495                         ('rebuildinitrd', 'Rebuilding initrd'),
 496                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
 497                         ('update3'              , 'Updating node configuration.'),
 498                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
 499                         ('update4'              , 'Sending hardware configuration to PLC.'),
 500                         ('debug'                , 'Starting debug mode'),
 501                         ('bmexceptmount', 'BootManagerException during mount'),
 502                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
 503                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
 504                         ('exception'    , 'Exception'),
 505                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
 506                         ('protoerror'   , 'XML RPC protocol error'),
 507                         ('nodehostname' , 'Configured node hostname does not resolve'),
 508                         ('implementerror', 'Implementation Error'),
 509                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
 510                         ('baddisk'      , "IOError: [Errno 13] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
 511                         ('noinstall'    , 'notinstalled'),
 512                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
 513                         ('noblockdev'   , "No block devices detected."),
 514                         ('dnserror'     , 'Name or service not known'),
 515                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
 516                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
 517                         ('hardwarerequirefail' , 'Hardware requirements not met'),
 518                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
 519                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
 520                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
 521                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
 522                         ('modulefail'   , 'Unable to get list of system modules'),
 523                         ('writeerror'   , 'write error: No space left on device'),
 524                         ('nospace'      , "No space left on device"),
 525                         ('nonode'       , 'Failed to authenticate call: No such node'),
 526                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
 527                         ('bootcheckfail'     , 'BootCheckAuthentication'),
 528                         ('bootupdatefail'   , 'BootUpdateNode'),
 529                 ]
 530                 list = steps_to_list(steps)
 531                 index = child.expect( list + [ pexpect.EOF ])
 532                 id = index_to_id(steps,index)
 533                 sequence.append(id)
 534
 535                 if id == "exception":
 536                         if config and not config.quiet: print "...Found An Exception!!!"
 537                 elif index == len(list):
 538                         #print "Reached EOF"
 539                         break
 540
 541         s = "-".join(sequence)
 542         print "   FOUND SEQUENCE: ", s
 543
 544         # NOTE: We get or set the flag based on the current sequence identifier.
 545         #  By using the sequence identifier, we guarantee that there will be no
 546         #  frequent loops.  I'm guessing there is a better way to track loops,
 547         #  though.
 548         #if not config.force and pflags.getRecentFlag(s):
 549         #       pflags.setRecentFlag(s)
 550         #       pflags.save()
 551         #       print "... flag is set or it has already run recently. Skipping %s" % node
 552         #       return True
 553
 554         sequences = {}
 555
 556
 557         # restart_bootmanager_boot
 558         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
 559                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
 560                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-bootupdatefail-update-debug-done",
 561
 562                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-exception-protoerror-update-protoerror-debug-done",
 563
 564                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
 565                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-exception-chrootfail-update-debug-done",
 566                         "bminit-cfg-auth-getplc-update-debug-done",
 567                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
 568                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
 569                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
 570                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
 571                         "bminit-cfg-auth-getplc-exception-protoerror-update-debug-done",
 572                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
 573                         ]:
 574                 sequences.update({n : "restart_bootmanager_boot"})
 575
 576         #       conn.restart_bootmanager('rins')
 577         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
 578                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
 579                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
 580                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
 581                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
 582                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
 583                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
 584                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
 585                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
 586                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
 587                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
 588                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-exception-mkfsfail-update-debug-done",
 589                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
 590                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-exception-chrootfail-update-debug-done",
 591                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
 592                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-installstop-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-kernelcopyfail-exception-update-debug-done",
 593                         "bminit-cfg-auth-getplc-installinit-validate-exception-noinstall-update-debug-done",
 594                         # actual solution appears to involve removing the bad files, and
 595                         # continually trying to boot the node.
 596                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-disk-update4-update3-update3-implementerror-update-debug-done",
 597                         ]:
 598                 sequences.update({n : "restart_bootmanager_rins"})
 599
 600         # repair_node_keys
 601         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
 602
 603         #   conn.restart_node('rins')
 604         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
 605                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
 606                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
 607                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
 608                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
 609                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
 610                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
 611                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
 612                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
 613                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
 614                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
 615                         "bminit-cfg-auth-getplc-update-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
 616                         ]:
 617                 sequences.update({n : "restart_node_rins"})
 618
 619         #       restart_node_boot
 620         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
 621                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
 622                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
 623                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
 624                          "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-downloadfail-update-debug-done",
 625                          "bminit-cfg-auth-getplc-update-installinit-validate-implementerror-update-debug-done",
 626                          ]:
 627                 sequences.update({n: "restart_node_boot"})
 628
 629         # update_node_config_email
 630         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
 631                           "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
 632                           "bminit-cfg-auth-bootcheckfail-nonode-exception-update-bootupdatefail-nonode-debug-done",
 633                         ]:
 634                 sequences.update({n : "update_node_config_email"})
 635
 636         for n in [ "bminit-cfg-exception-nodehostname-update-debug-done",
 637                            "bminit-cfg-update-exception-nodehostname-update-debug-done",
 638                         ]:
 639                 sequences.update({n : "nodenetwork_email"})
 640
 641         # update_bootcd_email
 642         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
 643                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarerequirefail-update-debug-done",
 644                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
 645                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarerequirefail-update-debug-done",
 646                         "bminit-cfg-auth-getplc-hardware-exception-hardwarerequirefail-update-debug-done",
 647                         ]:
 648                 sequences.update({n : "update_bootcd_email"})
 649
 650         for n in [ "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nofilereference-update-debug-done",
 651                         ]:
 652                 sequences.update({n: "suspect_error_email"})
 653
 654         # update_hardware_email
 655         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
 656         sequences.update({"bminit-cfg-auth-getplc-hardware-disktoosmall-exception-hardwarerequirefail-update-debug-done" : "update_hardware_email"})
 657
 658         # broken_hardware_email
 659         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarerequirefail-update-debug-done" : "broken_hardware_email"})
 660
 661         # bad_dns_email
 662         for n in [
 663          "bminit-cfg-update-implementerror-bootupdatefail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
 664                 "bminit-cfg-auth-implementerror-bootcheckfail-dnserror-update-implementerror-bootupdatefail-dnserror-done",
 665                 ]:
 666                 sequences.update( { n : "bad_dns_email"})
 667
 668         flag_set = True
 669
 670
 671         if s not in sequences:
 672                 print "   HOST %s" % hostname
 673                 print "   UNKNOWN SEQUENCE: %s" % s
 674
 675                 args = {}
 676                 args['hostname'] = hostname
 677                 args['sequence'] = s
 678                 args['bmlog'] = conn.get_bootmanager_log().read()
 679                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
 680                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
 681                 m.reset()
 682                 m.send([config.cc_email])
 683
 684                 conn.restart_bootmanager('boot')
 685
 686                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
 687                 # This way, we can check it again after we've fixed it.
 688                 flag_set = False
 689
 690         else:
 691
 692                 if   sequences[s] == "restart_bootmanager_boot":
 693                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
 694                         conn.restart_bootmanager('boot')
 695                 elif sequences[s] == "restart_bootmanager_rins":
 696                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
 697                         conn.restart_bootmanager('rins')
 698                 elif sequences[s] == "restart_node_rins":
 699                         conn.restart_node('rins')
 700                 elif sequences[s] == "restart_node_boot":
 701                         conn.restart_node('boot')
 702                 elif sequences[s] == "repair_node_keys":
 703                         if conn.compare_and_repair_nodekeys():
 704                                 # the keys either are in sync or were forced in sync.
 705                                 # so try to reboot the node again.
 706                                 conn.restart_bootmanager('rins')
 707                                 pass
 708                         else:
 709                                 # there was some failure to synchronize the keys.
 710                                 print "...Unable to repair node keys on %s" % node
 711
 712                 elif sequences[s] == "suspect_error_email":
 713                         args = {}
 714                         args['hostname'] = hostname
 715                         args['sequence'] = s
 716                         args['bmlog'] = conn.get_bootmanager_log().read()
 717                         m = PersistMessage(hostname, "Suspicous error from BootManager on %s" % args,
 718                                                                                  mailtxt.unknownsequence[1] % args, False, db='suspect_persistmessages')
 719                         m.reset()
 720                         m.send([config.cc_email])
 721
 722                         conn.restart_bootmanager('boot')
 723
 724                 elif sequences[s] == "update_node_config_email":
 725                         print "...Sending message to UPDATE NODE CONFIG"
 726                         args = {}
 727                         args['hostname'] = hostname
 728                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args,
 729                                                                 True, db='nodeid_persistmessages')
 730                         loginbase = plc.siteId(hostname)
 731                         emails = plc.getTechEmails(loginbase)
 732                         m.send(emails)
 733                         conn.dump_plconf_file()
 734                         conn.set_nodestate('disable')
 735
 736                 elif sequences[s] == "nodenetwork_email":
 737                         print "...Sending message to LOOK AT NODE NETWORK"
 738                         args = {}
 739                         args['hostname'] = hostname
 740                         args['bmlog'] = conn.get_bootmanager_log().read()
 741                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args,
 742                                                                 True, db='nodenet_persistmessages')
 743                         loginbase = plc.siteId(hostname)
 744                         emails = plc.getTechEmails(loginbase)
 745                         m.send(emails)
 746                         conn.dump_plconf_file()
 747                         conn.set_nodestate('disable')
 748
 749                 elif sequences[s] == "update_bootcd_email":
 750                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
 751                         import getconf
 752                         args = {}
 753                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
 754                         args['hostname_list'] = "%s" % hostname
 755
 756                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
 757                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
 758
 759                         loginbase = plc.siteId(hostname)
 760                         emails = plc.getTechEmails(loginbase)
 761                         m.send(emails)
 762
 763                         print "\tDisabling %s due to out-of-date BOOTCD" % hostname
 764                         conn.set_nodestate('disable')
 765
 766                 elif sequences[s] == "broken_hardware_email":
 767                         # MAKE An ACTION record that this host has failed hardware.  May
 768                         # require either an exception "/minhw" or other manual intervention.
 769                         # Definitely need to send out some more EMAIL.
 770                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
 771                         # TODO: email notice of broken hardware
 772                         args = {}
 773                         args['hostname'] = hostname
 774                         args['log'] = conn.get_dmesg().read()
 775                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
 776                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
 777
 778                         loginbase = plc.siteId(hostname)
 779                         emails = plc.getTechEmails(loginbase)
 780                         m.send(emails)
 781                         conn.set_nodestate('disable')
 782
 783                 elif sequences[s] == "update_hardware_email":
 784                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
 785                         args = {}
 786                         args['hostname'] = hostname
 787                         args['bmlog'] = conn.get_bootmanager_log().read()
 788                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
 789                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
 790
 791                         loginbase = plc.siteId(hostname)
 792                         emails = plc.getTechEmails(loginbase)
 793                         m.send(emails)
 794                         conn.set_nodestate('disable')
 795
 796                 elif sequences[s] == "bad_dns_email":
 797                         print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
 798                         args = {}
 799                         try:
 800                                 node = api.GetNodes(hostname)[0]
 801                                 net = api.GetNodeNetworks(node['nodenetwork_ids'])[0]
 802                         except:
 803                                 from monitor.common import email_exception
 804                                 email_exception()
 805                                 print traceback.print_exc()
 806                                 # TODO: api error. skip email, b/c all info is not available,
 807                                 # flag_set will not be recorded.
 808                                 return False
 809                         nodenet_str = network_config_to_str(net)
 810
 811                         args['hostname'] = hostname
 812                         args['network_config'] = nodenet_str
 813                         args['nodenetwork_id'] = net['nodenetwork_id']
 814                         m = PersistMessage(hostname, mailtxt.baddns[0] % args,
 815                                                                                  mailtxt.baddns[1] % args, True, db='baddns_persistmessages')
 816
 817                         loginbase = plc.siteId(hostname)
 818                         emails = plc.getTechEmails(loginbase)
 819                         m.send(emails)
 820                         conn.set_nodestate('disable')
 821
 822         if flag_set:
 823                 pflags.setRecentFlag(s)
 824                 pflags.save()
 825
 826         return True
 827
 828
 829 # MAIN -------------------------------------------------------------------
 830
 831 def main():
 832         from monitor import parser as parsermodule
 833         parser = parsermodule.getParser()
 834
 835         parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
 836                                                 force=None, quiet=False)
 837         parser.add_option("", "--child", dest="child", action="store_true",
 838                                                 help="This is the child mode of this process.")
 839         parser.add_option("", "--force", dest="force", metavar="boot_state",
 840                                                 help="Force a boot state passed to BootManager.py.")
 841         parser.add_option("", "--quiet", dest="quiet", action="store_true",
 842                                                 help="Extra quiet output messages.")
 843         parser.add_option("", "--verbose", dest="verbose", action="store_true",
 844                                                 help="Extra debug output messages.")
 845         parser.add_option("", "--nonet", dest="nonet", action="store_true",
 846                                                 help="Do not setup the network, use existing log files to re-run a test pass.")
 847         parser.add_option("", "--collect", dest="collect", action="store_true",
 848                                                 help="No action, just collect dmesg, and bm.log")
 849         parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 850                                                 help="Do not perform the orginary setup phase.")
 851
 852         parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
 853         config = parsermodule.parse_args(parser)
 854
 855         if config.nodelist:
 856                 nodes = config.getListFromFile(config.nodelist)
 857         elif config.node:
 858                 nodes = [ config.node ]
 859         else:
 860                 parser.print_help()
 861                 sys.exit(1)
 862
 863         for node in nodes:
 864                 reboot(node, config)
 865
 866 if __name__ == "__main__":
 867         main()