a7a99d9811194d235f6bc8fb6948e03bad543bed
[monitor.git] / nodereboot.py
1 #!/usr/bin/python
2
3 # Attempt to reboot a node in debug state.
4
5
6 import plc
7 import auth
8 api = plc.PLC(auth.auth, auth.plc)
9
10 import sys
11 import os
12 import policy
13
14 from getsshkeys import SSHKnownHosts
15
16 import subprocess
17 import time
18 import soltesz
19 from sets import Set
20
21 import ssh.pxssh as pxssh
22 import ssh.fdpexpect as fdpexpect
23 import ssh.pexpect as pexpect
24 from unified_model import *
25 from emailTxt import mailtxt
26
27 import signal
class Sopen(subprocess.Popen):
	"""Popen variant whose kill() delivers a caller-chosen signal.

	The stock Popen of this era had no kill(); this sends SIGTERM by
	default but lets callers pick any signal number.
	"""

	def kill(self, signal = signal.SIGTERM):
		# NOTE: the parameter deliberately shadows the `signal` module;
		# the SIGTERM default was captured at definition time, so the
		# shadowing is harmless inside the body.
		target = self.pid
		os.kill(target, signal)
31
32 #from Rpyc import SocketConnection, Async
33 from Rpyc import SocketConnection, Async
34 from Rpyc.Utils import *
35
36
37 class NodeConnection:
38         def __init__(self, connection, node, config):
39                 self.node = node
40                 self.c = connection
41                 self.config = config
42
43         def get_boot_state(self):
44                 if self.c.modules.os.path.exists('/tmp/source'):
45                         return "dbg"
46                 elif self.c.modules.os.path.exists('/vservers'): 
47                         return "boot"
48                 else:
49                         return "unknown"
50
51         def get_dmesg(self):
52                 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
53                 download(self.c, "/var/log/dmesg.bm.log", "log/dmesg.%s.log" % self.node)
54                 log = open("log/dmesg.%s.log" % self.node, 'r')
55                 return log
56
57         def get_bootmanager_log(self):
58                 download(self.c, "/tmp/bm.log", "log/bm.%s.log.gz" % self.node)
59                 os.system("zcat log/bm.%s.log.gz > log/bm.%s.log" % (self.node, self.node))
60                 log = open("log/bm.%s.log" % self.node, 'r')
61                 return log
62
63         def dump_plconf_file(self):
64                 c = self.c
65                 c.modules.sys.path.append("/tmp/source/")
66                 c.modules.os.chdir('/tmp/source')
67
68                 log = c.modules.BootManager.log('/tmp/new.log')
69                 bm = c.modules.BootManager.BootManager(log,'boot')
70
71                 BootManagerException = c.modules.Exceptions.BootManagerException
72                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
73                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
74                 bm_continue = True
75
76                 InitializeBootManager.Run(bm.VARS, bm.LOG)
77                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
78                 except Exception, x:
79                         bm_continue = False
80                         print "   ERROR:", x
81                         print "   Possibly, unable to find valid configuration file"
82
83                 if bm_continue and self.config and not self.config.quiet:
84                         for key in bm.VARS.keys():
85                                 print key, " == ", bm.VARS[key]
86                 else:
87                         if self.config and not self.config.quiet: print "   Unable to read Node Configuration"
88                 
89
90         def compare_and_repair_nodekeys(self):
91                 c = self.c
92                 c.modules.sys.path.append("/tmp/source/")
93                 c.modules.os.chdir('/tmp/source')
94
95                 log = c.modules.BootManager.log('/tmp/new.log')
96                 bm = c.modules.BootManager.BootManager(log,'boot')
97
98                 BootManagerException = c.modules.Exceptions.BootManagerException
99                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
100                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
101                 bm_continue = True
102
103                 plcnode = api.GetNodes({'hostname': self.node}, None)[0]
104
105                 InitializeBootManager.Run(bm.VARS, bm.LOG)
106                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
107                 except Exception, x:
108                         bm_continue = False
109                         if not config.quiet: print "exception"
110                         if not config.quiet: print x
111                         print "   Possibly, unable to find valid configuration file"
112
113                 if bm_continue:
114                         print "   NODE: %s" % bm.VARS['NODE_KEY']
115                         print "   PLC : %s" % plcnode['key']
116
117                         if bm.VARS['NODE_KEY'] == plcnode['key']:
118                                 return True
119                         else:
120                                 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
121                                         print "   Successfully updated NODE_KEY with PLC"
122                                         return True
123                                 else:
124                                         return False
125                                 
126                         #for key in bm.VARS.keys():
127                         #       print key, " == ", bm.VARS[key]
128                 else:
129                         print "   Unable to retrieve NODE_KEY"
130
131         def bootmanager_running(self):
132                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
133                         return True
134                 else:
135                         return False
136
137         def restart_node(self, state='boot'):
138                 api.UpdateNode(self.node, {'boot_state' : state})
139
140                 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
141                 if not pflags.getRecentFlag('gentlekill'):
142                         print "   Killing all slice processes... : %s" %  self.node
143                         cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
144                         self.c.modules.os.system(cmd_slicekill)
145                         cmd = """ shutdown -r +1 & """
146                         print "   Restarting %s : %s" % ( self.node, cmd)
147                         self.c.modules.os.system(cmd)
148
149                         pflags.setRecentFlag('gentlekill')
150                         pflags.save()
151                 else:
152                         print "   Restarting with sysrq 'sub' %s" % self.node
153                         cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
154                         self.c.modules.os.system(cmd)
155
156                 return
157
158         def restart_bootmanager(self, forceState):
159
160                 self.c.modules.os.chdir('/tmp/source')
161                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
162                         print "   BootManager is already running: try again soon..."
163                 else:
164                         print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
165                         cmd = "( touch /tmp/BM_RUNNING ;  " + \
166                               "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
167                                   "  rm -f /tmp/BM_RUNNING " + \
168                                   ") &" 
169                         cmd = cmd % forceState
170                         self.c.modules.os.system(cmd)
171
172                 return 
173
174
175 import random
class PlanetLabSession:
	"""Establishes an Rpyc server on a node plus a local ssh tunnel to it.

	setup_host() copies the Rpyc tree to the node, (re)starts a forking
	Rpyc server there, and opens an ssh port-forward from a local port to
	the node's Rpyc port (18812).  get_connection() then hands back a
	NodeConnection over that tunnel.
	"""

	# Base local port for tunnels; randomized so concurrent monitor runs
	# are unlikely to collide, then incremented per session.
	globalport = 22000 + int(random.random()*1000)

	def __init__(self, node, nosetup, verbose):
		# node    : hostname to reach
		# nosetup : skip the rsync/server/tunnel setup (assume it exists)
		# verbose : echo each shell command before running it
		self.verbose = verbose
		self.node = node
		self.port = None
		self.nosetup = nosetup
		self.command = None   # Sopen handle on the ssh tunnel process
		self.setup_host()

	def get_connection(self, config):
		"""Return a NodeConnection speaking Rpyc through the local tunnel."""
		return NodeConnection(SocketConnection("localhost", self.port), self.node, config)
	
	def setup_host(self):
		"""Copy Rpyc to the node, restart its Rpyc server, and open the tunnel.

		Raises Exception when ssh login fails twice (even after refreshing
		the known_hosts entry) or when the tunnel cannot be established.
		"""
		# Claim a unique local port for this session's tunnel.
		self.port = PlanetLabSession.globalport
		PlanetLabSession.globalport = PlanetLabSession.globalport + 1

		args = {}
		args['port'] = self.port
		args['user'] = 'root'
		args['hostname'] = self.node
		args['monitordir'] = "/home/soltesz/monitor"

		if self.nosetup:
			print "Skipping setup"
			return 

		# COPY Rpyc files to host
		cmd = "rsync -qv -az -e ssh %(monitordir)s/Rpyc-2.45-2.3/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
		if self.verbose: print cmd
		# TODO: Add timeout
		timeout = 120
		localos = soltesz.CMD()

		ret = localos.system(cmd, timeout)
		print ret
		if ret != 0:
			# Likely a stale/unknown host key (node reinstalled): refresh
			# the known_hosts entry and retry once.
			print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
			#print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
			k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
			ret = localos.system(cmd, timeout)
			print ret
			if ret != 0:
				print "\tFAILED TWICE"
				#sys.exit(1)
				raise Exception("Failed twice trying to login with updated ssh host key")

		t1 = time.time()
		# KILL any already running servers.
		cmd = """ssh %(user)s@%(hostname)s """ + \
			 """'ps ax | grep Rpyc | grep -v grep | awk "{print \$1}" | xargs kill 2> /dev/null' """
		cmd = cmd % args
		if self.verbose: print cmd
		# TODO: Add timeout
		print localos.system(cmd,timeout)

		# START a new rpyc server.
		cmd = """ssh -n %(user)s@%(hostname)s "export PYTHONPATH=\$HOME; """ + \
			 """python Rpyc/Servers/forking_server.py &> server.log < /dev/null &" """ 
		cmd = cmd % args
		if self.verbose: print cmd
		print localos.system(cmd,timeout)

		# TODO: Add timeout
		# This was tricky to make synchronous.  The combination of ssh-clients-4.7p1, 
		# and the following options seems to work well.
		# LocalCommand prints READY on our side once the forward is up;
		# ExitOnForwardFailure makes a failed forward kill the process
		# instead of hanging silently.
		cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
			  """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
			  """-o ConnectTimeout=120 """ + \
			  """-n -N -L %(port)s:localhost:18812 """ + \
			  """%(user)s@%(hostname)s"""
		cmd = cmd % args
		if self.verbose: print cmd
		self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
		# TODO: the read() here may block indefinitely.  Need a better
		# approach therefore, that includes a timeout.
		ret = self.command.stdout.read(5)

		t2 = time.time()
		if 'READY' in ret:
			# NOTE: There is still a slight race for machines that are slow...
			# Give the remote Rpyc server twice the observed setup time
			# before declaring the session usable.
			self.timeout = 2*(t2-t1)
			print "Sleeping for %s sec" % self.timeout
			time.sleep(self.timeout)
			return

		if self.command.returncode is not None:
			print "Failed to establish tunnel!"
			raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))

		raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")

	def __del__(self):
		# Tear down the ssh tunnel process when the session is collected.
		if self.command:
			if self.verbose: print "Killing SSH session %s" % self.port
			self.command.kill()
273
274
def steps_to_list(steps):
	"""Return just the labels from a list of (id, label) step pairs,
	preserving order (used to feed pexpect's pattern list)."""
	return [label for (_step_id, label) in steps]
280
def index_to_id(steps, index):
	"""Map a pexpect match index back to its step id; an index past the
	end of steps means the EOF sentinel matched, reported as "done"."""
	return steps[index][0] if index < len(steps) else "done"
286
287 def reboot(hostname, config=None, forced_action=None):
288
289         node = hostname
290         print "Creating session for %s" % node
291         # update known_hosts file (in case the node has rebooted since last run)
292         if config and not config.quiet: print "...updating known_hosts ssh-rsa key for %s" % node
293         try:
294                 k = SSHKnownHosts(); k.update(node); k.write(); del k
295         except:
296                 import traceback; print traceback.print_exc()
297                 return False
298
299         try:
300                 if config == None:
301                         session = PlanetLabSession(node, False, True)
302                 else:
303                         session = PlanetLabSession(node, config.nosetup, config.verbose)
304         except Exception, e:
305                 print "ERROR setting up session for %s" % hostname
306                 import traceback; print traceback.print_exc()
307                 print e
308                 return False
309
310         try:
311                 conn = session.get_connection(config)
312         except EOFError:
313                 # NOTE: sometimes the wait in setup_host() is not long enough.  
314                 # So, here we try to wait a little longer before giving up entirely.
315                 try:
316                         time.sleep(session.timeout*4)
317                         conn = session.get_connection(config)
318                 except:
319                         import traceback; print traceback.print_exc()
320                         return False
321                         
322
323         if forced_action == "reboot":
324                 conn.restart_node('rins')
325                 return True
326
327         boot_state = conn.get_boot_state()
328         if boot_state == "boot":
329                 print "...Boot state of %s already completed : skipping..." % node
330                 return True
331         elif boot_state == "unknown":
332                 print "...Unknown bootstate for %s : skipping..."% node
333                 return False
334         else:
335                 pass
336
337         if conn.bootmanager_running():
338                 print "...BootManager is currently running.  Skipping host %s" % node
339                 return True
340
341         #if config != None:
342         #       if config.force:
343         #               conn.restart_bootmanager(config.force)
344         #               return True
345
346         # Read persistent flags, tagged on one week intervals.
347         pflags = PersistFlags(hostname, 3*60*60*24, db='debug_persistflags')
348                 
349
350         if config and not config.quiet: print "...downloading dmesg from %s" % node
351         dmesg = conn.get_dmesg()
352         child = fdpexpect.fdspawn(dmesg)
353
354         sequence = []
355         while True:
356                 steps = [
357                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
358                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
359                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
360                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
361                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
362                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
363                         ('floppytimeout','floppy0: floppy timeout called'),
364                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
365
366                         # floppy0: floppy timeout called
367                         # end_request: I/O error, dev fd0, sector 0
368
369                         #Buffer I/O error on device dm-2, logical block 8888896
370                         #ata1: status=0x51 { DriveReady SeekComplete Error }
371                         #ata1: error=0x40 { UncorrectableError }
372                         #SCSI error : <0 0 0 0> return code = 0x8000002
373                         #sda: Current: sense key: Medium Error
374                         #       Additional sense: Unrecovered read error - auto reallocate failed
375
376                         #SCSI error : <0 2 0 0> return code = 0x40001
377                         #end_request: I/O error, dev sda, sector 572489600
378                 ]
379                 id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
380                 sequence.append(id)
381
382                 if id == "done":
383                         break
384
385         s = Set(sequence)
386         if config and not config.quiet: print "\tSET: ", s
387
388         if len(s) > 1:
389                 print "...Potential drive errors on %s" % node
390                 if len(s) == 2 and 'floppyerror' in s:
391                         print "...Should investigate.  Continuing with node."
392                 else:
393                         print "...Should investigate.  Skipping node."
394                         # TODO: send message related to these errors.
395                         args = {}
396                         args['hostname'] = hostname
397                         args['log'] = conn.get_dmesg().read()
398
399                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
400                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
401
402                         loginbase = plc.siteId(hostname)
403                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
404                         return False
405
406         print "...Downloading bm.log from %s" % node
407         log = conn.get_bootmanager_log()
408         child = fdpexpect.fdspawn(log)
409
410         time.sleep(1)
411
412         if config and not config.quiet: print "...Scanning bm.log for errors"
413         action_id = "dbg"
414         sequence = []
415         while True:
416
417                 steps = [
418                         ('bminit'               , 'Initializing the BootManager.'),
419                         ('cfg'                  , 'Reading node configuration file.'),
420                         ('auth'                 , 'Authenticating node with PLC.'),
421                         ('getplc'               , 'Retrieving details of node from PLC.'),
422                         ('update'               , 'Updating node boot state at PLC.'),
423                         ('hardware'             , 'Checking if hardware requirements met.'),
424                         ('installinit'  , 'Install: Initializing.'),
425                         ('installdisk'  , 'Install: partitioning disks.'),
426                         ('installbootfs', 'Install: bootstrapfs tarball.'),
427                         ('installcfg'   , 'Install: Writing configuration files.'),
428                         ('installstop'  , 'Install: Shutting down installer.'),
429                         ('update2'              , 'Updating node boot state at PLC.'),
430                         ('installinit2' , 'Install: Initializing.'),
431                         ('validate'             , 'Validating node installation.'),
432                         ('rebuildinitrd', 'Rebuilding initrd'),
433                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
434                         ('update3'              , 'Updating node configuration.'),
435                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
436                         ('update4'              , 'Sending hardware configuration to PLC.'),
437                         ('debug'                , 'Starting debug mode'),
438                         ('bmexceptmount', 'BootManagerException during mount'),
439                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
440                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
441                         ('exception'    , 'Exception'),
442                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
443                         ('protoerror'   , 'XML RPC protocol error'),
444                         ('nodehostname' , 'Configured node hostname does not resolve'),
445                         ('implementerror', 'Implementation Error'),
446                         ('readonlyfs'   , '[Errno 30] Read-only file system'),
447                         ('noinstall'    , 'notinstalled'),
448                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
449                         ('noblockdev'   , "No block devices detected."),
450                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
451                         ('hardwarefail' , 'Hardware requirements not met'),
452                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
453                         ('modulefail'   , 'Unable to get list of system modules'),
454                         ('writeerror'   , 'write error: No space left on device'),
455                         ('nospace'      , "No space left on device"),
456                         ('nonode'       , 'Failed to authenticate call: No such node'),
457                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
458                         ('bootcheckfail'     , 'BootCheckAuthentication'),
459                         ('bootupdatefail'   , 'BootUpdateNode'),
460                 ]
461                 list = steps_to_list(steps)
462                 index = child.expect( list + [ pexpect.EOF ])
463                 id = index_to_id(steps,index)
464                 sequence.append(id)
465
466                 if id == "exception":
467                         if config and not config.quiet: print "...Found An Exception!!!"
468                 elif index == len(list):
469                         #print "Reached EOF"
470                         break
471                 
472         s = "-".join(sequence)
473         print "   FOUND SEQUENCE: ", s
474
475         # NOTE: We get or set the flag based on the current sequence identifier.
476         #  By using the sequence identifier, we guarantee that there will be no
477         #  frequent loops.  I'm guessing there is a better way to track loops,
478         #  though.
479         if not config.force and ( pflags.getFlag(s) or pflags.isRecent() ):
480                 pflags.resetFlag(s)
481                 pflags.setRecent()
482                 pflags.save() 
483                 print "... flag is set or it has already run recently. Skipping %s" % node
484                 return True
485
486         sequences = {}
487
488
489         # restart_bootmanager_boot
490         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-done",
491                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-protoerror-debug-done",
492                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-exception-protoerror-update-debug-done",
493                         "bminit-cfg-auth-getplc-update-debug-done",
494                         "bminit-cfg-auth-getplc-exception-protoerror-update-protoerror-debug-done",
495                         "bminit-cfg-auth-protoerror-exception-update-protoerror-debug-done",
496                         "bminit-cfg-auth-protoerror-exception-update-bootupdatefail-authfail-debug-done",
497                         "bminit-cfg-auth-protoerror-exception-update-debug-done",
498                         "bminit-cfg-auth-getplc-implementerror-update-debug-done",
499                         ]:
500                 sequences.update({n : "restart_bootmanager_boot"})
501
502         #       conn.restart_bootmanager('rins')
503         for n in [ "bminit-cfg-auth-getplc-installinit-validate-exception-modulefail-update-debug-done",
504                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-modulefail-update-debug-done",
505                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
506                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptmount-exception-noinstall-update-debug-done",
507                         "bminit-cfg-auth-getplc-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
508                         "bminit-cfg-auth-getplc-update-installinit-validate-exception-noinstall-update-debug-done",
509                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-bziperror-exception-update-debug-done",
510                         "bminit-cfg-auth-getplc-update-installinit-validate-bmexceptvgscan-exception-noinstall-update-debug-done",
511                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-exception-update-debug-done",
512                         ]:
513                 sequences.update({n : "restart_bootmanager_rins"})
514
515         # repair_node_keys
516         sequences.update({"bminit-cfg-auth-bootcheckfail-authfail-exception-update-bootupdatefail-authfail-debug-done": "repair_node_keys"})
517
518         #   conn.restart_node('rins')
519         for n in ["bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-exception-chrootfail-update-debug-done",
520                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-exception-chrootfail-update-debug-done",
521                         "bminit-cfg-auth-getplc-hardware-installinit-installdisk-installbootfs-installcfg-exception-chrootfail-update-debug-done",
522                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-writeerror-exception-chrootfail-update-debug-done",
523                         "bminit-cfg-auth-getplc-update-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
524                         "bminit-cfg-auth-getplc-hardware-installinit-exception-bmexceptrmfail-update-debug-done",
525                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-disk-update4-update3-implementerror-bootupdatefail-update-debug-done",
526                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-readonlyfs-update-debug-done",
527                         "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-nospace-exception-update-debug-done",
528                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
529                         "bminit-cfg-auth-getplc-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-update-debug-done",
530                         ]:
531                 sequences.update({n : "restart_node_rins"})
532
533         #       restart_node_boot
534         for n in ["bminit-cfg-auth-getplc-implementerror-bootupdatefail-update-debug-done",
535                          "bminit-cfg-auth-implementerror-bootcheckfail-update-debug-done",
536                          "bminit-cfg-auth-implementerror-bootcheckfail-update-implementerror-bootupdatefail-done",
537                          "bminit-cfg-auth-getplc-update-installinit-validate-rebuildinitrd-netcfg-update3-implementerror-nospace-update-debug-done",
538                          ]:
539                 sequences.update({n: "restart_node_boot"})
540
541         # update_node_config_email
542         for n in ["bminit-cfg-exception-nocfg-update-bootupdatefail-nonode-debug-done",
543                         "bminit-cfg-exception-update-bootupdatefail-nonode-debug-done",
544                         "bminit-cfg-exception-nodehostname-update-debug-done",
545                         ]:
546                 sequences.update({n : "update_node_config_email"})
547
548         # update_bootcd_email
549         for n in ["bminit-cfg-auth-getplc-update-hardware-exception-noblockdev-hardwarefail-update-debug-done",
550                         "bminit-cfg-auth-getplc-hardware-exception-noblockdev-hardwarefail-update-debug-done",
551                         "bminit-cfg-auth-getplc-update-hardware-noblockdev-exception-hardwarefail-update-debug-done",
552                         "bminit-cfg-auth-getplc-hardware-noblockdev-exception-hardwarefail-update-debug-done",
553                         "bminit-cfg-auth-getplc-hardware-exception-hardwarefail-update-debug-done",
554                         ]:
555                 sequences.update({n : "update_bootcd_email"})
556
557         # update_hardware_email
558         sequences.update({"bminit-cfg-auth-getplc-hardware-exception-disktoosmall-hardwarefail-update-debug-done" : "update_hardware_email"})
559
560         # broken_hardware_email
561         sequences.update({"bminit-cfg-auth-getplc-update-hardware-exception-hardwarefail-update-debug-done" : "broken_hardware_email"})
562
563         
564         if s not in sequences:
565                 print "   HOST %s" % hostname
566                 print "   UNKNOWN SEQUENCE: %s" % s
567
568                 args = {}
569                 args['hostname'] = hostname
570                 args['sequence'] = s
571                 args['bmlog'] = conn.get_bootmanager_log().read()
572                 m = PersistMessage(hostname, mailtxt.unknownsequence[0] % args,
573                                                                          mailtxt.unknownsequence[1] % args, False, db='unknown_persistmessages')
574                 m.reset()
575                 m.send(['monitor-list@lists.planet-lab.org'])
576
577                 conn.restart_bootmanager('boot')
578
579         else:
580
581                 if   sequences[s] == "restart_bootmanager_boot":
582                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
583                         conn.restart_bootmanager('boot')
584                 elif sequences[s] == "restart_bootmanager_rins":
585                         if config and not config.quiet: print "...Restarting BootManager.py on %s "% node
586                         conn.restart_bootmanager('rins')
587                 elif sequences[s] == "restart_node_rins":
588                         conn.restart_node('rins')
589                 elif sequences[s] == "restart_node_boot":
590                         conn.restart_node('boot')
591                 elif sequences[s] == "repair_node_keys":
592                         if conn.compare_and_repair_nodekeys():
593                                 # the keys either are in sync or were forced in sync.
594                                 # so try to reboot the node again.
595                                 conn.restart_bootmanager('boot')
596                         else:
597                                 # there was some failure to synchronize the keys.
598                                 print "...Unable to repair node keys on %s" % node
599                 elif sequences[s] == "update_node_config_email":
600                         print "...Sending message to UPDATE NODE CONFIG"
601                         args = {}
602                         args['hostname'] = hostname
603                         m = PersistMessage(hostname,  mailtxt.plnode_cfg[0] % args,  mailtxt.plnode_cfg[1] % args, 
604                                                                 True, db='nodeid_persistmessages')
605                         loginbase = plc.siteId(hostname)
606                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
607                         conn.dump_plconf_file()
608
609                 elif sequences[s] == "update_bootcd_email":
610                         print "...NOTIFY OWNER TO UPDATE BOOTCD!!!"
611                         import getconf
612                         args = {}
613                         args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
614                         args['hostname_list'] = "%s" % hostname
615
616                         m = PersistMessage(hostname, "Please Update Boot Image for %s" % hostname,
617                                                                 mailtxt.newalphacd_one[1] % args, True, db='bootcd_persistmessages')
618
619                         loginbase = plc.siteId(hostname)
620                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
621
622                 elif sequences[s] == "broken_hardware_email":
623                         # MAKE An ACTION record that this host has failed hardware.  May
624                         # require either an exception "/minhw" or other manual intervention.
625                         # Definitely need to send out some more EMAIL.
626                         print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
627                         # TODO: email notice of broken hardware
628                         args = {}
629                         args['hostname'] = hostname
630                         args['log'] = conn.get_dmesg().read()
631                         m = PersistMessage(hostname, mailtxt.baddisk[0] % args,
632                                                                                  mailtxt.baddisk[1] % args, True, db='hardware_persistmessages')
633
634                         loginbase = plc.siteId(hostname)
635                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
636
637                 elif sequences[s] == "update_hardware_email":
638                         print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
639                         args = {}
640                         args['hostname'] = hostname
641                         args['bmlog'] = conn.get_bootmanager_log().read()
642                         m = PersistMessage(hostname, mailtxt.minimalhardware[0] % args,
643                                                                                  mailtxt.minimalhardware[1] % args, True, db='minhardware_persistmessages')
644
645                         loginbase = plc.siteId(hostname)
646                         m.send([policy.PIEMAIL % loginbase, policy.TECHEMAIL % loginbase])
647
648         pflags.setFlag(s)
649         pflags.save() 
650
651         return True
652         
653
654 # MAIN -------------------------------------------------------------------
655
def main():
	"""Command-line entry point.

	Parses options, resolves the target node set (either a single
	--node or a --nodelist file), and calls reboot() on each node.
	Exits with status 1 (after printing usage) when no node source
	is given.
	"""
	from config import config
	from optparse import OptionParser
	parser = OptionParser()
	# Defaults must cover every dest below so config() sees all keys.
	parser.set_defaults(node=None, nodelist=None, child=False, nosetup=False, verbose=False, force=None, quiet=False)
	parser.add_option("", "--child", dest="child", action="store_true", 
						help="This is the child mode of this process.")
	parser.add_option("", "--force", dest="force", metavar="boot_state",
						help="Force a boot state passed to BootManager.py.")
	parser.add_option("", "--quiet", dest="quiet", action="store_true", 
						help="Extra quiet output messages.")
	parser.add_option("", "--verbose", dest="verbose", action="store_true", 
						help="Extra debug output messages.")
	parser.add_option("", "--nosetup", dest="nosetup", action="store_true", 
						help="Do not perform the ordinary setup phase.")
	parser.add_option("", "--node", dest="node", metavar="nodename.edu", 
						help="A single node name to try to bring out of debug mode.")
	parser.add_option("", "--nodelist", dest="nodelist", metavar="nodelist.txt", 
						help="A list of nodes to bring out of debug mode.")
	# Wrap optparse in the project's config object; parse_args() fills it in.
	config = config(parser)
	config.parse_args()

	# --nodelist takes precedence over --node when both are supplied.
	if config.nodelist:
		nodes = config.getListFromFile(config.nodelist)
	elif config.node:
		nodes = [ config.node ]
	else:
		# No node source given: show usage and fail.
		parser.print_help()
		sys.exit(1)

	for node in nodes:
		reboot(node, config)
688
689 if __name__ == "__main__":
690         main()