monitor/bootman.py

   1 #!/usr/bin/python
   2
   3 # Attempt to reboot a node in debug state.
   4
   5 import os
   6 import sys
   7 import time
   8 import random
   9 import signal
  10 import traceback
  11 import subprocess
  12 from sets import Set
  13
  14 from monitor.getsshkeys import SSHKnownHosts
  15 from monitor.Rpyc import SocketConnection, Async
  16 from monitor.Rpyc.Utils import *
  17
  18 from monitor import getconf
  19 from monitor import config
  20 from monitor import const
  21 from monitor.model import *
  22 from monitor.common import email_exception, found_within
  23 from monitor.database.info.model import *
  24 from monitor.database.info.interface import *
  25 from monitor.wrapper import plc
  26 from monitor.wrapper import plccache
  27 from monitor.wrapper.emailTxt import mailtxt
  28 from monitor.nodeconfig import network_config_to_str
  29
  30 from pcucontrol.util import command as moncommands
  31 from pcucontrol.util.command import Sopen
  32 from pcucontrol.transports.ssh import pxssh as pxssh
  33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
  34 from pcucontrol.transports.ssh import pexpect as pexpect
  35
# PLC API handle obtained from plc.getAuthAPI(); created once when this module
# is imported and used by the functions and classes below.
api = plc.getAuthAPI()
# NOTE(review): not referenced anywhere in the visible part of this module --
# confirm whether anything still uses it before removing.
fb = None
  38
  39 def bootmanager_log_name(hostname):
  40         t_stamp = time.strftime("%Y-%m-%d-%H:%M")
  41         base_filename = "%s-bm.%s.log" % (t_stamp, hostname)
  42         short_target_filename = os.path.join('history', base_filename)
  43         return short_target_filename
  44
  45 def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
  46         try:
  47                 node = FindbadNodeRecord.get_latest_by(hostname=hostname)
  48                 loginbase = PlcSite.query.get(node.plc_node_stats['site_id']).plc_site_stats['login_base']
  49                 err = ""
  50         except:
  51                 loginbase = "unknown"
  52                 err = traceback.format_exc()
  53
  54         act = ActionRecord(loginbase=loginbase,
  55                                                 hostname=hostname,
  56                                                 action='log',
  57                                                 action_type=logtype,
  58                                                 log_path=short_log_path,
  59                                                 error_string=err)
  60         return
  61
  62
  63 class ExceptionDoubleSSHError(Exception): pass
  64
  65 class NodeConnection:
  66         def __init__(self, connection, node, config):
  67                 print "init nodeconnection"
  68                 self.node = node
  69                 self.c = connection
  70                 self.config = config
  71
  72         def get_boot_state(self):
  73                 print "get_boot_state(self)"
  74                 try:
  75                         if self.c.modules.os.path.exists('/tmp/source'):
  76                                 return "debug"
  77                         elif self.c.modules.os.path.exists('/vservers'):
  78                                 return "boot"
  79                         else:
  80                                 return "unknown"
  81                 except EOFError:
  82                         traceback.print_exc()
  83                         print self.c.modules.sys.path
  84                 except:
  85                         email_exception()
  86                         traceback.print_exc()
  87
  88                 return "unknown"
  89
  90         def get_dmesg(self):
  91                 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
  92                 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
  93                 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
  94                 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
  95                 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
  96                 return log
  97
  98         def get_bootmanager_log(self):
  99                 bm_name = bootmanager_log_name(self.node)
 100                 download(self.c, "/tmp/bm.log", "%s/%s" % (config.MONITOR_BOOTMANAGER_LOG, bm_name))
 101                 #email_exception(self.node, "collected BM log for %s" % self.node)
 102                 bootmanager_log_action(self.node, bm_name, "collected_bm.log")
 103                 os.system("cp %s/%s %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, bm_name, config.MONITOR_BOOTMANAGER_LOG, self.node))
 104                 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
 105                 return log
 106
 107         def dump_plconf_file(self):
 108                 c = self.c
 109                 self.c.modules.sys.path.append("/tmp/source/")
 110                 self.c.modules.os.chdir('/tmp/source')
 111
 112                 log = c.modules.BootManager.log('/tmp/new.log')
 113                 bm = c.modules.BootManager.BootManager(log,'boot')
 114
 115                 BootManagerException = c.modules.Exceptions.BootManagerException
 116                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
 117                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
 118                 bm_continue = True
 119
 120                 InitializeBootManager.Run(bm.VARS, bm.LOG)
 121                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
 122                 except Exception, x:
 123                         bm_continue = False
 124                         print "   ERROR:", x
 125                         print "   Possibly, unable to find valid configuration file"
 126
 127                 if bm_continue:
 128                         for key in bm.VARS.keys():
 129                                 print key, " == ", bm.VARS[key]
 130                 else:
 131                         print "   Unable to read Node Configuration"
 132
 133         def fsck_repair_node(self):
 134                 c = self.c
 135                 self.c.modules.sys.path.append("/tmp/source/")
 136                 self.c.modules.os.chdir('/tmp/source')
 137                 # TODO: restart
 138                 # TODO: set boot state to node's actually boot state.
 139                 # could be 'boot' or 'safeboot'
 140                 self.c.modules.os.chdir('/tmp/source')
 141                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
 142                         print "Running MANUAL FSCK already... try again soon."
 143                 else:
 144                         print "Running MANUAL fsck on %s" % self.node
 145                         cmd = "( touch /tmp/BM_RUNNING ;  " + \
 146                                   "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
 147                                   "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
 148                                   "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
 149                                   "  rm -f /tmp/BM_RUNNING " + \
 150                                   ") &"
 151                         cmd = cmd % self.get_nodestate()
 152                         self.c.modules.os.system(cmd)
 153                 #self.restart_bootmanager('boot')
 154                 pass
 155
 156         def compare_and_repair_nodekeys(self):
 157                 c = self.c
 158                 self.c.modules.sys.path.append("/tmp/source/")
 159                 self.c.modules.os.chdir('/tmp/source')
 160
 161                 log = c.modules.BootManager.log('/tmp/new.log')
 162                 bm = c.modules.BootManager.BootManager(log,'boot')
 163
 164                 BootManagerException = c.modules.Exceptions.BootManagerException
 165                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
 166                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
 167                 bm_continue = True
 168
 169                 plcnode = plccache.GetNodeByName(self.node)
 170
 171                 InitializeBootManager.Run(bm.VARS, bm.LOG)
 172                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
 173                 except Exception, x:
 174                         bm_continue = False
 175                         print "exception"
 176                         print x
 177                         print "   Possibly, unable to find valid configuration file"
 178
 179                 if bm_continue:
 180                         print "   NODE: %s" % bm.VARS['NODE_KEY']
 181                         print "   PLC : %s" % plcnode['key']
 182
 183                         if bm.VARS['NODE_KEY'] == plcnode['key']:
 184                                 return True
 185                         else:
 186                                 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
 187                                         print "   Successfully updated NODE_KEY with PLC"
 188                                         return True
 189                                 else:
 190                                         return False
 191
 192                         #for key in bm.VARS.keys():
 193                         #       print key, " == ", bm.VARS[key]
 194                 else:
 195                         print "   Unable to retrieve NODE_KEY"
 196
 197         def bootmanager_running(self):
 198                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
 199                         return True
 200                 else:
 201                         return False
 202
 203         def set_nodestate(self, state='boot'):
 204                 return api.UpdateNode(self.node, {'boot_state' : state})
 205
 206         def get_nodestate(self):
 207                 try:
 208                         return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
 209                 except:
 210                         traceback.print_exc()
 211                         # NOTE: use last cached value from plc
 212                         fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
 213                         return fbnode['plc_node_stats']['boot_state']
 214
 215
 216         def restart_node(self, state='boot'):
 217                 api.UpdateNode(self.node, {'boot_state' : state})
 218
 219                 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
 220                 if not pflags.getRecentFlag('gentlekill'):
 221                         print "   Killing all slice processes... : %s" %  self.node
 222                         cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
 223                         self.c.modules.os.system(cmd_slicekill)
 224                         cmd = """ shutdown -r +1 & """
 225                         print "   Restarting %s : %s" % ( self.node, cmd)
 226                         self.c.modules.os.system(cmd)
 227
 228                         pflags.setRecentFlag('gentlekill')
 229                         pflags.save()
 230                 else:
 231                         print "   Restarting with sysrq 'sub' %s" % self.node
 232                         cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
 233                         self.c.modules.os.system(cmd)
 234
 235                 return
 236
 237         def restart_bootmanager(self, forceState):
 238
 239                 self.c.modules.os.chdir('/tmp/source')
 240                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
 241                         print "   BootManager is already running: try again soon..."
 242                 else:
 243                         print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
 244                         cmd = "( touch /tmp/BM_RUNNING ;  " + \
 245                               "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
 246                                   "  rm -f /tmp/BM_RUNNING " + \
 247                                   ") &"
 248                         cmd = cmd % forceState
 249                         self.c.modules.os.system(cmd)
 250
 251                 return
 252
 253
 254 class PlanetLabSession:
 255         globalport = 22000 + int(random.random()*1000)
 256
 257         def __init__(self, node, nosetup, verbose):
 258                 self.verbose = verbose
 259                 self.node = node
 260                 self.port = None
 261                 self.nosetup = nosetup
 262                 self.command = None
 263                 self.setup_host()
 264
 265         def get_connection(self, config):
 266                 try:
 267                         print "SocketConnection(localhost, %s" % self.port
 268                         sc = SocketConnection("localhost", self.port)
 269                         print "NodeConnection(%s, %s)" % (sc, self.node)
 270                         conn = NodeConnection(sc, self.node, config)
 271                 except:
 272                         # NOTE: try twice since this can sometimes fail the first time. If
 273                         #               it fails again, let it go.
 274                         conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
 275                 return conn
 276
 277         def setup_host(self):
 278                 self.port = PlanetLabSession.globalport
 279                 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
 280
 281                 args = {}
 282                 args['port'] = self.port
 283                 args['user'] = 'root'
 284                 args['hostname'] = self.node
 285                 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
 286                 ssh_port = 22
 287
 288                 if self.nosetup:
 289                         print "Skipping setup"
 290                         return
 291
 292                 # COPY Rpyc files to host
 293                 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
 294                 cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
 295                 if self.verbose: print cmd
 296                 print cmd
 297                 # TODO: Add timeout
 298                 timeout = 120
 299                 localos = moncommands.CMD()
 300
 301                 ret = localos.system(cmd, timeout)
 302                 print ret
 303                 if ret != 0:
 304                         print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
 305                         #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
 306                         k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
 307                         print "trying: ", cmd
 308                         print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
 309                         ret = localos.system(cmd, timeout)
 310                         print ret
 311                         if ret != 0:
 312                                 print "\tFAILED TWICE"
 313                                 #email_exception("%s rsync failed twice" % self.node)
 314                                 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 315
 316                 t1 = time.time()
 317                 # KILL any already running servers.
 318                 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
 319                 (ov,ev) = ssh.run_noexcept2("""<<\EOF
 320             rm -f out.log
 321             echo "kill server" >> out.log
 322                         netstat -ap | grep python | grep 18812 | awk '{print $7}' | awk -F / '{print $1}' | xargs kill
 323             ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
 324             echo "export" >> out.log
 325             export PYTHONPATH=$HOME  ;
 326             echo "start server" >> out.log
 327             python Rpyc/Servers/forking_server.py &> server.log &
 328             echo "done" >> out.log
 329 EOF""")
 330                 print "setup rpyc server over ssh"
 331                 print ssh.ret
 332
 333                 # TODO: Add timeout
 334                 # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
 335                 # and the following options seems to work well.
 336                 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
 337                           """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
 338                           """-o ConnectTimeout=120 """ + \
 339                           """-n -N -L %(port)s:localhost:18812 """ + \
 340                           """%(user)s@%(hostname)s"""
 341                 cmd = cmd % args
 342                 if self.verbose: print cmd
 343                 print cmd
 344                 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
 345                 # TODO: the read() here may block indefinitely.  Need a better
 346                 # approach therefore, that includes a timeout.
 347                 #ret = self.command.stdout.read(5)
 348                 ret = moncommands.read_t(self.command.stdout, 5)
 349
 350                 t2 = time.time()
 351                 if 'READY' in ret:
 352                         # NOTE: There is still a slight race for machines that are slow...
 353                         self.timeout = 2*(t2-t1)
 354                         print "Sleeping for %s sec" % self.timeout
 355                         time.sleep(self.timeout)
 356                         return
 357
 358                 if self.command.returncode is not None:
 359                         print "Failed to establish tunnel!"
 360                         raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
 361
 362                 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
 363
 364         def __del__(self):
 365                 if self.command:
 366                         if self.verbose: print "Killing SSH session %s" % self.port
 367                         print "Killing SSH session %s" % self.port
 368                         self.command.kill()
 369
 370
 371 def steps_to_list(steps, index=1):
 372         return map(lambda x: x[index], steps)
 373
 374 def index_to_id(steps,index):
 375         if index < len(steps):
 376                 return steps[index][0]
 377         else:
 378                 return "done"
 379
 380 class DebugInterface:
 381         def __init__(self, hostname):
 382                 self.hostname = hostname
 383                 self.session = None
 384
 385         def getConnection(self):
 386                 print "Creating session for %s" % self.hostname
 387                 # update known_hosts file (in case the node has rebooted since last run)
 388                 try:
 389                         k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
 390                 except:
 391                         email_exception()
 392                         print traceback.print_exc()
 393                         return False
 394
 395                 msg = "ERROR setting up session for %s" % self.hostname
 396                 try:
 397                         if config == None:
 398                                 self.session = PlanetLabSession(self.hostname, False, True)
 399                         else:
 400                                 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
 401                 except ExceptionDoubleSSHError, e:
 402                         print msg
 403                         return False
 404                 except Exception, e:
 405                         traceback.print_exc()
 406                         email_exception(msg)
 407                         return False
 408
 409                 print "Getting connection: 1st try"
 410                 try:
 411                         conn = self.session.get_connection(config)
 412                 except EOFError:
 413                         # NOTE: sometimes the wait in setup_host() is not long enough.
 414                         # So, here we try to wait a little longer before giving up entirely.
 415                         try:
 416                                 print "Getting connection: 2nd try"
 417                                 time.sleep(self.session.timeout*5)
 418                                 conn = self.session.get_connection(config)
 419                         except EOFError:
 420                                 # failed twice... no need to report this really, it's just in a
 421                                 # weird state...
 422                                 print "Getting connection: failed"
 423                                 email_exception(self.hostname, "failed twice to get connection")
 424                                 return False
 425                         except:
 426                                 traceback.print_exc()
 427                                 email_exception(self.hostname)
 428                                 return False
 429                 print "Getting connection: ok"
 430                 #print "trying to use conn before returning it."
 431                 #print conn.c.modules.sys.path
 432                 #print conn.c.modules.os.path.exists('/tmp/source')
 433                 #time.sleep(1)
 434
 435                 #print "conn: %s" % conn
 436                 return conn
 437
 438         def getSequences(self):
 439
 440                 # NOTE: The DB is now the autoritative record for all BM sequences.
 441                 #               An admin can introduce new patterns and actions without touching code.
 442                 sequences = {}
 443
 444                 bms = BootmanSequenceRecord.query.all()
 445                 for s in bms:
 446                         sequences[s.sequence] = s.action
 447
 448                 return sequences
 449
 450         def getDiskSteps(self):
 451                 steps = [
 452                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
 453                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
 454                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
 455
 456                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
 457
 458                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
 459                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
 460
 461                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
 462                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
 463
 464                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
 465                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
 466
 467                         ('floppytimeout','floppy0: floppy timeout called'),
 468                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
 469
 470                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
 471                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
 472
 473                         # floppy0: floppy timeout called
 474                         # end_request: I/O error, dev fd0, sector 0
 475
 476                         # Buffer I/O error on device dm-2, logical block 8888896
 477                         # ata1: status=0x51 { DriveReady SeekComplete Error }
 478                         # ata1: error=0x40 { UncorrectableError }
 479                         # SCSI error : <0 0 0 0> return code = 0x8000002
 480                         # sda: Current: sense key: Medium Error
 481                         #       Additional sense: Unrecovered read error - auto reallocate failed
 482
 483                         # SCSI error : <0 2 0 0> return code = 0x40001
 484                         # end_request: I/O error, dev sda, sector 572489600
 485                 ]
 486                 return steps
 487
 488         def getDiskSequence(self, steps, child):
 489                 sequence = []
 490                 while True:
 491                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
 492                         sequence.append(id)
 493
 494                         if id == "done":
 495                                 break
 496                 return sequence
 497
 498         def getBootManagerStepPatterns(self):
 499                 steps = [
 500                         ('bminit'               , 'Initializing the BootManager.'),
 501                         ('cfg'                  , 'Reading node configuration file.'),
 502                         ('auth'                 , 'Authenticating node with PLC.'),
 503                         ('getplc'               , 'Retrieving details of node from PLC.'),
 504                         ('update'               , 'Updating node boot state at PLC.'),
 505                         ('hardware'             , 'Checking if hardware requirements met.'),
 506                         ('installinit'  , 'Install: Initializing.'),
 507                         ('installdisk'  , 'Install: partitioning disks.'),
 508                         ('installbootfs', 'Install: bootstrapfs tarball.'),
 509                         ('installcfg'   , 'Install: Writing configuration files.'),
 510                         ('installstop'  , 'Install: Shutting down installer.'),
 511                         ('update2'              , 'Updating node boot state at PLC.'),
 512                         ('installinit2' , 'Install: Initializing.'),
 513                         ('validate'             , 'Validating node installation.'),
 514                         ('rebuildinitrd', 'Rebuilding initrd'),
 515                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
 516                         ('update3'              , 'Updating node configuration.'),
 517                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
 518                         ('update4'              , 'Sending hardware configuration to PLC.'),
 519                         ('debug'                , 'Starting debug mode'),
 520                         ('bmexceptmount', 'BootManagerException during mount'),
 521                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
 522                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
 523                         ('exception'    , 'Exception'),
 524                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
 525                         ('protoerror2'  , '500 Internal Server Error'),
 526                         ('protoerror'   , 'XML RPC protocol error'),
 527                         ('nodehostname' , 'Configured node hostname does not resolve'),
 528                         ('implementerror', 'Implementation Error'),
 529                         ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
 530                         ('fsckfail'             , 'Running e2fsck -v -p /dev/planetlab/root failed'),
 531                         ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
 532                         ('readonlyfs'   , '\[Errno 30\] Read-only file system'),
 533                         ('baddisk'      , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
 534                         ('noinstall'    , 'notinstalled'),
 535                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
 536                         ('noblockdev'   , "No block devices detected."),
 537                         ('missingkernel', "missingkernel"),
 538                         ('dnserror'     , 'Name or service not known'),
 539                         ('noparseconfig', "Found configuration file plnode.txt on floppy, but was unable to parse it"),
 540                         ('noconfig'             , "Unable to find and read a node configuration file"),
 541                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
 542                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
 543                         ('hardwarerequirefail' , 'Hardware requirements not met'),
 544                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
 545                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
 546                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
 547                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
 548                         ('modulefail'   , 'Unable to get list of system modules'),
 549                         ('writeerror'   , 'write error: No space left on device'),
 550                         ('nospace'      , "No space left on device"),
 551                         ('nonode'       , 'Failed to authenticate call: No such node'),
 552                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
 553                         ('authfail2'    , 'Authentication Failed'),
 554                         ('bootcheckfail'  , 'BootCheckAuthentication'),
 555                         ('bootupdatefail' , 'BootUpdateNode'),
 556                 ]
 557                 return steps
 558
 559         def getBootManagerSequenceFromLog(self, steps, child):
 560                 sequence = []
 561                 while True:
 562
 563                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
 564                         id = index_to_id(steps,index)
 565                         sequence.append(id)
 566
 567                         if id == "exception":
 568                                 print "...Found An Exception!!!"
 569                         elif id == "done": #index == len(steps_to_list(steps)):
 570                                 #print "Reached EOF"
 571                                 break
 572
 573                 return sequence
 574
 575 def restore(sitehist, hostname, config=None, forced_action=None):
 576         ret = restore_basic(sitehist, hostname, config, forced_action)
 577         session.flush()
 578         return ret
 579
 580 def restore_basic(sitehist, hostname, config=None, forced_action=None):
 581
 582         # NOTE: Nothing works if the bootcd is REALLY old.
 583         #       So, this is the first step.
 584
 585         bootman_action = "unknown"
 586
 587         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
 588         recent_actions = sitehist.getRecentActions(hostname=hostname)
 589
 590         if fbnode['observed_category'] == "OLDBOOTCD":
 591                 print "\t...Notify owner to update BootImage!!!"
 592
 593                 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
 594                         sitehist.sendMessage('newbootcd_notice', hostname=hostname)
 595
 596                         print "\tDisabling %s due to out-of-date BootImage" % hostname
 597                         api.UpdateNode(hostname, {'boot_state' : 'disabled'})
 598
 599                 # NOTE: nothing else is possible.
 600                 return "disabled"
 601
 602         debugnode = DebugInterface(hostname)
 603         conn = debugnode.getConnection()
 604         if type(conn) == type(False): return "connect_failed"
 605
 606         boot_state = conn.get_boot_state()
 607         if boot_state != "debug":
 608                 print "... %s in %s state: skipping..." % (hostname , boot_state)
 609                 return "skipped" #boot_state == "boot"
 610
 611         if conn.bootmanager_running():
 612                 print "...BootManager is currently running.  Skipping host %s" %hostname
 613                 return "skipped" # True
 614
 615         # Read persistent flags, tagged on one week intervals.
 616
 617         if config and not config.quiet: print "...downloading dmesg from %s" %hostname
 618         dmesg = conn.get_dmesg()
 619         child = fdpexpect.fdspawn(dmesg)
 620
 621         steps = debugnode.getDiskSteps()
 622         sequence = debugnode.getDiskSequence(steps, child)
 623
 624         s = Set(sequence)
 625         if config and not config.quiet: print "\tSET: ", s
 626
 627         if len(s) > 1:
 628                 print "...Potential drive errors on %s" % hostname
 629                 if len(s) == 2 and 'floppyerror' in s:
 630                         print "...Should investigate.  Continuing with node."
 631                 else:
 632                         print "...Should investigate.  Skipping node."
 633                         # TODO: send message related to these errors.
 634
 635                         if not found_within(recent_actions, 'baddisk_notice', 7):
 636                                 print "baddisk_notice not found recently"
 637
 638                                 log=conn.get_dmesg().read()
 639                                 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
 640                                 return "skipping_baddisk"
 641                         else:
 642                                 # NOTE: "" does not add a new action record
 643                                 return ""
 644
 645
 646         print "...Downloading bm.log from %s" %hostname
 647         log = conn.get_bootmanager_log()
 648         bm_log_data = log.read() # get data
 649         log.seek(0)     # reset fd pointer for fdspawn
 650         child = fdpexpect.fdspawn(log)
 651
 652         if hasattr(config, 'collect') and config.collect: return "collect"
 653
 654         if config and not config.quiet: print "...Scanning bm.log for errors"
 655
 656         time.sleep(1)
 657
 658         steps = debugnode.getBootManagerStepPatterns()
 659         sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
 660
 661         s = "-".join(sequence)
 662         print "   FOUND SEQUENCE: ", s
 663
 664         # NOTE: We get or set the flag based on the current sequence identifier.
 665         #  By using the sequence identifier, we guarantee that there will be no
 666         #  frequent loops.  I'm guessing there is a better way to track loops,
 667         #  though.
 668
 669         sequences = debugnode.getSequences()
 670         flag_set = True
 671
 672         if s not in sequences:
 673                 print "   HOST %s" % hostname
 674                 print "   UNKNOWN SEQUENCE: %s" % s
 675
 676                 args = {}
 677                 args['hostname'] = hostname
 678                 args['sequence'] = s
 679                 args['bmlog'] = bm_log_data
 680                 args['viart'] = False
 681                 args['saveact'] = True
 682                 args['ccemail'] = True
 683
 684                 sitehist.sendMessage('unknownsequence_notice', **args)
 685
 686                 conn.restart_bootmanager('boot')
 687
 688                 bootman_action = "restart_bootmanager"
 689
 690                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
 691                 # This way, we can check it again after we've fixed it.
 692                 flag_set = False
 693
 694         else:
 695                 bootman_action = sequences[s]
 696
 697                 if   sequences[s] == "restart_bootmanager_boot":
 698                         print "...Restarting BootManager.py on %s "%hostname
 699                         conn.restart_bootmanager('boot')
 700                 elif sequences[s] == "restart_bootmanager_rins":
 701                         print "...Restarting BootManager.py on %s "%hostname
 702                         conn.restart_bootmanager('reinstall')
 703                 elif sequences[s] == "restart_node_rins":
 704                         conn.restart_node('reinstall')
 705                 elif sequences[s] == "restart_node_boot":
 706                         conn.restart_node('boot')
 707                 elif sequences[s] == "fsck_repair":
 708                         conn.fsck_repair_node()
 709                 elif sequences[s] == "repair_node_keys":
 710                         if conn.compare_and_repair_nodekeys():
 711                                 # the keys either are in sync or were forced in sync.
 712                                 # so try to start BM again.
 713                                 conn.restart_bootmanager(conn.get_nodestate())
 714                         else:
 715                                 # there was some failure to synchronize the keys.
 716                                 print "...Unable to repair node keys on %s" %hostname
 717                                 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
 718                                         args = {}
 719                                         args['hostname'] = hostname
 720                                         sitehist.sendMessage('nodeconfig_notice', **args)
 721                                         conn.dump_plconf_file()
 722                                 else:
 723                                         # NOTE: do not add a new action record
 724                                         return ""
 725
 726                 elif sequences[s] == "unknownsequence_notice":
 727                         args = {}
 728                         args['hostname'] = hostname
 729                         args['sequence'] = s
 730                         args['bmlog'] = bm_log_data
 731                         args['viart'] = False
 732                         args['saveact'] = True
 733                         args['ccemail'] = True
 734
 735                         sitehist.sendMessage('unknownsequence_notice', **args)
 736                         conn.restart_bootmanager('boot')
 737
 738                 elif sequences[s] == "nodeconfig_notice":
 739
 740                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
 741                                 args = {}
 742                                 args['hostname'] = hostname
 743                                 sitehist.sendMessage('nodeconfig_notice', **args)
 744                                 conn.dump_plconf_file()
 745                         else:
 746                                 # NOTE: do not add a new action record
 747                                 return ""
 748
 749                 elif sequences[s] == "nodenetwork_email":
 750
 751                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
 752                                 args = {}
 753                                 args['hostname'] = hostname
 754                                 args['bmlog'] = bm_log_data
 755                                 sitehist.sendMessage('nodeconfig_notice', **args)
 756                                 conn.dump_plconf_file()
 757                         else:
 758                                 # NOTE: do not add a new action record
 759                                 return ""
 760
 761                 elif sequences[s] == "noblockdevice_notice":
 762
 763                         if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
 764                                 args = {}
 765                                 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
 766                                 args['hostname'] = hostname
 767
 768                                 sitehist.sendMessage('noblockdevice_notice', **args)
 769                         else:
 770                                 # NOTE: do not add a new action record
 771                                 return ""
 772
 773                 elif sequences[s] == "baddisk_notice":
 774                         # MAKE An ACTION record that this host has failed hardware.  May
 775                         # require either an exception "/minhw" or other manual intervention.
 776                         # Definitely need to send out some more EMAIL.
 777                         # TODO: email notice of broken hardware
 778                         if not found_within(recent_actions, 'baddisk_notice', 7):
 779                                 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
 780                                 args = {}
 781                                 args['hostname'] = hostname
 782                                 args['log'] = conn.get_dmesg().read()
 783
 784                                 sitehist.sendMessage('baddisk_notice', **args)
 785                                 #conn.set_nodestate('disabled')
 786                         else:
 787                                 # NOTE: do not add a new action record
 788                                 return ""
 789
 790                 elif sequences[s] == "minimalhardware_notice":
 791                         if not found_within(recent_actions, 'minimalhardware_notice', 7):
 792                                 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
 793                                 args = {}
 794                                 args['hostname'] = hostname
 795                                 args['bmlog'] = bm_log_data
 796                                 sitehist.sendMessage('minimalhardware_notice', **args)
 797                         else:
 798                                 # NOTE: do not add a new action record
 799                                 return ""
 800
 801                 elif sequences[s] == "baddns_notice":
 802                         if not found_within(recent_actions, 'baddns_notice', 1):
 803                                 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
 804                                 args = {}
 805                                 try:
 806                                         node = plccache.GetNodeByName(hostname)
 807                                         net = api.GetInterfaces(node['interface_ids'])[0]
 808                                 except:
 809                                         email_exception()
 810                                         print traceback.print_exc()
 811                                         # TODO: api error. skip email, b/c all info is not available,
 812                                         # flag_set will not be recorded.
 813                                         return "exception"
 814                                 nodenet_str = network_config_to_str(net)
 815
 816                                 args['hostname'] = hostname
 817                                 args['network_config'] = nodenet_str
 818                                 args['interface_id'] = net['interface_id']
 819
 820                                 sitehist.sendMessage('baddns_notice', **args)
 821                         else:
 822                                 # NOTE: do not add a new action record
 823                                 return ""
 824
 825         return bootman_action
 826
 827
 828 # MAIN -------------------------------------------------------------------
 829
 830 def main():
 831         from monitor import parser as parsermodule
 832         parser = parsermodule.getParser()
 833
 834         parser.set_defaults(child=False, collect=False, nosetup=False, verbose=False,
 835                                                 force=None, quiet=False)
 836         parser.add_option("", "--child", dest="child", action="store_true",
 837                                                 help="This is the child mode of this process.")
 838         parser.add_option("", "--force", dest="force", metavar="boot_state",
 839                                                 help="Force a boot state passed to BootManager.py.")
 840         parser.add_option("", "--quiet", dest="quiet", action="store_true",
 841                                                 help="Extra quiet output messages.")
 842         parser.add_option("", "--verbose", dest="verbose", action="store_true",
 843                                                 help="Extra debug output messages.")
 844         parser.add_option("", "--nonet", dest="nonet", action="store_true",
 845                                                 help="Do not setup the network, use existing log files to re-run a test pass.")
 846         parser.add_option("", "--collect", dest="collect", action="store_true",
 847                                                 help="No action, just collect dmesg, and bm.log")
 848         parser.add_option("", "--nosetup", dest="nosetup", action="store_true",
 849                                                 help="Do not perform the orginary setup phase.")
 850
 851         parser = parsermodule.getParser(['nodesets', 'defaults'], parser)
 852         config = parsermodule.parse_args(parser)
 853
 854         if config.nodelist:
 855                 nodes = config.getListFromFile(config.nodelist)
 856         elif config.node:
 857                 nodes = [ config.node ]
 858         else:
 859                 parser.print_help()
 860                 sys.exit(1)
 861
 862         for node in nodes:
 863                 # get sitehist
 864                 lb = plccache.plcdb_hn2lb[node]
 865                 sitehist = SiteInterface.get_or_make(loginbase=lb)
 866                 #reboot(node, config)
 867                 restore(sitehist, node, config=None, forced_action=None)
 868
 869 if __name__ == "__main__":
 870         main()