monitor/bootman.py

   1 #!/usr/bin/python
   2
   3 # Attempt to reboot a node in debug state.
   4
   5 import os
   6 import sys
   7 import time
   8 import random
   9 import signal
  10 import traceback
  11 import subprocess
  12 from sets import Set
  13
  14 from monitor.util.sshknownhosts import SSHKnownHosts
  15 from monitor.Rpyc import SocketConnection, Async
  16 from monitor.Rpyc.Utils import *
  17
  18 from monitor import getconf
  19 from monitor import config
  20 from monitor import const
  21 from monitor.model import *
  22 from monitor.common import email_exception, found_within
  23 from monitor.database.info.model import *
  24 from monitor.database.info.interface import *
  25 from monitor.wrapper import plc
  26 from monitor.wrapper import plccache
  27 from monitor.wrapper.emailTxt import mailtxt
  28 from monitor.nodeconfig import network_config_to_str
  29
  30 from pcucontrol.util import command as moncommands
  31 from pcucontrol.util.command import Sopen
  32 from pcucontrol.transports.ssh import pxssh as pxssh
  33 from pcucontrol.transports.ssh import fdpexpect as fdpexpect
  34 from pcucontrol.transports.ssh import pexpect as pexpect
  35
     # PLC API handle shared by the whole module; the code below calls
     # api.UpdateNode() and api.GetNodes() through it.
  36 api = plc.getAuthAPI()
     # NOTE(review): fb is not used in the part of the file visible here --
     # presumably assigned or read elsewhere; confirm before removing.
  37 fb = None
  38
  39 def bootmanager_log_name(hostname):
  40         t_stamp = time.strftime("%Y-%m-%d-%H:%M")
  41         base_filename = "%s-bm.%s.log" % (t_stamp, hostname)
  42         short_target_filename = os.path.join('history', base_filename)
  43         return short_target_filename
  44
  45 def bootmanager_log_action(hostname, short_log_path, logtype="bm.log"):
  46         try:
  47                 node = FindbadNodeRecord.get_latest_by(hostname=hostname)
  48                 loginbase = PlcSite.query.get(node.plc_node_stats['site_id']).plc_site_stats['login_base']
  49                 err = ""
  50         except:
  51                 loginbase = "unknown"
  52                 err = traceback.format_exc()
  53
  54         act = ActionRecord(loginbase=loginbase,
  55                                                 hostname=hostname,
  56                                                 action='log',
  57                                                 action_type=logtype,
  58                                                 log_path=short_log_path,
  59                                                 error_string=err)
  60         return
  61
  62
  63 class ExceptionDoubleSSHError(Exception): pass
  64
  65 class NodeConnection:
  66         def __init__(self, connection, node, config):
  67                 print "init nodeconnection"
  68                 self.node = node
  69                 self.c = connection
  70                 self.config = config
  71
  72         def get_boot_state(self):
  73                 print "get_boot_state(self)"
  74                 try:
  75                         if self.c.modules.os.path.exists('/tmp/source'):
  76                                 return "debug"
  77                         elif self.c.modules.os.path.exists('/vservers'):
  78                                 return "boot"
  79                         else:
  80                                 return "unknown"
  81                 except EOFError:
  82                         traceback.print_exc()
  83                         print self.c.modules.sys.path
  84                 except:
  85                         email_exception()
  86                         traceback.print_exc()
  87
  88                 return "unknown"
  89
  90         def get_dmesg(self):
  91                 t_stamp = time.strftime("%Y-%m-%d-%H:%M")
  92                 self.c.modules.os.system("dmesg > /var/log/dmesg.bm.log")
  93                 download(self.c, "/var/log/dmesg.bm.log", "%s/history/%s-dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node))
  94                 os.system("cp %s/history/%s-dmesg.%s.log %s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, t_stamp, self.node, config.MONITOR_BOOTMANAGER_LOG, self.node))
  95                 log = open("%s/dmesg.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
  96                 return log
  97
  98         def get_bootmanager_log(self):
  99                 bm_name = bootmanager_log_name(self.node)
 100                 download(self.c, "/tmp/bm.log", "%s/%s" % (config.MONITOR_BOOTMANAGER_LOG, bm_name))
 101                 #email_exception(self.node, "collected BM log for %s" % self.node)
 102                 bootmanager_log_action(self.node, bm_name, "collected_bm.log")
 103                 os.system("cp %s/%s %s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, bm_name, config.MONITOR_BOOTMANAGER_LOG, self.node))
 104                 log = open("%s/bm.%s.log" % (config.MONITOR_BOOTMANAGER_LOG, self.node), 'r')
 105                 return log
 106
 107         def dump_plconf_file(self):
 108                 c = self.c
 109                 self.c.modules.sys.path.append("/tmp/source/")
 110                 self.c.modules.os.chdir('/tmp/source')
 111
 112                 log = c.modules.BootManager.log('/tmp/new.log')
 113                 bm = c.modules.BootManager.BootManager(log,'boot')
 114
 115                 BootManagerException = c.modules.Exceptions.BootManagerException
 116                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
 117                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
 118                 bm_continue = True
 119
 120                 InitializeBootManager.Run(bm.VARS, bm.LOG)
 121                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
 122                 except Exception, x:
 123                         bm_continue = False
 124                         print "   ERROR:", x
 125                         print "   Possibly, unable to find valid configuration file"
 126
 127                 if bm_continue:
 128                         for key in bm.VARS.keys():
 129                                 print key, " == ", bm.VARS[key]
 130                 else:
 131                         print "   Unable to read Node Configuration"
 132
 133         def fprobe_repair_node(self):
 134                 # When fprobe data gets too much, it fills the root partition and
 135                 # fails to boot
 136                 c = self.c
 137                 self.c.modules.sys.path.append("/tmp/source/")
 138
 139                 # NOTE: assume that the root fs is already mounted...
 140                 if self.c.modules.os.path.exists('/tmp/mnt/sysimg/var/local/fprobe'):
 141                         print "CLEARING FPROBE DATA on %s" % self.node
 142                         self.c.modules.os.chdir('/tmp/mnt/sysimg/var/local/fprobe')
 143                         cmd = """ ls -lrt . | awk '{if (i<NR/2 && $9) {print "rm "$9;i=i+1;}}' | sh """
 144                         self.c.modules.os.system(cmd)
 145                 else:
 146                         print "COULD NOT CLEAR FPROBE DATA on %s" % self.node
 147
 148         def fsck_repair_node(self):
 149                 c = self.c
 150                 self.c.modules.sys.path.append("/tmp/source/")
 151                 self.c.modules.os.chdir('/tmp/source')
 152                 # TODO: restart
 153                 # TODO: set boot state to node's actually boot state.
 154                 # could be 'boot' or 'safeboot'
 155                 self.c.modules.os.chdir('/tmp/source')
 156                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
 157                         print "Running MANUAL FSCK already... try again soon."
 158                 else:
 159                         print "Running MANUAL fsck on %s" % self.node
 160                         cmd = "( touch /tmp/BM_RUNNING ;  " + \
 161                                   "  fsck -v -f -y /dev/planetlab/root &> out.fsck ; " + \
 162                                   "  fsck -v -f -y /dev/planetlab/vservers >> out.fsck 2>&1 ; " + \
 163                                   "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
 164                                   "  rm -f /tmp/BM_RUNNING " + \
 165                                   ") &"
 166                         cmd = cmd % self.get_nodestate()
 167                         self.c.modules.os.system(cmd)
 168                 #self.restart_bootmanager('boot')
 169                 pass
 170
 171         def compare_and_repair_nodekeys(self):
 172                 c = self.c
 173                 self.c.modules.sys.path.append("/tmp/source/")
 174                 self.c.modules.os.chdir('/tmp/source')
 175
 176                 log = c.modules.BootManager.log('/tmp/new.log')
 177                 bm = c.modules.BootManager.BootManager(log,'boot')
 178
 179                 BootManagerException = c.modules.Exceptions.BootManagerException
 180                 InitializeBootManager = c.modules.BootManager.InitializeBootManager
 181                 ReadNodeConfiguration = c.modules.BootManager.ReadNodeConfiguration
 182                 bm_continue = True
 183
 184                 plcnode = plccache.GetNodeByName(self.node)
 185
 186                 InitializeBootManager.Run(bm.VARS, bm.LOG)
 187                 try: ReadNodeConfiguration.Run(bm.VARS, bm.LOG)
 188                 except Exception, x:
 189                         bm_continue = False
 190                         print "exception"
 191                         print x
 192                         print "   Possibly, unable to find valid configuration file"
 193
 194                 if bm_continue:
 195                         print "   NODE: %s" % bm.VARS['NODE_KEY']
 196                         print "   PLC : %s" % plcnode['key']
 197
 198                         if bm.VARS['NODE_KEY'] == plcnode['key']:
 199                                 return True
 200                         else:
 201                                 if api.UpdateNode(self.node, {'key': bm.VARS['NODE_KEY']}):
 202                                         print "   Successfully updated NODE_KEY with PLC"
 203                                         return True
 204                                 else:
 205                                         return False
 206
 207                         #for key in bm.VARS.keys():
 208                         #       print key, " == ", bm.VARS[key]
 209                 else:
 210                         print "   Unable to retrieve NODE_KEY"
 211
 212         def bootmanager_running(self):
 213                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
 214                         return True
 215                 else:
 216                         return False
 217
 218         def set_nodestate(self, state='boot'):
 219                 return api.UpdateNode(self.node, {'boot_state' : state})
 220
 221         def get_nodestate(self):
 222                 try:
 223                         return api.GetNodes(self.node, ['boot_state'])[0]['boot_state']
 224                 except:
 225                         traceback.print_exc()
 226                         # NOTE: use last cached value from plc
 227                         fbnode = FindbadNodeRecord.get_latest_by(hostname=self.node).to_dict()
 228                         return fbnode['plc_node_stats']['boot_state']
 229
 230
 231         def restart_node(self, state='boot'):
 232                 api.UpdateNode(self.node, {'boot_state' : state})
 233
 234                 pflags = PersistFlags(self.node, 1*60*60*24, db='restart_persistflags')
 235                 if not pflags.getRecentFlag('gentlekill'):
 236                         print "   Killing all slice processes... : %s" %  self.node
 237                         cmd_slicekill = "ls -d /proc/virtual/[0-9]* | awk -F '/' '{print $4}' | xargs -I{} /usr/sbin/vkill -s 9 --xid {} -- 0"
 238                         self.c.modules.os.system(cmd_slicekill)
 239                         cmd = """ shutdown -r +1 & """
 240                         print "   Restarting %s : %s" % ( self.node, cmd)
 241                         self.c.modules.os.system(cmd)
 242
 243                         pflags.setRecentFlag('gentlekill')
 244                         pflags.save()
 245                 else:
 246                         print "   Restarting with sysrq 'sub' %s" % self.node
 247                         cmd = """ (sleep 5; echo 's' > /proc/sysrq-trigger; echo 'u' > /proc/sysrq-trigger; echo 'b' > /proc/sysrq-trigger ) & """
 248                         self.c.modules.os.system(cmd)
 249
 250                 return
 251
 252         def restart_bootmanager(self, forceState):
 253
 254                 self.c.modules.os.chdir('/tmp/source')
 255                 if self.c.modules.os.path.exists('/tmp/BM_RUNNING'):
 256                         print "   BootManager is already running: try again soon..."
 257                 else:
 258                         print "   Starting 'BootManager.py %s' on %s " % (forceState, self.node)
 259                         cmd = "( touch /tmp/BM_RUNNING ;  " + \
 260                               "  python ./BootManager.py %s &> server.log < /dev/null ; " + \
 261                                   "  rm -f /tmp/BM_RUNNING " + \
 262                                   ") &"
 263                         cmd = cmd % forceState
 264                         self.c.modules.os.system(cmd)
 265
 266                 return
 267
 268
 269 class PlanetLabSession:
 270         globalport = 22000 + int(random.random()*1000)
 271
 272         def __init__(self, node, nosetup, verbose):
 273                 self.verbose = verbose
 274                 self.node = node
 275                 self.port = None
 276                 self.nosetup = nosetup
 277                 self.command = None
 278                 self.setup_host()
 279
 280         def get_connection(self, config):
 281                 try:
 282                         print "SocketConnection(localhost, %s" % self.port
 283                         sc = SocketConnection("localhost", self.port)
 284                         print "NodeConnection(%s, %s)" % (sc, self.node)
 285                         conn = NodeConnection(sc, self.node, config)
 286                 except:
 287                         # NOTE: try twice since this can sometimes fail the first time. If
 288                         #               it fails again, let it go.
 289                         conn = NodeConnection(SocketConnection("localhost", self.port), self.node, config)
 290                 return conn
 291
 292         def setup_host(self):
 293                 self.port = PlanetLabSession.globalport
 294                 PlanetLabSession.globalport = PlanetLabSession.globalport + 1
 295
 296                 args = {}
 297                 args['port'] = self.port
 298                 args['user'] = 'root'
 299                 args['hostname'] = self.node
 300                 args['monitordir'] = config.MONITOR_SCRIPT_ROOT
 301                 ssh_port = 22
 302
 303                 if self.nosetup:
 304                         print "Skipping setup"
 305                         return
 306
 307                 # COPY Rpyc files to host
 308                 #cmd = "rsync -vvv -az -e ssh %(monitordir)s/Rpyc/ %(user)s@%(hostname)s:Rpyc 2> /dev/null" % args
 309                 cmd = """rsync -vvv -az -e "ssh -o BatchMode=yes" %(monitordir)s/monitor/Rpyc/ %(user)s@%(hostname)s:Rpyc""" % args
 310                 if self.verbose: print cmd
 311                 print cmd
 312                 # TODO: Add timeout
 313                 timeout = 120
 314                 localos = moncommands.CMD()
 315
 316                 ret = localos.system(cmd, timeout)
 317                 print ret
 318                 if ret != 0:
 319                         print "\tUNKNOWN SSH KEY FOR %s; making an exception" % self.node
 320                         #print "MAKE EXPLICIT EXCEPTION FOR %s" % self.node
 321                         k = SSHKnownHosts(); k.updateDirect(self.node); k.write(); del k
 322                         print "trying: ", cmd
 323                         print [ "%s=%s" % (a, os.environ[a]) for a in filter(lambda x: 'SSH' in x, os.environ.keys()) ]
 324                         ret = localos.system(cmd, timeout)
 325                         print ret
 326                         if ret != 0:
 327                                 print "\tFAILED TWICE"
 328                                 #email_exception("%s rsync failed twice" % self.node)
 329                                 raise ExceptionDoubleSSHError("Failed twice trying to login with updated ssh host key")
 330
 331                 t1 = time.time()
 332                 # KILL any already running servers.
 333                 ssh = moncommands.SSH(args['user'], args['hostname'], ssh_port)
 334                 (ov,ev) = ssh.run_noexcept2("""<<\EOF
 335             rm -f out.log
 336             echo "kill server" >> out.log
 337                         netstat -ap | grep python | grep 18812 | awk '{print $7}' | awk -F / '{print $1}' | xargs kill
 338             ps ax | grep Rpyc | grep -v grep | awk '{print $1}' | xargs kill 2> /dev/null ;
 339             echo "export" >> out.log
 340             export PYTHONPATH=$HOME  ;
 341             echo "start server" >> out.log
 342             python Rpyc/Servers/forking_server.py &> server.log &
 343             echo "done" >> out.log
 344 EOF""")
 345                 print "setup rpyc server over ssh"
 346                 print ssh.ret
 347
 348                 # TODO: Add timeout
 349                 # This was tricky to make synchronous.  The combination of ssh-clients-4.7p1,
 350                 # and the following options seems to work well.
 351                 cmd = """ssh -o ExitOnForwardFailure=yes -o BatchMode=yes """ + \
 352                           """-o PermitLocalCommand=yes -o LocalCommand='echo "READY"' """ + \
 353                           """-o ConnectTimeout=120 """ + \
 354                           """-n -N -L %(port)s:localhost:18812 """ + \
 355                           """%(user)s@%(hostname)s"""
 356                 cmd = cmd % args
 357                 if self.verbose: print cmd
 358                 print cmd
 359                 self.command = Sopen(cmd, shell=True, stdout=subprocess.PIPE)
 360                 # TODO: the read() here may block indefinitely.  Need a better
 361                 # approach therefore, that includes a timeout.
 362                 #ret = self.command.stdout.read(5)
 363                 ret = moncommands.read_t(self.command.stdout, 5)
 364
 365                 t2 = time.time()
 366                 if 'READY' in ret:
 367                         # NOTE: There is still a slight race for machines that are slow...
 368                         self.timeout = 2*(t2-t1)
 369                         print "Sleeping for %s sec" % self.timeout
 370                         time.sleep(self.timeout)
 371                         return
 372
 373                 if self.command.returncode is not None:
 374                         print "Failed to establish tunnel!"
 375                         raise Exception("SSH Tunnel exception : %s %s" % (self.node, self.command.returncode))
 376
 377                 raise Exception("Unknown SSH Tunnel Exception: still running, but did not report 'READY'")
 378
 379         def __del__(self):
 380                 if self.command:
 381                         if self.verbose: print "Killing SSH session %s" % self.port
 382                         print "Killing SSH session %s" % self.port
 383                         self.command.kill()
 384
 385
 386 def steps_to_list(steps, index=1):
 387         return map(lambda x: x[index], steps)
 388
 389 def index_to_id(steps,index):
 390         if index < len(steps):
 391                 return steps[index][0]
 392         else:
 393                 return "done"
 394
 395 class DebugInterface:
 396         def __init__(self, hostname):
 397                 self.hostname = hostname
 398                 self.session = None
 399
 400         def getConnection(self):
 401                 print "Creating session for %s" % self.hostname
 402                 # update known_hosts file (in case the node has rebooted since last run)
 403                 try:
 404                         k = SSHKnownHosts(); k.update(self.hostname); k.write(); del k
 405                 except:
 406                         email_exception()
 407                         print traceback.print_exc()
 408                         return False
 409
 410                 msg = "ERROR setting up session for %s" % self.hostname
 411                 try:
 412                         if config == None:
 413                                 self.session = PlanetLabSession(self.hostname, False, True)
 414                         else:
 415                                 self.session = PlanetLabSession(self.hostname, config.nosetup, config.verbose)
 416                 except ExceptionDoubleSSHError, e:
 417                         print msg
 418                         return False
 419                 except Exception, e:
 420                         traceback.print_exc()
 421                         email_exception(msg)
 422                         return False
 423
 424                 print "Getting connection: 1st try"
 425                 try:
 426                         conn = self.session.get_connection(config)
 427                 except EOFError:
 428                         # NOTE: sometimes the wait in setup_host() is not long enough.
 429                         # So, here we try to wait a little longer before giving up entirely.
 430                         try:
 431                                 print "Getting connection: 2nd try"
 432                                 time.sleep(self.session.timeout*5)
 433                                 conn = self.session.get_connection(config)
 434                         except EOFError:
 435                                 # failed twice... no need to report this really, it's just in a
 436                                 # weird state...
 437                                 print "Getting connection: failed"
 438                                 email_exception(self.hostname, "failed twice to get connection")
 439                                 return False
 440                         except:
 441                                 traceback.print_exc()
 442                                 email_exception(self.hostname)
 443                                 return False
 444                 print "Getting connection: ok"
 445                 #print "trying to use conn before returning it."
 446                 #print conn.c.modules.sys.path
 447                 #print conn.c.modules.os.path.exists('/tmp/source')
 448                 #time.sleep(1)
 449
 450                 #print "conn: %s" % conn
 451                 return conn
 452
 453         def getSequences(self):
 454
 455                 # NOTE: The DB is now the autoritative record for all BM sequences.
 456                 #               An admin can introduce new patterns and actions without touching code.
 457                 sequences = {}
 458
 459                 bms = BootmanSequenceRecord.query.all()
 460                 for s in bms:
 461                         sequences[s.sequence] = s.action
 462
 463                 return sequences
 464
 465         def getDiskSteps(self):
 466                 steps = [
 467                         ('scsierror2' , 'sd \d:\d:\d:\d: ioctl_internal_command return code = \d+'),
 468                         ('scsierror'  , 'SCSI error : <\d+ \d+ \d+ \d+> return code = 0x\d+'),
 469                         ('ioerror'    , 'end_request: I/O error, dev sd\w+, sector \d+'),
 470                         ('ccisserror' , 'cciss: cmd \w+ has CHECK CONDITION'),
 471
 472                         ('buffererror', 'Buffer I/O error on device dm-\d, logical block \d+'),
 473
 474                         ('hdaseekerror', 'hda: dma_intr: status=0x\d+ { DriveReady SeekComplete Error }'),
 475                         ('hdacorrecterror', 'hda: dma_intr: error=0x\d+ { UncorrectableError }, LBAsect=\d+, sector=\d+'),
 476
 477                         ('atareadyerror'   , 'ata\d+: status=0x\d+ { DriveReady SeekComplete Error }'),
 478                         ('atacorrecterror' , 'ata\d+: error=0x\d+ { UncorrectableError }'),
 479
 480                         ('sdXerror'   , 'sd\w: Current: sense key: Medium Error'),
 481                         ('ext3error'   , 'EXT3-fs error (device dm-\d+): ext3_find_entry: reading directory #\d+ offset \d+'),
 482
 483                         ('floppytimeout','floppy0: floppy timeout called'),
 484                         ('floppyerror',  'end_request: I/O error, dev fd\w+, sector \d+'),
 485
 486                         # hda: dma_intr: status=0x51 { DriveReady SeekComplete Error }
 487                         # hda: dma_intr: error=0x40 { UncorrectableError }, LBAsect=23331263, sector=23331263
 488
 489                         # floppy0: floppy timeout called
 490                         # end_request: I/O error, dev fd0, sector 0
 491
 492                         # Buffer I/O error on device dm-2, logical block 8888896
 493                         # ata1: status=0x51 { DriveReady SeekComplete Error }
 494                         # ata1: error=0x40 { UncorrectableError }
 495                         # SCSI error : <0 0 0 0> return code = 0x8000002
 496                         # sda: Current: sense key: Medium Error
 497                         #       Additional sense: Unrecovered read error - auto reallocate failed
 498
 499                         # SCSI error : <0 2 0 0> return code = 0x40001
 500                         # end_request: I/O error, dev sda, sector 572489600
 501                 ]
 502                 return steps
 503
 504         def getDiskSequence(self, steps, child):
 505                 sequence = []
 506                 while True:
 507                         id = index_to_id(steps, child.expect( steps_to_list(steps) + [ pexpect.EOF ]))
 508                         sequence.append(id)
 509
 510                         if id == "done":
 511                                 break
 512                 return sequence
 513
 514         def getBootManagerStepPatterns(self):
 515                 steps = [
 516                         ('bminit'               , 'Initializing the BootManager.'),
 517                         ('cfg'                  , 'Reading node configuration file.'),
 518                         ('auth'                 , 'Authenticating node with PLC.'),
 519                         ('getplc'               , 'Retrieving details of node from PLC.'),
 520                         ('update'               , 'Updating node boot state at PLC.'),
 521                         ('hardware'             , 'Checking if hardware requirements met.'),
 522                         ('installinit'  , 'Install: Initializing.'),
 523                         ('installdisk'  , 'Install: partitioning disks.'),
 524                         ('installbootfs', 'Install: bootstrapfs tarball.'),
 525                         ('installcfg'   , 'Install: Writing configuration files.'),
 526                         ('installstop'  , 'Install: Shutting down installer.'),
 527                         ('update2'              , 'Updating node boot state at PLC.'),
 528                         ('installinit2' , 'Install: Initializing.'),
 529                         ('validate'             , 'Validating node installation.'),
 530                         ('rebuildinitrd', 'Rebuilding initrd'),
 531                         ('netcfg'               , 'Install: Writing Network Configuration files.'),
 532                         ('update3'              , 'Updating node configuration.'),
 533                         ('disk'                 , 'Checking for unused disks to add to LVM.'),
 534                         ('update4'              , 'Sending hardware configuration to PLC.'),
 535                         ('debug'                , 'Starting debug mode'),
 536                         ('bmexceptmount', 'BootManagerException during mount'),
 537                         ('bmexceptvgscan', 'BootManagerException during vgscan/vgchange'),
 538                         ('bmexceptrmfail', 'Unable to remove directory tree: /tmp/mnt'),
 539                         ('exception'    , 'Exception'),
 540                         ('nocfg'        , 'Found configuration file planet.cnf on floppy, but was unable to parse it.'),
 541                         ('protoerror2'  , '500 Internal Server Error'),
 542                         ('protoerror'   , 'XML RPC protocol error'),
 543                         ('nodehostname' , 'Configured node hostname does not resolve'),
 544                         ('implementerror', 'Implementation Error'),
 545                         ('fsckabort'    , 'is mounted.  e2fsck: Cannot continue, aborting'),
 546                         ('fsckfail'             , 'Running e2fsck -v -p /dev/planetlab/root failed'),
 547                         ('fsckfail2'    , 'Running e2fsck -v -p /dev/planetlab/vservers failed'),
 548                         ('readonlyfs'   , '\[Errno 30\] Read-only file system'),
 549                         ('baddisk'      , "IOError: \[Errno 13\] Permission denied: '/tmp/mnt/sysimg//vservers/\w+/etc/hosts'"),
 550                         ('noinstall'    , 'notinstalled'),
 551                         ('bziperror'    , 'bzip2: Data integrity error when decompressing.'),
 552                         ('noblockdev'   , "No block devices detected."),
 553                         ('missingkernel', "missingkernel"),
 554                         ('dnserror'     , 'Name or service not known'),
 555                         ('noparseconfig', "Found configuration file plnode.txt on floppy, but was unable to parse it"),
 556                         ('noconfig'             , "Unable to find and read a node configuration file"),
 557                         ('downloadfail' , 'Unable to download main tarball /boot/bootstrapfs-planetlab-i386.tar.bz2 from server.'),
 558                         ('disktoosmall' , 'The total usable disk size of all disks is insufficient to be usable as a PlanetLab node.'),
 559                         ('hardwarerequirefail' , 'Hardware requirements not met'),
 560                         ('mkfsfail'         , 'while running: Running mkfs.ext2 -q  -m 0 -j /dev/planetlab/vservers failed'),
 561                         ('nofilereference', "No such file or directory: '/tmp/mnt/sysimg//vservers/.vref/planetlab-f8-i386/etc/hosts'"),
 562                         ('kernelcopyfail', "cp: cannot stat `/tmp/mnt/sysimg/boot/kernel-boot': No such file or directory"),
 563                         ('chrootfail'   , 'Running chroot /tmp/mnt/sysimg'),
 564                         ('modulefail'   , 'Unable to get list of system modules'),
 565                         ('writeerror'   , 'write error: No space left on device'),
 566                         ('nospace'      , "No space left on device"),
 567                         ('nonode'       , 'Failed to authenticate call: No such node'),
 568                         ('authfail'     , 'Failed to authenticate call: Call could not be authenticated'),
 569                         ('authfail2'    , 'Authentication Failed'),
 570                         ('bootcheckfail'  , 'BootCheckAuthentication'),
 571                         ('bootupdatefail' , 'BootUpdateNode'),
 572                 ]
 573                 return steps
 574
 575         def getBootManagerSequenceFromLog(self, steps, child):
 576                 sequence = []
 577                 while True:
 578
 579                         index = child.expect( steps_to_list(steps) + [ pexpect.EOF ])
 580                         id = index_to_id(steps,index)
 581                         sequence.append(id)
 582
 583                         if id == "exception":
 584                                 print "...Found An Exception!!!"
 585                         elif id == "done": #index == len(steps_to_list(steps)):
 586                                 #print "Reached EOF"
 587                                 break
 588
 589                 return sequence
 590
 591 def restore(sitehist, hostname, config=None, forced_action=None):
 592         ret = restore_basic(sitehist, hostname, config, forced_action)
 593         session.flush()
 594         return ret
 595
 596 def restore_basic(sitehist, hostname, config=None, forced_action=None):
 597
 598         # NOTE: Nothing works if the bootcd is REALLY old.
 599         #       So, this is the first step.
 600
 601         bootman_action = "unknown"
 602
 603         fbnode = FindbadNodeRecord.get_latest_by(hostname=hostname).to_dict()
 604         recent_actions = sitehist.getRecentActions(hostname=hostname)
 605
 606         if fbnode['observed_category'] == "OLDBOOTCD":
 607                 print "\t...Notify owner to update BootImage!!!"
 608
 609                 if not found_within(recent_actions, 'newbootcd_notice', 3.5):
 610                         sitehist.sendMessage('newbootcd_notice', hostname=hostname)
 611
 612                         print "\tDisabling %s due to out-of-date BootImage" % hostname
 613                         api.UpdateNode(hostname, {'boot_state' : 'disabled'})
 614
 615                 # NOTE: nothing else is possible.
 616                 return "disabled"
 617
 618         debugnode = DebugInterface(hostname)
 619         conn = debugnode.getConnection()
 620         if type(conn) == type(False): return "connect_failed"
 621
 622         boot_state = conn.get_boot_state()
 623         if boot_state != "debug":
 624                 print "... %s in %s state: skipping..." % (hostname , boot_state)
 625                 return "skipped" #boot_state == "boot"
 626
 627         if conn.bootmanager_running():
 628                 print "...BootManager is currently running.  Skipping host %s" %hostname
 629                 return "skipped" # True
 630
 631         # Read persistent flags, tagged on one week intervals.
 632
 633         if config and not config.quiet: print "...downloading dmesg from %s" %hostname
 634         dmesg = conn.get_dmesg()
 635         child = fdpexpect.fdspawn(dmesg)
 636
 637         steps = debugnode.getDiskSteps()
 638         sequence = debugnode.getDiskSequence(steps, child)
 639
 640         s = Set(sequence)
 641         if config and not config.quiet: print "\tSET: ", s
 642
 643         if len(s) > 1:
 644                 print "...Potential drive errors on %s" % hostname
 645                 if len(s) == 2 and 'floppyerror' in s:
 646                         print "...Should investigate.  Continuing with node."
 647                 else:
 648                         print "...Should investigate.  Skipping node."
 649                         # TODO: send message related to these errors.
 650
 651                         if not found_within(recent_actions, 'baddisk_notice', 7):
 652                                 print "baddisk_notice not found recently"
 653
 654                                 log=conn.get_dmesg().read()
 655                                 sitehist.sendMessage('baddisk_notice', hostname=hostname, log=log)
 656                                 return "skipping_baddisk"
 657                         else:
 658                                 # NOTE: "" does not add a new action record
 659                                 return ""
 660
 661
 662         print "...Downloading bm.log from %s" %hostname
 663         log = conn.get_bootmanager_log()
 664         bm_log_data = log.read() # get data
 665         log.seek(0)     # reset fd pointer for fdspawn
 666         child = fdpexpect.fdspawn(log)
 667
 668         if hasattr(config, 'collect') and config.collect: return "collect"
 669
 670         if config and not config.quiet: print "...Scanning bm.log for errors"
 671
 672         time.sleep(1)
 673
 674         steps = debugnode.getBootManagerStepPatterns()
 675         sequence = debugnode.getBootManagerSequenceFromLog(steps, child)
 676
 677         s = "-".join(sequence)
 678         print "   FOUND SEQUENCE: ", s
 679
 680         # NOTE: We get or set the flag based on the current sequence identifier.
 681         #  By using the sequence identifier, we guarantee that there will be no
 682         #  frequent loops.  I'm guessing there is a better way to track loops,
 683         #  though.
 684
 685         sequences = debugnode.getSequences()
 686         flag_set = True
 687
 688         if s not in sequences:
 689                 print "   HOST %s" % hostname
 690                 print "   UNKNOWN SEQUENCE: %s" % s
 691
 692                 args = {}
 693                 args['hostname'] = hostname
 694                 args['sequence'] = s
 695                 args['bmlog'] = bm_log_data
 696                 args['viart'] = False
 697                 args['saveact'] = True
 698                 args['ccemail'] = True
 699
 700                 if 'nospace' in s:
 701                         # NOTE: sequence is unknown and contains nospace, so try the
 702                         # fprobe repair trick first.
 703                         conn.fprobe_repair_node()
 704
 705                 sitehist.sendMessage('unknownsequence_notice', **args)
 706                 conn.restart_bootmanager('boot')
 707                 bootman_action = "restart_bootmanager"
 708
 709                 # NOTE: Do not set the pflags value for this sequence if it's unknown.
 710                 # This way, we can check it again after we've fixed it.
 711                 flag_set = False
 712
 713         else:
 714                 bootman_action = sequences[s]
 715
 716                 if   sequences[s] == "restart_bootmanager_boot":
 717                         print "...Restarting BootManager.py on %s "%hostname
 718                         conn.restart_bootmanager('boot')
 719                 elif sequences[s] == "restart_bootmanager_rins":
 720                         print "...Restarting BootManager.py on %s "%hostname
 721                         conn.restart_bootmanager('reinstall')
 722                 elif sequences[s] == "restart_node_rins":
 723                         conn.restart_node('reinstall')
 724                 elif sequences[s] == "restart_node_boot":
 725                         conn.restart_node('boot')
 726                 elif sequences[s] == "fsck_repair":
 727                         conn.fsck_repair_node()
 728                 elif sequences[s] == "repair_node_keys":
 729                         if conn.compare_and_repair_nodekeys():
 730                                 # the keys either are in sync or were forced in sync.
 731                                 # so try to start BM again.
 732                                 conn.restart_bootmanager(conn.get_nodestate())
 733                         else:
 734                                 # there was some failure to synchronize the keys.
 735                                 print "...Unable to repair node keys on %s" %hostname
 736                                 if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
 737                                         args = {}
 738                                         args['hostname'] = hostname
 739                                         sitehist.sendMessage('nodeconfig_notice', **args)
 740                                         conn.dump_plconf_file()
 741                                 else:
 742                                         # NOTE: do not add a new action record
 743                                         return ""
 744
 745                 elif sequences[s] == "unknownsequence_notice":
 746                         args = {}
 747                         args['hostname'] = hostname
 748                         args['sequence'] = s
 749                         args['bmlog'] = bm_log_data
 750                         args['viart'] = False
 751                         args['saveact'] = True
 752                         args['ccemail'] = True
 753
 754                         sitehist.sendMessage('unknownsequence_notice', **args)
 755                         conn.restart_bootmanager('boot')
 756
 757                 elif sequences[s] == "nodeconfig_notice":
 758
 759                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
 760                                 args = {}
 761                                 args['hostname'] = hostname
 762                                 sitehist.sendMessage('nodeconfig_notice', **args)
 763                                 conn.dump_plconf_file()
 764                         else:
 765                                 # NOTE: do not add a new action record
 766                                 return ""
 767
 768                 elif sequences[s] == "nodenetwork_email":
 769
 770                         if not found_within(recent_actions, 'nodeconfig_notice', 3.5):
 771                                 args = {}
 772                                 args['hostname'] = hostname
 773                                 args['bmlog'] = bm_log_data
 774                                 sitehist.sendMessage('nodeconfig_notice', **args)
 775                                 conn.dump_plconf_file()
 776                         else:
 777                                 # NOTE: do not add a new action record
 778                                 return ""
 779
 780                 elif sequences[s] == "noblockdevice_notice":
 781
 782                         if not found_within(recent_actions, 'noblockdevice_notice', 3.5):
 783                                 args = {}
 784                                 #args.update(getconf.getconf(hostname)) # NOTE: Generates boot images for the user:
 785                                 args['hostname'] = hostname
 786
 787                                 sitehist.sendMessage('noblockdevice_notice', **args)
 788                         else:
 789                                 # NOTE: do not add a new action record
 790                                 return ""
 791
 792                 elif sequences[s] == "baddisk_notice":
 793                         # MAKE An ACTION record that this host has failed hardware.  May
 794                         # require either an exception "/minhw" or other manual intervention.
 795                         # Definitely need to send out some more EMAIL.
 796                         # TODO: email notice of broken hardware
 797                         if not found_within(recent_actions, 'baddisk_notice', 7):
 798                                 print "...NOTIFYING OWNERS OF BROKEN HARDWARE on %s!!!" % hostname
 799                                 args = {}
 800                                 args['hostname'] = hostname
 801                                 args['log'] = conn.get_dmesg().read()
 802
 803                                 sitehist.sendMessage('baddisk_notice', **args)
 804                                 #conn.set_nodestate('disabled')
 805                         else:
 806                                 # NOTE: do not add a new action record
 807                                 return ""
 808
 809                 elif sequences[s] == "minimalhardware_notice":
 810                         if not found_within(recent_actions, 'minimalhardware_notice', 7):
 811                                 print "...NOTIFYING OWNERS OF MINIMAL HARDWARE FAILURE on %s!!!" % hostname
 812                                 args = {}
 813                                 args['hostname'] = hostname
 814                                 args['bmlog'] = bm_log_data
 815                                 sitehist.sendMessage('minimalhardware_notice', **args)
 816                         else:
 817                                 # NOTE: do not add a new action record
 818                                 return ""
 819
 820                 elif sequences[s] == "baddns_notice":
 821                         if not found_within(recent_actions, 'baddns_notice', 1):
 822                                 print "...NOTIFYING OWNERS OF DNS FAILURE on %s!!!" % hostname
 823                                 args = {}
 824                                 try:
 825                                         node = plccache.GetNodeByName(hostname)
 826                                         net = api.GetInterfaces(node['interface_ids'])[0]
 827                                 except:
 828                                         email_exception()
 829                                         print traceback.print_exc()
 830                                         # TODO: api error. skip email, b/c all info is not available,
 831                                         # flag_set will not be recorded.
 832                                         return "exception"
 833                                 nodenet_str = network_config_to_str(net)
 834
 835                                 args['hostname'] = hostname
 836                                 args['network_config'] = nodenet_str
 837                                 args['interface_id'] = net['interface_id']
 838
 839                                 sitehist.sendMessage('baddns_notice', **args)
 840                         else:
 841                                 # NOTE: do not add a new action record
 842                                 return ""
 843
 844         return bootman_action
 845
 846
 847 if __name__ == "__main__":
 848         print "ERROR: Can not execute module as a command! Please use commands/%s.py" % os.path.splitext(__file__)[0]