Merge remote-tracking branch 'origin/5.0' into bootmanager-vender
[bootmanager.git] / source / BootManager.py
diff --git a/source/BootManager.py b/source/BootManager.py
new file mode 100755 (executable)
index 0000000..3c3f1d4
--- /dev/null
@@ -0,0 +1,416 @@
+#!/usr/bin/python -u
+#
+# Copyright (c) 2003 Intel Corporation
+# All rights reserved.
+#
+# Copyright (c) 2004-2006 The Trustees of Princeton University
+# All rights reserved.
+
+import string
+import sys, os, traceback
+import time
+import gzip
+
+from steps import *
+from Exceptions import *
+import notify_messages
+import BootServerRequest
+import utils
+
+# all output is written to this file
+BM_NODE_LOG= "/tmp/bm.log"
+# name of the file holding the initial variables/constants, one name=val
+# per line; being a relative path, it is looked up in the current
+# working directory
+VARS_FILE = "configuration"
+
+# the new contents of PATH when the boot manager is running
+# (the entries are joined with ":" when PATH is set)
+BIN_PATH= ('/usr/local/bin',
+           '/usr/local/sbin',
+           '/usr/bin',
+           '/usr/sbin',
+           '/bin',
+           '/sbin')
+
+def read_configuration_file(filename):
+    # read in and store all variables in VARS_FILE into each line
+    # is in the format name=val (any whitespace around the = is
+    # removed. everything after the = to the end of the line is
+    # the value
+    vars = {}
+    vars_file= file(filename,'r')
+    validConfFile = True
+    for line in vars_file:
+        # if its a comment or a whitespace line, ignore
+        if line[:1] == "#" or string.strip(line) == "":
+            continue
+
+        parts= string.split(line,"=")
+        if len(parts) != 2:
+            validConfFile = False
+            raise Exception( "Invalid line in vars file: %s" % line )
+
+        name= string.strip(parts[0])
+        value= string.strip(parts[1])
+        value= value.replace("'", "")   # remove quotes
+        value= value.replace('"', "")   # remove quotes
+        vars[name]= value
+
+    vars_file.close()
+    if not validConfFile:
+        raise Exception( "Unable to read configuration vars." )
+
+    # find out which directory we are running it, and set a variable
+    # for that. future steps may need to get files out of the bootmanager
+    # directory
+    current_dir= os.getcwd()
+    vars['BM_SOURCE_DIR']= current_dir
+
+    return vars
+
+##############################
+class log:
+
+    format="%H:%M:%S(%Z) "
+
+    def __init__( self, OutputFilePath= None ):
+        try:
+            self.OutputFile= open( OutputFilePath, "w")
+            self.OutputFilePath= OutputFilePath
+        except:
+            print( "bootmanager log : Unable to open output file %r, continuing"%OutputFilePath )
+            self.OutputFile= None
+
+        self.VARS = None
+        try:
+            vars = read_configuration_file(VARS_FILE)
+            self.VARS = vars
+        except Exception, e:
+            self.LogEntry( str(e) )
+            return
+    
+    def LogEntry( self, str, inc_newline= 1, display_screen= 1 ):
+        now=time.strftime(log.format, time.localtime())
+        if self.OutputFile:
+            self.OutputFile.write( now+str )
+        if display_screen:
+            sys.stdout.write( now+str )
+            
+        if inc_newline:
+            if display_screen:
+                sys.stdout.write( "\n" )
+            if self.OutputFile:
+                self.OutputFile.write( "\n" )
+
+        if self.OutputFile:
+            self.OutputFile.flush()
+
+    def write( self, str ):
+        """
+        make log behave like a writable file object (for traceback
+        prints)
+        """
+        self.LogEntry( str, 0, 1 )
+    
+    # bm log uploading is available back again, as of nodeconfig-5.0-2
+    def Upload( self, extra_file=None ):
+        """
+        upload the contents of the log to the server
+        """
+        if self.OutputFile is not None:
+            self.OutputFile.flush()
+
+            self.LogEntry( "Uploading logs to %s" % self.VARS['UPLOAD_LOG_SCRIPT'] )
+            
+            self.OutputFile.close()
+            self.OutputFile= None
+
+            hostname= self.VARS['INTERFACE_SETTINGS']['hostname'] + "." + \
+                      self.VARS['INTERFACE_SETTINGS']['domainname']
+            bs_request = BootServerRequest.BootServerRequest(self.VARS)
+            try:
+                # this was working until f10
+                bs_request.MakeRequest(PartialPath = self.VARS['UPLOAD_LOG_SCRIPT'],
+                                       GetVars = None, PostVars = None,
+                                       DoSSL = True, DoCertCheck = True,
+                                       FormData = ["log=@" + self.OutputFilePath,
+                                                   "hostname=" + hostname, 
+                                                   "type=bm.log"])
+            except:
+                # new pycurl
+                import pycurl
+                bs_request.MakeRequest(PartialPath = self.VARS['UPLOAD_LOG_SCRIPT'],
+                                       GetVars = None, PostVars = None,
+                                       DoSSL = True, DoCertCheck = True,
+                                       FormData = [('log',(pycurl.FORM_FILE, self.OutputFilePath)),
+                                                   ("hostname",hostname),
+                                                   ("type","bm.log")])
+        if extra_file is not None:
+            # NOTE: for code-reuse, evoke the bash function 'upload_logs'; 
+            # by adding --login, bash reads .bash_profile before execution.
+            # Also, never fail, since this is an optional feature.
+            utils.sysexec_noerr( """bash --login -c "upload_logs %s" """ % extra_file, self)
+
+
+##############################
+class BootManager:
+    """
+    runs the boot manager steps that match the boot state of the node.
+
+    CAN_RUN is 1 only when the log object given to the constructor
+    carries a configuration (log.VARS); main() checks it before
+    calling Run().
+    """
+
+    # the file containing initial variables/constants is named by the
+    # module-level VARS_FILE; it is read by the log class and its
+    # contents reach this class through log.VARS
+
+    # the set of valid node run states
+    # (main() accepts these keys as a forced state; Run() replaces the
+    # None values with the functions implementing each state)
+    NodeRunStates = {'reinstall':None,
+                     'boot':None,
+                     'safeboot':None,
+                     'disabled':None,
+                     }
+    
+    def __init__(self, log, forceState):
+        """
+        log is the logger, whose VARS attribute provides the initial
+        variables. forceState, when not None, replaces the BOOT_STATE
+        of the node in Run().
+        """
+        # override machine's current state from the command line
+        self.forceState = forceState
+
+        # the main logging point
+        self.LOG= log
+
+        # set to 1 if we can run after initialization
+        self.CAN_RUN = 0
+
+        if log.VARS:
+            # this contains a set of information used and updated by each step
+            self.VARS= log.VARS
+        else:
+            # no configuration: leave CAN_RUN at 0 (self.VARS is not set)
+            return
+             
+        # not sure what the current PATH is set to, replace it with what
+        # we know will work with all the boot cds
+        os.environ['PATH']= string.join(BIN_PATH,":")
+
+        self.CAN_RUN= 1
+
+    def Run(self):
+        """
+        core boot manager logic.
+
+        the way errors are handled is as such: if any particular step
+        cannot continue or unexpectedly fails, an exception is thrown.
+        in this case, the boot manager cannot continue running.
+
+        these step functions can also return a 0/1 depending on whether
+        or not it succeeded. In the case of steps like ConfirmInstallWithUser,
+        a 0 is returned and no exception is thrown if the user chose not
+        to confirm the install. The same goes with the CheckHardwareRequirements.
+        If requirements not met, but tests were successful, return 0.
+
+        for steps that run within the installer, they are expected to either
+        complete successfully and return 1, or throw an exception.
+
+        For exact return values and expected operations, see the comments
+        at the top of each of the individual step functions.
+
+        returns 1 if the function selected for the boot state completed
+        without raising, and 0 otherwise.
+        """
+
+        def _nodeNotInstalled(message='MSG_NODE_NOT_INSTALLED'):
+            # called by the _xxxState() functions below upon failure
+            # message is the name of an attribute of notify_messages;
+            # always raises BootManagerException with that text
+            self.VARS['RUN_LEVEL']= 'failboot'
+            notify = getattr(notify_messages, message)
+            self.VARS['STATE_CHANGE_NOTIFY']= 1
+            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= notify
+            raise BootManagerException, notify
+
+        def _bootRun():
+            # implements the boot logic, which consists of first
+            # double checking that the node was properly installed,
+            # checking whether someone added or changed disks, and
+            # then finally chain boots.
+
+            # starting the fallback/debug ssh daemon for safety:
+            # if the node install somehow hangs, or if it simply takes ages, 
+            # we can still enter and investigate
+            try:
+                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
+            except:
+                # a failure to start the debug daemon is deliberately ignored
+                pass
+
+            InstallInit.Run( self.VARS, self.LOG )                    
+            # 1 means the installation is usable; -1, -2 and -3 select a
+            # specific notification, anything else the default one
+            ret = ValidateNodeInstall.Run( self.VARS, self.LOG )
+            if ret == 1:
+                WriteModprobeConfig.Run( self.VARS, self.LOG )
+                WriteNetworkConfig.Run( self.VARS, self.LOG )
+                CheckForNewDisks.Run( self.VARS, self.LOG )
+                SendHardwareConfigToPLC.Run( self.VARS, self.LOG )
+                ChainBootNode.Run( self.VARS, self.LOG )
+            elif ret == -1:
+                _nodeNotInstalled('MSG_NODE_FILESYSTEM_CORRUPT')
+            elif ret == -2:
+                _nodeNotInstalled('MSG_NODE_MOUNT_FAILED')
+            elif ret == -3:
+                _nodeNotInstalled('MSG_NODE_MISSING_KERNEL')
+            else:
+                _nodeNotInstalled()
+
+        def _reinstallRun():
+
+            # starting the fallback/debug ssh daemon for safety:
+            # if the node install somehow hangs, or if it simply takes ages, 
+            # we can still enter and investigate
+            try:
+                StartDebug.Run(self.VARS, self.LOG, last_resort = False)
+            except:
+                # a failure to start the debug daemon is deliberately ignored
+                pass
+
+            # implements the reinstall logic, which will check whether
+            # the min. hardware requirements are met, install the
+            # software, and upon correct installation will switch to
+            # 'boot' state and chainboot into the production system
+            if not CheckHardwareRequirements.Run( self.VARS, self.LOG ):
+                self.VARS['RUN_LEVEL']= 'failboot'
+                raise BootManagerException, "Hardware requirements not met."
+
+            # runinstaller
+            InstallInit.Run( self.VARS, self.LOG )                    
+            InstallPartitionDisks.Run( self.VARS, self.LOG )            
+            InstallBootstrapFS.Run( self.VARS, self.LOG )            
+            InstallWriteConfig.Run( self.VARS, self.LOG )
+            InstallUninitHardware.Run( self.VARS, self.LOG )
+            # installation done: record the new state, report it, then
+            # continue with the regular boot logic
+            self.VARS['BOOT_STATE']= 'boot'
+            self.VARS['STATE_CHANGE_NOTIFY']= 1
+            self.VARS['STATE_CHANGE_NOTIFY_MESSAGE']= \
+                 notify_messages.MSG_INSTALL_FINISHED
+            UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
+            _bootRun()
+            
+        def _installRun():
+            # implements the new install logic, which will first check
+            # with the user whether it is ok to install on this
+            # machine, switch to 'reinstall' state and then invoke the reinstall
+            # logic.  See reinstallState logic comments for further
+            # details.
+            # NOTE(review): this function is not registered in the state
+            # table below, so nothing in this method calls it
+            if not ConfirmInstallWithUser.Run( self.VARS, self.LOG ):
+                return 0
+            self.VARS['BOOT_STATE']= 'reinstall'
+            _reinstallRun()
+
+        def _debugRun(state='failboot'):
+            # implements debug logic, which starts the sshd and just waits around
+            # state is stored as the RUN_LEVEL of the node
+            self.VARS['RUN_LEVEL']=state
+            StartDebug.Run( self.VARS, self.LOG )
+            # fsck/mount fs if present, and ignore return value if it's not.
+            ValidateNodeInstall.Run( self.VARS, self.LOG )
+
+        def _badstateRun():
+            # should never happen; log event
+            # (used when BOOT_STATE is not a key of the state table)
+            self.LOG.write( "\nInvalid BOOT_STATE = %s\n" % self.VARS['BOOT_STATE'])
+            _debugRun()
+
+        # setup state -> function hash table
+        # (the table is a class attribute, so these assignments are visible
+        # through BootManager.NodeRunStates after Run() has been called)
+        BootManager.NodeRunStates['reinstall']  = _reinstallRun
+        BootManager.NodeRunStates['boot']       = _bootRun
+        BootManager.NodeRunStates['safeboot']   = lambda : _debugRun('safeboot')
+        BootManager.NodeRunStates['disabled']   = lambda : _debugRun('disabled')
+
+        success = 0
+        try:
+            InitializeBootManager.Run( self.VARS, self.LOG )
+            ReadNodeConfiguration.Run( self.VARS, self.LOG )
+            AuthenticateWithPLC.Run( self.VARS, self.LOG )
+            UpdateLastBootOnce.Run( self.VARS, self.LOG )
+            StartRunlevelAgent.Run( self.VARS, self.LOG )
+            GetAndUpdateNodeDetails.Run( self.VARS, self.LOG )
+
+            # override machine's current state from the command line
+            if self.forceState is not None:
+                self.VARS['BOOT_STATE']= self.forceState
+                UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
+
+            # unknown states fall back to _badstateRun
+            stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'],_badstateRun)
+            stateRun()
+            success = 1
+
+        except KeyError, e:
+            self.LOG.write( "\n\nKeyError while running: %s\n" % str(e) )
+        except BootManagerException, e:
+            self.LOG.write( "\n\nException while running: %s\n" % str(e) )
+        # NOTE(review): if BootManagerAuthenticationException derives from
+        # BootManagerException, the clause above catches it first and this
+        # one is never reached -- confirm against the Exceptions module
+        except BootManagerAuthenticationException, e:
+            self.LOG.write( "\n\nFailed to Authenticate Node: %s\n" % str(e) )
+            # sets /tmp/CANCEL_BOOT flag
+            StartDebug.Run(self.VARS, self.LOG )
+            # Return immediately b/c any other calls to API will fail
+            return success
+        except:
+            self.LOG.write( "\n\nImplementation Error\n")
+            # once to the log file, once to the standard error stream
+            traceback.print_exc(file=self.LOG.OutputFile)
+            traceback.print_exc()
+
+        if not success:
+            # something went wrong: fall back to the debug logic with the
+            # default 'failboot' run level
+            try:
+                _debugRun()
+            except BootManagerException, e:
+                self.LOG.write( "\n\nException while running: %s\n" % str(e) )
+            except:
+                self.LOG.write( "\n\nImplementation Error\n")
+                traceback.print_exc(file=self.LOG.OutputFile)
+                traceback.print_exc()
+
+        return success
+            
+            
+def main(argv):
+
+    import utils
+    utils.prompt_for_breakpoint_mode()
+
+#    utils.breakpoint ("Entering BootManager::main")
+    
+    # set to 1 if error occurred
+    error= 0
+    
+    # all output goes through this class so we can save it and post
+    # the data back to PlanetLab central
+    LOG= log( BM_NODE_LOG )
+
+    # NOTE: assume CWD is BM's source directory, but never fail
+    utils.sysexec_noerr("./setup_bash_history_scripts.sh", LOG)
+
+    LOG.LogEntry( "BootManager started at: %s" % \
+                  time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) )
+
+    try:
+        forceState = None
+        if len(argv) == 2:
+            fState = argv[1]
+            if BootManager.NodeRunStates.has_key(fState):
+                forceState = fState
+            else:
+                LOG.LogEntry("FATAL: cannot force node run state to=%s" % fState)
+                error = 1
+    except:
+        traceback.print_exc(file=LOG.OutputFile)
+        traceback.print_exc()
+        
+    if error:
+        LOG.LogEntry( "BootManager finished at: %s" % \
+                      time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) )
+        LOG.Upload()
+        return error
+
+    try:
+        bm= BootManager(LOG,forceState)
+        if bm.CAN_RUN == 0:
+            LOG.LogEntry( "Unable to initialize BootManager." )
+        else:
+            LOG.LogEntry( "Running version %s of BootManager." % bm.VARS['VERSION'] )
+            success= bm.Run()
+            if success:
+                LOG.LogEntry( "\nDone!" );
+            else:
+                LOG.LogEntry( "\nError occurred!" );
+                error = 1
+    except:
+        traceback.print_exc(file=LOG.OutputFile)
+        traceback.print_exc()
+
+    LOG.LogEntry( "BootManager finished at: %s" % \
+                  time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime()) )
+    LOG.Upload()
+
+    return error
+
+    
+if __name__ == "__main__":
+    error = main(sys.argv)
+    sys.exit(error)