From: Stephen Soltesz Date: Fri, 27 May 2011 20:40:06 +0000 (-0400) Subject: Add Runlevelagent to BM. Add UpdateLastBootOnce() to record last_* in DB. X-Git-Tag: bootmanager-5.0-18~3 X-Git-Url: http://git.onelab.eu/?p=bootmanager.git;a=commitdiff_plain;h=7a73fb13b2a2af2d470da53fbb43760ea2544638 Add Runlevelagent to BM. Add UpdateLastBootOnce() to record last_* in DB. remove the dependency on monitor runlevelagent package. add UpdateLastBootOnce to update last_time_spent_online/_offline. ensure that UpdateLastBootOnce() is called only once per boot. remove calls to runlevelagent in BM --- diff --git a/bootmanager.spec b/bootmanager.spec index 945d59a..70650fa 100644 --- a/bootmanager.spec +++ b/bootmanager.spec @@ -26,7 +26,7 @@ Requires: httpd Requires: PLCAPI >= 5.0 # the python code packaged in these are shipped on the node as well -Requires: pypcilib pyplnet monitor-runlevelagent +Requires: pypcilib pyplnet ### avoid having yum complain about updates, as stuff is moving around # plc.d/bootmanager @@ -72,11 +72,6 @@ mkdir -p /var/log/bm chown apache:apache /var/log/bm chmod 700 /var/log/bm -# NOTE: do not run this agent when installed on a myplc. 
-# xxx - a bit hacky maybe -chkconfig monitor-runlevelagent off -chkconfig --del monitor-runlevelagent - %files %defattr(-,root,root,-) %{_datadir}/%{name} diff --git a/build.sh b/build.sh index 2299dca..ee4bd32 100755 --- a/build.sh +++ b/build.sh @@ -63,7 +63,7 @@ fi # Add python code from the following packages # make sure they are in the 'Requires' header of the specfile -required_rpms="pypcilib pyplnet monitor-runlevelagent" +required_rpms="pypcilib pyplnet" extra_libs=`mktemp -d "/tmp/.bootmanager.XXXXXX"` mkdir $extra_libs/source cp -p $(rpm -ql $required_rpms | grep -v '\.py[co]$') $extra_libs/source diff --git a/source/BootManager.py b/source/BootManager.py index f47748d..3c3f1d4 100755 --- a/source/BootManager.py +++ b/source/BootManager.py @@ -283,13 +283,11 @@ class BootManager: if not ConfirmInstallWithUser.Run( self.VARS, self.LOG ): return 0 self.VARS['BOOT_STATE']= 'reinstall' - UpdateRunLevelWithPLC.Run( self.VARS, self.LOG ) _reinstallRun() def _debugRun(state='failboot'): # implements debug logic, which starts the sshd and just waits around self.VARS['RUN_LEVEL']=state - UpdateRunLevelWithPLC.Run( self.VARS, self.LOG ) StartDebug.Run( self.VARS, self.LOG ) # fsck/mount fs if present, and ignore return value if it's not. 
ValidateNodeInstall.Run( self.VARS, self.LOG ) @@ -310,6 +308,7 @@ class BootManager: InitializeBootManager.Run( self.VARS, self.LOG ) ReadNodeConfiguration.Run( self.VARS, self.LOG ) AuthenticateWithPLC.Run( self.VARS, self.LOG ) + UpdateLastBootOnce.Run( self.VARS, self.LOG ) StartRunlevelAgent.Run( self.VARS, self.LOG ) GetAndUpdateNodeDetails.Run( self.VARS, self.LOG ) @@ -317,7 +316,6 @@ class BootManager: if self.forceState is not None: self.VARS['BOOT_STATE']= self.forceState UpdateBootStateWithPLC.Run( self.VARS, self.LOG ) - UpdateRunLevelWithPLC.Run( self.VARS, self.LOG ) stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'],_badstateRun) stateRun() diff --git a/source/RunlevelAgent.py b/source/RunlevelAgent.py new file mode 100755 index 0000000..e3047b3 --- /dev/null +++ b/source/RunlevelAgent.py @@ -0,0 +1,210 @@ +#!/usr/bin/python +# +# RunlevelAgent - acts as a heartbeat back to myplc reporting that the node is +# online and whether it is in boot or pre-boot run-level. +# This is useful to identify nodes that are behind a firewall, as well as to +# have the machine report run-time status both in safeboot and boot modes, +# so that it is immediately visible at myplc (gui or api). +# + +import xml, xmlrpclib +import logging +import time +import traceback +import sys +import os +import string + +CONFIG_FILE="/tmp/source/configuration" +SESSION_FILE="/etc/planetlab/session" +RLA_PID_FILE="/var/run/rla.pid" + +def read_config_file(filename): + ## NOTE: text copied from BootManager.py + # TODO: unify this code to make it common. i.e. 
use ConfigParser module + vars = {} + vars_file= file(filename,'r') + validConfFile = True + for line in vars_file: + # if its a comment or a whitespace line, ignore + if line[:1] == "#" or string.strip(line) == "": + continue + + parts= string.split(line,"=") + if len(parts) != 2: + print "Invalid line in vars file: %s" % line + validConfFile = False + break + + name= string.strip(parts[0]) + value= string.strip(parts[1]) + vars[name]= value + + vars_file.close() + if not validConfFile: + print "Unable to read configuration vars." + + return vars + +try: + sys.path = ['/etc/planetlab'] + sys.path + import plc_config + api_server_url = "https://" + plc_config.PLC_API_HOST + plc_config.PLC_API_PATH +except: + filename=CONFIG_FILE + vars = read_config_file(filename) + api_server_url = vars['BOOT_API_SERVER'] + + +class Auth: + def __init__(self, username=None, password=None, **kwargs): + if 'session' in kwargs: + self.auth= { 'AuthMethod' : 'session', + 'session' : kwargs['session'] } + else: + if username==None and password==None: + self.auth = {'AuthMethod': "anonymous"} + else: + self.auth = {'Username' : username, + 'AuthMethod' : 'password', + 'AuthString' : password} +class PLC: + def __init__(self, auth, url): + self.auth = auth + self.url = url + self.api = xmlrpclib.Server(self.url, verbose=False, allow_none=True) + + def __getattr__(self, name): + method = getattr(self.api, name) + if method is None: + raise AssertionError("method does not exist") + + return lambda *params : method(self.auth.auth, *params) + + def __repr__(self): + return self.api.__repr__() + +def extract_from(filename, pattern): + f = os.popen("grep -E %s %s" % (pattern, filename)) + val = f.read().strip() + return val + +def check_running(commandname): + f = os.popen("ps ax | grep -E %s | grep -v grep" % (commandname)) + val = f.read().strip() + return val + + +def save_pid(): + # save PID + try: + pid = os.getpid() + f = open(RLA_PID_FILE, 'w') + f.write("%s\n" % pid) + f.close() + 
except: + print "Uuuhhh.... this should not occur." + sys.exit(1) + +def start_and_run(): + + save_pid() + + # Keep trying to authenticate session, waiting for NM to re-write the + # session file, or DNS to succeed, until AuthCheck succeeds. + while True: + try: + f=open(SESSION_FILE,'r') + session_str=f.read().strip() + api = PLC(Auth(session=session_str), api_server_url) + # NOTE: What should we do if this call fails? + # TODO: handle dns failure here. + api.AuthCheck() + break + except: + print "Retry in 30 seconds: ", os.popen("uptime").read().strip() + traceback.print_exc() + time.sleep(30) + + try: + env = 'production' + if len(sys.argv) > 2: + env = sys.argv[2] + except: + traceback.print_exc() + + while True: + try: + # NOTE: here we are inferring the runlevel by environmental + # observations. We know how this process was started by the + # given command line argument. Then in bootmanager + # runlevel, the bm.log gives information about the current + # activity. + # other options: + # call plc for current boot state? + # how long have we been running? 
+ if env == "bootmanager": + bs_val = extract_from('/tmp/bm.log', "'Current boot state:'") + if len(bs_val) > 0: bs_val = bs_val.split()[-1] + ex_val = extract_from('/tmp/bm.log', 'Exception') + fs_val = extract_from('/tmp/bm.log', 'mke2fs') + bm_val = check_running("BootManager.py") + + if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']: + api.ReportRunlevel({'run_level' : 'safeboot'}) + + elif len(ex_val) > len("Exception"): + api.ReportRunlevel({'run_level' : 'failboot'}) + + elif len(fs_val) > 0 and len(bm_val) > 0: + api.ReportRunlevel({'run_level' : 'reinstall'}) + + else: + api.ReportRunlevel({'run_level' : 'failboot'}) + + elif env == "production": + api.ReportRunlevel({'run_level' : 'boot'}) + else: + api.ReportRunlevel({'run_level' : 'failboot'}) + + except: + print "reporting error: ", os.popen("uptime").read().strip() + traceback.print_exc() + + sys.stdout.flush() + # TODO: change to a configurable value + time.sleep(60*15) + +def agent_running(): + try: + os.stat(RLA_PID_FILE) + f = os.popen("ps ax | grep RunlevelAgent | grep -Ev 'grep|vim' | awk '{print $1}' | wc -l") + l = f.read().strip() + if int(l) >= 2: + return True + else: + try: + os.unlink(RLA_PID_FILE) + except: + pass + return False + except: + return False + + +def shutdown(): + import signal + + pid = open(RLA_PID_FILE, 'r').read().strip() + + # Try three different ways to kill the process. Just to be sure. 
+ os.kill(int(pid), signal.SIGKILL) + os.system("pkill RunlevelAgent.py") + os.system("ps ax | grep RunlevelAgent | grep -v grep | awk '{print $1}' | xargs kill -9 ") + +if __name__ == "__main__": + if "start" in sys.argv and not agent_running(): + start_and_run() + + if "stop" in sys.argv and agent_running(): + shutdown() diff --git a/source/steps/ChainBootNode.py b/source/steps/ChainBootNode.py index a279c1c..6b1a0c4 100644 --- a/source/steps/ChainBootNode.py +++ b/source/steps/ChainBootNode.py @@ -143,6 +143,7 @@ def Run( vars, log ): update_vals['ssh_rsa_key']= ssh_host_key BootAPI.call_api_function( vars, "BootUpdateNode", (update_vals,) ) + # get the kernel version option = '' if NODE_MODEL_OPTIONS & ModelOptions.SMP: @@ -165,6 +166,9 @@ def Run( vars, log ): ROOT_MOUNTED= 0 vars['ROOT_MOUNTED']= 0 + # Change runlevel to 'boot' prior to kexec. + StopRunLevelAgent.Run( vars, log ) + log.write( "Unloading modules and chain booting to new kernel.\n" ) # further use of log after Upload will only output to screen diff --git a/source/steps/StartRunlevelAgent.py b/source/steps/StartRunlevelAgent.py index 00c5f08..d4ee55c 100644 --- a/source/steps/StartRunlevelAgent.py +++ b/source/steps/StartRunlevelAgent.py @@ -23,11 +23,11 @@ def Run( vars, log ): log.write( "\n\nStep: Starting RunlevelAgent.py\n" ) try: - cmd = "%s/monitor-runlevelagent" % vars['BM_SOURCE_DIR'] + cmd = "%s/RunlevelAgent.py" % vars['BM_SOURCE_DIR'] # raise error if script is not present. os.stat(cmd) # init script only starts RLA once. 
- os.system("/bin/sh %s start bootmanager" % cmd) + os.system("/usr/bin/python %s start bootmanager &" % cmd) except KeyError, var: raise BootManagerException, "Missing variable in vars: %s\n" % var except ValueError, var: diff --git a/source/steps/StopRunlevelAgent.py b/source/steps/StopRunlevelAgent.py new file mode 100644 index 0000000..c88b35f --- /dev/null +++ b/source/steps/StopRunlevelAgent.py @@ -0,0 +1,43 @@ +#!/usr/bin/python +# +# Copyright (c) 2003 Intel Corporation +# All rights reserved. +# +# Copyright (c) 2004-2006 The Trustees of Princeton University +# All rights reserved. + + +import os + +from Exceptions import * +import BootAPI + + +def Run( vars, log ): + """ + Stop the RunlevelAgent.py script. Should precede + kexec to reset run_level to 'boot' before kexec + """ + + log.write( "\n\nStep: Stopping RunlevelAgent.py\n" ) + + try: + cmd = "%s/RunlevelAgent.py" % vars['BM_SOURCE_DIR'] + # raise error if script is not present. + os.stat(cmd) + os.system("/usr/bin/python %s stop" % cmd) + except KeyError, var: + raise BootManagerException, "Missing variable in vars: %s\n" % var + except ValueError, var: + raise BootManagerException, "Variable in vars, shouldn't be: %s\n" % var + + try: + update_vals= {} + update_vals['run_level']='boot' + BootAPI.call_api_function( vars, "ReportRunlevel", (update_vals,) ) + except BootManagerException, e: + log.write( "Unable to update boot state for this node at PLC: %s.\n" % e ) + + return 1 + + diff --git a/source/steps/UpdateLastBootOnce.py b/source/steps/UpdateLastBootOnce.py new file mode 100644 index 0000000..5d689ac --- /dev/null +++ b/source/steps/UpdateLastBootOnce.py @@ -0,0 +1,33 @@ +#!/usr/bin/python +# +# Copyright (c) 2003 Intel Corporation +# All rights reserved. +# +# Copyright (c) 2004-2006 The Trustees of Princeton University +# All rights reserved. 
+ +from Exceptions import * +import BootAPI +import notify_messages +import os.path + + +def Run( vars, log ): + """ + UpdateLastBootOnce will update the last_* values for the node only + once per boot. This helps calculate last_time_spent_online and + last_time_spent_offline for collecting run-time metrics. + """ + + log.write( "\n\nStep: Updating node last boot times at PLC.\n" ) + + update_vals= {} + try: + if not os.path.isfile("/tmp/UPDATE_LAST_BOOT_ONCE"): + BootAPI.call_api_function( vars, "BootUpdateNode", (update_vals,) ) + log.write( "Successfully updated boot state for this node at PLC\n" ) + os.system("touch /tmp/UPDATE_LAST_BOOT_ONCE") + except BootManagerException, e: + log.write( "Unable to update last boot times for this node at PLC: %s.\n" % e ) + + return 1 diff --git a/source/steps/__init__.py b/source/steps/__init__.py index d46c3e9..bfef9bc 100644 --- a/source/steps/__init__.py +++ b/source/steps/__init__.py @@ -16,6 +16,7 @@ __all__ = ["ReadNodeConfiguration", "GetAndUpdateNodeDetails", "ConfirmInstallWithUser", "UpdateBootStateWithPLC", + "UpdateLastBootOnce", "UpdateRunLevelWithPLC", "CheckHardwareRequirements", "SendHardwareConfigToPLC", @@ -26,6 +27,7 @@ __all__ = ["ReadNodeConfiguration", "ValidateNodeInstall", "StartDebug", "StartRunlevelAgent", + "StopRunlevelAgent", "InstallBootstrapFS", "InstallInit", "InstallPartitionDisks",