#
%define name bootmanager
%define version 5.0
-%define taglevel 17
+%define taglevel 18
%define release %{taglevel}%{?pldistro:.%{pldistro}}%{?date:.%{date}}
Requires: PLCAPI >= 5.0
# the python code packaged in these are shipped on the node as well
-Requires: pypcilib pyplnet monitor-runlevelagent
+Requires: pypcilib pyplnet
### avoid having yum complain about updates, as stuff is moving around
# plc.d/bootmanager
chown apache:apache /var/log/bm
chmod 700 /var/log/bm
-# NOTE: do not run this agent when installed on a myplc.
-# xxx - a bit hacky maybe
-chkconfig monitor-runlevelagent off
-chkconfig --del monitor-runlevelagent
-
%files
%defattr(-,root,root,-)
%{_datadir}/%{name}
/etc/plc.d/bootmanager
%changelog
+* Wed Jun 08 2011 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - bootmanager-5.0-18
+- {Start,Stop,}RunLevelAgent now ship with bootmanager
+- new UpdateLastBootOnce
+- root_size bumped to 14GB which is more in line with modern h/w
+- more safely tries to umount /dev/ and /sys
+- support for raid partitions
+- mkswap -f
+- blacklist files from /etc/modprobe.conf/* instead
+
* Thu Feb 17 2011 Thierry Parmentelat <thierry.parmentelat@sophia.inria.fr> - bootmanager-5.0-17
- on install of boostrapfs, keep track in /bm-install.log with date & flavour
# Add python code from the following packages
# make sure they are in the 'Requires' header of the specfile
-required_rpms="pypcilib pyplnet monitor-runlevelagent"
+required_rpms="pypcilib pyplnet"
extra_libs=`mktemp -d "/tmp/.bootmanager.XXXXXX"`
mkdir $extra_libs/source
cp -p $(rpm -ql $required_rpms | grep -v '\.py[co]$') $extra_libs/source
if not ConfirmInstallWithUser.Run( self.VARS, self.LOG ):
return 0
self.VARS['BOOT_STATE']= 'reinstall'
- UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
_reinstallRun()
def _debugRun(state='failboot'):
# implements debug logic, which starts the sshd and just waits around
self.VARS['RUN_LEVEL']=state
- UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
StartDebug.Run( self.VARS, self.LOG )
# fsck/mount fs if present, and ignore return value if it's not.
ValidateNodeInstall.Run( self.VARS, self.LOG )
InitializeBootManager.Run( self.VARS, self.LOG )
ReadNodeConfiguration.Run( self.VARS, self.LOG )
AuthenticateWithPLC.Run( self.VARS, self.LOG )
+ UpdateLastBootOnce.Run( self.VARS, self.LOG )
StartRunlevelAgent.Run( self.VARS, self.LOG )
GetAndUpdateNodeDetails.Run( self.VARS, self.LOG )
if self.forceState is not None:
self.VARS['BOOT_STATE']= self.forceState
UpdateBootStateWithPLC.Run( self.VARS, self.LOG )
- UpdateRunLevelWithPLC.Run( self.VARS, self.LOG )
stateRun = BootManager.NodeRunStates.get(self.VARS['BOOT_STATE'],_badstateRun)
stateRun()
--- /dev/null
+#!/usr/bin/python
+#
+# RunlevelAgent - acts as a heartbeat back to myplc reporting that the node is
+# online and whether it is in boot or pre-boot run-level.
+# This is useful to identify nodes that are behind a firewall, as well as to
+# have the machine report run-time status both in safeboot and boot modes,
+# so that it is immediately visible at myplc (gui or api).
+#
+
+import xml, xmlrpclib
+import logging
+import time
+import traceback
+import sys
+import os
+import string
+
+CONFIG_FILE="/tmp/source/configuration"
+SESSION_FILE="/etc/planetlab/session"
+RLA_PID_FILE="/var/run/rla.pid"
+
+# Parse a simple "name=value" configuration file into a dict.
+# Lines starting with '#' and blank lines are skipped. Parsing stops at the
+# first line that does not split into exactly two parts on '=' (so a value
+# that itself contains '=' is rejected too); a message is printed and the
+# pairs collected up to that point are still returned.
+def read_config_file(filename):
+ ## NOTE: text copied from BootManager.py
+ # TODO: unify this code to make it common. i.e. use ConfigParser module
+ vars = {}
+ # file() is the Python 2 builtin equivalent of open()
+ vars_file= file(filename,'r')
+ validConfFile = True
+ for line in vars_file:
+ # if its a comment or a whitespace line, ignore
+ if line[:1] == "#" or string.strip(line) == "":
+ continue
+
+ parts= string.split(line,"=")
+ # anything other than exactly one '=' on the line is treated as invalid
+ if len(parts) != 2:
+ print "Invalid line in vars file: %s" % line
+ validConfFile = False
+ break
+
+ # surrounding whitespace (including the trailing newline) is dropped
+ name= string.strip(parts[0])
+ value= string.strip(parts[1])
+ vars[name]= value
+
+ vars_file.close()
+ # an invalid file is only reported on stdout; no exception is raised
+ if not validConfFile:
+ print "Unable to read configuration vars."
+
+ return vars
+
+# Work out the API server URL at import time.
+# First choice: the plc_config module found in /etc/planetlab, from which an
+# https URL is built. If anything in that attempt fails (the except clause is
+# bare, so this covers more than a missing module), fall back to the
+# BOOT_API_SERVER entry of CONFIG_FILE.
+try:
+ sys.path = ['/etc/planetlab'] + sys.path
+ import plc_config
+ api_server_url = "https://" + plc_config.PLC_API_HOST + plc_config.PLC_API_PATH
+except:
+ filename=CONFIG_FILE
+ vars = read_config_file(filename)
+ # raises KeyError if the configuration file has no BOOT_API_SERVER entry
+ api_server_url = vars['BOOT_API_SERVER']
+
+
+# Builds the authentication structure that PLC passes as the first
+# argument of every API call; the result is stored in self.auth.
+#  - session=<key> keyword given  -> 'session' auth (takes precedence)
+#  - neither username nor password -> 'anonymous' auth
+#  - otherwise                     -> 'password' auth
+class Auth:
+ def __init__(self, username=None, password=None, **kwargs):
+ # a session key wins over any username/password that was also passed
+ if 'session' in kwargs:
+ self.auth= { 'AuthMethod' : 'session',
+ 'session' : kwargs['session'] }
+ else:
+ # anonymous only when BOTH are None; if just one of them is None the
+ # password branch below is used with that None value
+ if username==None and password==None:
+ self.auth = {'AuthMethod': "anonymous"}
+ else:
+ self.auth = {'Username' : username,
+ 'AuthMethod' : 'password',
+ 'AuthString' : password}
+# Thin wrapper around an xmlrpclib.Server proxy that prepends the
+# authentication structure to every remote call.
+#   auth: an object exposing an `auth` attribute (see Auth)
+#   url:  the XML-RPC endpoint passed to xmlrpclib.Server
+class PLC:
+ def __init__(self, auth, url):
+ self.auth = auth
+ self.url = url
+ self.api = xmlrpclib.Server(self.url, verbose=False, allow_none=True)
+
+ # Called for any attribute not found on the instance, i.e. for API method
+ # names: api.Foo(a, b) becomes self.api.Foo(self.auth.auth, a, b).
+ def __getattr__(self, name):
+ method = getattr(self.api, name)
+ # NOTE(review): it is not visible here whether the proxy can ever return
+ # None for an unknown method name -- confirm this guard is reachable.
+ if method is None:
+ raise AssertionError("method does not exist")
+
+ return lambda *params : method(self.auth.auth, *params)
+
+ # delegates to the repr of the underlying proxy
+ def __repr__(self):
+ return self.api.__repr__()
+
+# Run `grep -E <pattern> <filename>` through the shell and return its
+# output with surrounding whitespace stripped (empty string when grep
+# prints nothing).
+# Both arguments are interpolated into the command line unquoted, so a
+# pattern containing spaces must already carry its own shell quotes.
+def extract_from(filename, pattern):
+ f = os.popen("grep -E %s %s" % (pattern, filename))
+ val = f.read().strip()
+ return val
+
+# Return the stripped `ps ax` lines that match commandname (used as a
+# grep -E pattern, interpolated unquoted), excluding the grep process
+# itself. An empty string means no matching line was printed.
+def check_running(commandname):
+ f = os.popen("ps ax | grep -E %s | grep -v grep" % (commandname))
+ val = f.read().strip()
+ return val
+
+
+# Write the current process id, followed by a newline, to RLA_PID_FILE.
+# On any failure a message is printed and the process exits with status 1.
+def save_pid():
+ # save PID
+ try:
+ pid = os.getpid()
+ f = open(RLA_PID_FILE, 'w')
+ f.write("%s\n" % pid)
+ f.close()
+ except:
+ # bare except: any error while writing the pid file is fatal
+ print "Uuuhhh.... this should not occur."
+ sys.exit(1)
+
+# Main loop of the agent. Never returns:
+#  1. records the pid in RLA_PID_FILE,
+#  2. retries every 30 seconds until the session key read from SESSION_FILE
+#     passes AuthCheck against api_server_url,
+#  3. then reports a run level through ReportRunlevel every 15 minutes.
+# The environment is 'production' unless a third command-line word is given
+# (sys.argv[2], e.g. "bootmanager").
+def start_and_run():
+
+ save_pid()
+
+ # Keep trying to authenticate session, waiting for NM to re-write the
+ # session file, or DNS to succeed, until AuthCheck succeeds.
+ while True:
+ try:
+ # NOTE(review): this handle is never closed explicitly
+ f=open(SESSION_FILE,'r')
+ session_str=f.read().strip()
+ api = PLC(Auth(session=session_str), api_server_url)
+ # NOTE: What should we do if this call fails?
+ # TODO: handle dns failure here.
+ api.AuthCheck()
+ break
+ except:
+ # any failure (missing session file, network, auth) leads to a retry
+ print "Retry in 30 seconds: ", os.popen("uptime").read().strip()
+ traceback.print_exc()
+ time.sleep(30)
+
+ # argv[0] is the script, argv[1] the start/stop verb; an optional argv[2]
+ # names the environment
+ try:
+ env = 'production'
+ if len(sys.argv) > 2:
+ env = sys.argv[2]
+ except:
+ traceback.print_exc()
+
+ while True:
+ try:
+ # NOTE: here we are inferring the runlevel by environmental
+ # observations. We know how this process was started by the
+ # given command line argument. Then in bootmanager
+ # runlevel, the bm.log gives information about the current
+ # activity.
+ # other options:
+ # call plc for current boot state?
+ # how long have we been running?
+ if env == "bootmanager":
+ # last word of the grep output for 'Current boot state:' lines
+ bs_val = extract_from('/tmp/bm.log', "'Current boot state:'")
+ if len(bs_val) > 0: bs_val = bs_val.split()[-1]
+ ex_val = extract_from('/tmp/bm.log', 'Exception')
+ fs_val = extract_from('/tmp/bm.log', 'mke2fs')
+ bm_val = check_running("BootManager.py")
+
+ # order matters: the boot state is checked first, then logged
+ # exceptions, then an install in progress; everything else is failboot
+ if bs_val in ['diag', 'diagnose', 'safeboot', 'disabled', 'disable']:
+ api.ReportRunlevel({'run_level' : 'safeboot'})
+
+ # grep output longer than the bare word "Exception"
+ elif len(ex_val) > len("Exception"):
+ api.ReportRunlevel({'run_level' : 'failboot'})
+
+ # mke2fs appears in the log and BootManager.py is still running
+ elif len(fs_val) > 0 and len(bm_val) > 0:
+ api.ReportRunlevel({'run_level' : 'reinstall'})
+
+ else:
+ api.ReportRunlevel({'run_level' : 'failboot'})
+
+ elif env == "production":
+ api.ReportRunlevel({'run_level' : 'boot'})
+ else:
+ # unknown environment name
+ api.ReportRunlevel({'run_level' : 'failboot'})
+
+ except:
+ # a failed report is logged and retried at the next period
+ print "reporting error: ", os.popen("uptime").read().strip()
+ traceback.print_exc()
+
+ sys.stdout.flush()
+ # TODO: change to a configurable value
+ time.sleep(60*15)
+
+# Tell whether an agent already appears to be running.
+# Returns True only if RLA_PID_FILE exists and `ps ax` lists at least two
+# processes matching "RunlevelAgent" (grep and vim excluded).
+# Presumably the threshold is two because the calling process is itself a
+# RunlevelAgent invocation -- TODO confirm.
+# When fewer are found the pid file is removed on a best-effort basis and
+# False is returned. Any exception (e.g. missing pid file) also yields False.
+def agent_running():
+ try:
+ # raises if the pid file does not exist, caught by the outer except
+ os.stat(RLA_PID_FILE)
+ f = os.popen("ps ax | grep RunlevelAgent | grep -Ev 'grep|vim' | awk '{print $1}' | wc -l")
+ l = f.read().strip()
+ if int(l) >= 2:
+ return True
+ else:
+ # pid file is stale: drop it, ignoring failures
+ try:
+ os.unlink(RLA_PID_FILE)
+ except:
+ pass
+ return False
+ except:
+ return False
+
+
+def shutdown():
+    """
+    Kill the running agent.
+
+    Three independent attempts are made: SIGKILL to the pid recorded in
+    RLA_PID_FILE, `pkill RunlevelAgent.py`, and `kill -9` on every process
+    whose `ps ax` line mentions RunlevelAgent. A failure of the first
+    attempt (missing or unreadable pid file, garbled contents, or a pid
+    that no longer exists) is reported on stdout and does not prevent the
+    other two from running.
+    """
+    import signal
+
+    # Try three different ways to kill the process. Just to be sure.
+    try:
+        f = open(RLA_PID_FILE, 'r')
+        try:
+            pid = f.read().strip()
+        finally:
+            f.close()
+        os.kill(int(pid), signal.SIGKILL)
+    except (IOError, OSError, ValueError), e:
+        # a stale pid makes os.kill raise OSError; keep going so the
+        # fallbacks below still get their chance
+        print "Could not kill pid recorded in %s: %s" % (RLA_PID_FILE, e)
+
+    os.system("pkill RunlevelAgent.py")
+    # NOTE(review): the command line of the `stop` invocation itself also
+    # contains "RunlevelAgent", so this pipeline presumably matches the
+    # current process too -- confirm this is acceptable as the last step.
+    os.system("ps ax | grep RunlevelAgent | grep -v grep | awk '{print $1}' | xargs kill -9 ")
+
+# Command-line entry point. The words "start" and "stop" are looked up
+# anywhere in sys.argv:
+#   start -> run the agent loop unless one already appears to be running
+#   stop  -> kill the agent if one appears to be running
+if __name__ == "__main__":
+ if "start" in sys.argv and not agent_running():
+ start_and_run()
+
+ if "stop" in sys.argv and agent_running():
+ shutdown()
import UpdateNodeConfiguration
import MakeInitrd
+import StopRunlevelAgent
from Exceptions import *
import utils
import systeminfo
update_vals['ssh_rsa_key']= ssh_host_key
BootAPI.call_api_function( vars, "BootUpdateNode", (update_vals,) )
+
# get the kernel version
option = ''
if NODE_MODEL_OPTIONS & ModelOptions.SMP:
ROOT_MOUNTED= 0
vars['ROOT_MOUNTED']= 0
+ # Change runlevel to 'boot' prior to kexec.
+ StopRunlevelAgent.Run( vars, log )
+
log.write( "Unloading modules and chain booting to new kernel.\n" )
# further use of log after Upload will only output to screen
import os, sys
import string
import popen2
-
+import time
from Exceptions import *
import utils
import BootServerRequest
-
+import BootAPI
import ModelOptions
def Run( vars, log ):
# list of devices to be used with vgcreate
vg_device_list= ""
- # initialize the physical volumes
+ # get partitions
+ partitions = []
for device in used_devices:
-
part_path= get_partition_path_from_device( device, vars, log )
-
+ partitions.append(part_path)
+
+ # create raid partition
+ raid_partition = create_raid_partition(partitions, vars, log)
+ if raid_partition != None:
+ partitions = [raid_partition]
+ log.write("PARTITIONS %s\n" % str(partitions))
+ # initialize the physical volumes
+ for part_path in partitions:
if not create_lvm_physical_volume( part_path, vars, log ):
raise BootManagerException, "Could not create lvm physical volume " \
"on partition %s" % part_path
-
vg_device_list = vg_device_list + " " + part_path
# create an lvm volume group
return 1
+def create_raid_partition(partitions, vars, log):
+ """
+ create raid array using specified partitions.
+
+ partitions: list of partition device paths
+ vars: must provide 'NODE_ID', used to look up the node's tags
+ log: object with a write() method
+
+ Returns "/dev/md0" once the mdadm --create command has been issued,
+ or None when the node has no 'raid_enabled' tag with value '1' or
+ when a BootManagerException is caught along the way (including the
+ one raised here for fewer than two partitions).
+ """
+ raid_part = None
+ raid_enabled = False
+ # raid is opt-in per node, through the 'raid_enabled' node tag
+ node_tags = BootAPI.call_api_function( vars, "GetNodeTags",
+ ({'node_id': vars['NODE_ID']},))
+ for node_tag in node_tags:
+ # the tag value is compared as the string '1'
+ if node_tag['tagname'] == 'raid_enabled' and \
+ node_tag['value'] == '1':
+ raid_enabled = True
+ break
+ if not raid_enabled:
+ return raid_part
+
+ try:
+ log.write( "Software raid enabled.\n" )
+ # wipe everything
+ utils.sysexec_noerr("mdadm --stop /dev/md0", log)
+ time.sleep(1)
+ for part_path in partitions:
+ utils.sysexec_noerr("mdadm --zero-superblock %s " % part_path, log)
+
+ # assume each partition is on a separate disk
+ num_parts = len(partitions)
+ # NOTE(review): this check runs after the superblocks were zeroed above
+ if num_parts < 2:
+ log.write( "Not enough disks for raid. Found: %s\n" % partitions )
+ raise BootManagerException("Not enough disks for raid. Found: %s\n" % partitions)
+ # two partitions -> raid1, three or more -> raid5
+ if num_parts == 2:
+ lvl = 1
+ else:
+ lvl = 5
+
+ # make the array
+ part_list = " ".join(partitions)
+ raid_part = "/dev/md0"
+ # the command template is filled in from the local variables above
+ cmd = "mdadm --create %(raid_part)s --chunk=128 --level=raid%(lvl)s " % locals() + \
+ "--raid-devices=%(num_parts)s %(part_list)s" % locals()
+ # presumably raises BootManagerException on failure -- verify in utils
+ utils.sysexec(cmd, log)
+
+ except BootManagerException, e:
+ # the exception text itself is not logged; the caller only sees None
+ log.write("create_raid_partition failed.\n")
+ raid_part = None
+
+ return raid_part
+
def get_partition_path_from_device( device, vars, log ):
"""
log.write( "\n\nStep: Starting RunlevelAgent.py\n" )
try:
- cmd = "%s/monitor-runlevelagent" % vars['BM_SOURCE_DIR']
+ cmd = "%s/RunlevelAgent.py" % vars['BM_SOURCE_DIR']
# raise error if script is not present.
os.stat(cmd)
# init script only starts RLA once.
- os.system("/bin/sh %s start bootmanager" % cmd)
+ os.system("/usr/bin/python %s start bootmanager &" % cmd)
except KeyError, var:
raise BootManagerException, "Missing variable in vars: %s\n" % var
except ValueError, var:
--- /dev/null
+#!/usr/bin/python
+#
+# Copyright (c) 2003 Intel Corporation
+# All rights reserved.
+#
+# Copyright (c) 2004-2006 The Trustees of Princeton University
+# All rights reserved.
+
+
+import os
+
+from Exceptions import *
+import BootAPI
+
+
+def Run( vars, log ):
+    """
+    Stop the RunlevelAgent.py script. Should precede
+    kexec to reset run_level to 'boot' before kexec
+
+    vars must provide 'BM_SOURCE_DIR', the directory holding
+    RunlevelAgent.py; log is an object with a write() method.
+
+    Raises BootManagerException when BM_SOURCE_DIR is missing from vars.
+    A missing script makes os.stat raise, and that error is not caught
+    here. A failure to report the run level to PLC is only logged.
+
+    Always returns 1.
+    """
+
+    log.write( "\n\nStep: Stopping RunlevelAgent.py\n" )
+
+    try:
+        cmd = "%s/RunlevelAgent.py" % vars['BM_SOURCE_DIR']
+        # raise error if script is not present.
+        os.stat(cmd)
+        os.system("/usr/bin/python %s stop" % cmd)
+    except KeyError, var:
+        raise BootManagerException, "Missing variable in vars: %s\n" % var
+    except ValueError, var:
+        raise BootManagerException, "Variable in vars, shouldn't be: %s\n" % var
+
+    # with the agent stopped, report the final run level ourselves
+    try:
+        update_vals= {}
+        update_vals['run_level']='boot'
+        BootAPI.call_api_function( vars, "ReportRunlevel", (update_vals,) )
+    except BootManagerException, e:
+        # this call reports the run level, not the boot state
+        log.write( "Unable to update run level for this node at PLC: %s.\n" % e )
+
+    return 1
+
+
--- /dev/null
+#!/usr/bin/python
+#
+# Copyright (c) 2003 Intel Corporation
+# All rights reserved.
+#
+# Copyright (c) 2004-2006 The Trustees of Princeton University
+# All rights reserved.
+
+from Exceptions import *
+import BootAPI
+import notify_messages
+import os.path
+
+
+def Run( vars, log ):
+ """
+ UpdateLastBootOnce will update the last_* values for the node only
+ once per boot. This helps calculate last_time_spent_online and
+ last_time_spent_offline for collecting run-time metrics.
+
+ The marker file /tmp/UPDATE_LAST_BOOT_ONCE records that the update
+ was already done. A BootManagerException from the API call is
+ logged, not propagated. Always returns 1.
+ """
+
+ log.write( "\n\nStep: Updating node last boot times at PLC.\n" )
+
+ # BootUpdateNode is called with an empty set of fields
+ update_vals= {}
+ try:
+ # NOTE(review): the indentation is not visible in this patch; the success
+ # message and the touch below are presumably inside this `if` -- confirm.
+ if not os.path.isfile("/tmp/UPDATE_LAST_BOOT_ONCE"):
+ BootAPI.call_api_function( vars, "BootUpdateNode", (update_vals,) )
+ # NOTE(review): message says "boot state" although this step updates the
+ # last boot times
+ log.write( "Successfully updated boot state for this node at PLC\n" )
+ # create the marker file so the update is skipped from now on
+ os.system("touch /tmp/UPDATE_LAST_BOOT_ONCE")
+ except BootManagerException, e:
+ log.write( "Unable to update last boot times for this node at PLC: %s.\n" % e )
+
+ return 1
"GetAndUpdateNodeDetails",
"ConfirmInstallWithUser",
"UpdateBootStateWithPLC",
+ "UpdateLastBootOnce",
"UpdateRunLevelWithPLC",
"CheckHardwareRequirements",
"SendHardwareConfigToPLC",
"ValidateNodeInstall",
"StartDebug",
"StartRunlevelAgent",
+ "StopRunlevelAgent",
"InstallBootstrapFS",
"InstallInit",
"InstallPartitionDisks",